Index: head/sys/kern/subr_gtaskqueue.c =================================================================== --- head/sys/kern/subr_gtaskqueue.c (revision 333174) +++ head/sys/kern/subr_gtaskqueue.c (revision 333175) @@ -1,979 +1,989 @@ /*- * Copyright (c) 2000 Doug Rabson * Copyright (c) 2014 Jeff Roberson * Copyright (c) 2016 Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_GTASKQUEUE, "gtaskqueue", "Group Task Queues"); static void gtaskqueue_thread_enqueue(void *); static void gtaskqueue_thread_loop(void *arg); TASKQGROUP_DEFINE(softirq, mp_ncpus, 1); +TASKQGROUP_DEFINE(config, 1, 1); struct gtaskqueue_busy { struct gtask *tb_running; TAILQ_ENTRY(gtaskqueue_busy) tb_link; }; static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1; struct gtaskqueue { STAILQ_HEAD(, gtask) tq_queue; gtaskqueue_enqueue_fn tq_enqueue; void *tq_context; char *tq_name; TAILQ_HEAD(, gtaskqueue_busy) tq_active; struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; int tq_spin; int tq_flags; int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; #define TQ_FLAGS_ACTIVE (1 << 0) #define TQ_FLAGS_BLOCKED (1 << 1) #define TQ_FLAGS_UNLOCKED_ENQUEUE (1 << 2) #define DT_CALLOUT_ARMED (1 << 0) #define TQ_LOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_lock_spin(&(tq)->tq_mutex); \ else \ mtx_lock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED) #define TQ_UNLOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_unlock_spin(&(tq)->tq_mutex); \ else \ mtx_unlock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED) #ifdef INVARIANTS static void gtask_dump(struct gtask *gtask) { printf("gtask: %p ta_flags=%x ta_priority=%d ta_func=%p ta_context=%p\n", gtask, gtask->ta_flags, gtask->ta_priority, gtask->ta_func, gtask->ta_context); } #endif static __inline int TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, int t) { if (tq->tq_spin) return (msleep_spin(p, m, wm, t)); return (msleep(p, m, pri, wm, t)); } static struct gtaskqueue * _gtaskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context, int mtxflags, const char *mtxname __unused) { struct gtaskqueue *queue; char *tq_name; tq_name = malloc(TASKQUEUE_NAMELEN, M_GTASKQUEUE, mflags | M_ZERO); if (!tq_name) return (NULL); snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue"); queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO); if (!queue) { free(tq_name, M_GTASKQUEUE); return (NULL); } STAILQ_INIT(&queue->tq_queue); TAILQ_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_name = tq_name; queue->tq_spin = (mtxflags & MTX_SPIN) != 0; queue->tq_flags |= TQ_FLAGS_ACTIVE; if (enqueue == gtaskqueue_thread_enqueue) queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE; mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags); return (queue); } /* * Signal a taskqueue thread to terminate. */ static void gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq) { while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); } } static void gtaskqueue_free(struct gtaskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; gtaskqueue_terminate(queue->tq_threads, queue); KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_GTASKQUEUE); free(queue->tq_name, M_GTASKQUEUE); free(queue, M_GTASKQUEUE); } int grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *gtask) { #ifdef INVARIANTS if (queue == NULL) { gtask_dump(gtask); panic("queue == NULL"); } #endif TQ_LOCK(queue); if (gtask->ta_flags & TASK_ENQUEUED) { TQ_UNLOCK(queue); return (0); } STAILQ_INSERT_TAIL(&queue->tq_queue, gtask, ta_link); gtask->ta_flags |= TASK_ENQUEUED; TQ_UNLOCK(queue); if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); return (0); } static void gtaskqueue_task_nop_fn(void *context) { } /* * Block until all currently queued tasks in this taskqueue * have begun execution. Tasks queued during execution of * this function are ignored. */ static void gtaskqueue_drain_tq_queue(struct gtaskqueue *queue) { struct gtask t_barrier; if (STAILQ_EMPTY(&queue->tq_queue)) return; /* * Enqueue our barrier after all current tasks, but with * the highest priority so that newly queued tasks cannot * pass it. Because of the high priority, we can not use * taskqueue_enqueue_locked directly (which drops the lock * anyway) so just insert it at tail while we have the * queue lock. */ GTASK_INIT(&t_barrier, 0, USHRT_MAX, gtaskqueue_task_nop_fn, &t_barrier); STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link); t_barrier.ta_flags |= TASK_ENQUEUED; /* * Once the barrier has executed, all previously queued tasks * have completed or are currently executing. */ while (t_barrier.ta_flags & TASK_ENQUEUED) TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); } /* * Block until all currently executing tasks for this taskqueue * complete. Tasks that begin execution during the execution * of this function are ignored. */ static void gtaskqueue_drain_tq_active(struct gtaskqueue *queue) { struct gtaskqueue_busy tb_marker, *tb_first; if (TAILQ_EMPTY(&queue->tq_active)) return; /* Block taskq_terminate().*/ queue->tq_callouts++; /* * Wait for all currently executing taskqueue threads * to go idle. */ tb_marker.tb_running = TB_DRAIN_WAITER; TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); /* * Wakeup any other drain waiter that happened to queue up * without any intervening active thread. */ tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); /* Release taskqueue_terminate(). */ queue->tq_callouts--; if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0) wakeup_one(queue->tq_threads); } void gtaskqueue_block(struct gtaskqueue *queue) { TQ_LOCK(queue); queue->tq_flags |= TQ_FLAGS_BLOCKED; TQ_UNLOCK(queue); } void gtaskqueue_unblock(struct gtaskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_BLOCKED; if (!STAILQ_EMPTY(&queue->tq_queue)) queue->tq_enqueue(queue->tq_context); TQ_UNLOCK(queue); } static void gtaskqueue_run_locked(struct gtaskqueue *queue) { struct gtaskqueue_busy tb; struct gtaskqueue_busy *tb_first; struct gtask *gtask; KASSERT(queue != NULL, ("tq is NULL")); TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; while (STAILQ_FIRST(&queue->tq_queue)) { TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); /* * Carefully remove the first task from the queue and * clear its TASK_ENQUEUED flag */ gtask = STAILQ_FIRST(&queue->tq_queue); KASSERT(gtask != NULL, ("task is NULL")); STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); gtask->ta_flags &= ~TASK_ENQUEUED; tb.tb_running = gtask; TQ_UNLOCK(queue); KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL")); gtask->ta_func(gtask->ta_context); TQ_LOCK(queue); tb.tb_running = NULL; wakeup(gtask); TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); } } static int task_is_running(struct gtaskqueue *queue, struct gtask *gtask) { struct gtaskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == gtask) return (1); } return (0); } static int gtaskqueue_cancel_locked(struct gtaskqueue *queue, struct gtask *gtask) { if (gtask->ta_flags & TASK_ENQUEUED) STAILQ_REMOVE(&queue->tq_queue, gtask, gtask, ta_link); gtask->ta_flags &= ~TASK_ENQUEUED; return (task_is_running(queue, gtask) ? EBUSY : 0); } int gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask) { int error; TQ_LOCK(queue); error = gtaskqueue_cancel_locked(queue, gtask); TQ_UNLOCK(queue); return (error); } void gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *gtask) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask)) TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0); TQ_UNLOCK(queue); } void gtaskqueue_drain_all(struct gtaskqueue *queue) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); gtaskqueue_drain_tq_queue(queue); gtaskqueue_drain_tq_active(queue); TQ_UNLOCK(queue); } static int _gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, va_list ap) { char ktname[MAXCOMLEN + 1]; struct thread *td; struct gtaskqueue *tq; int i, error; if (count <= 0) return (EINVAL); vsnprintf(ktname, sizeof(ktname), name, ap); tq = *tqp; tq->tq_threads = malloc(sizeof(struct thread *) * count, M_GTASKQUEUE, M_NOWAIT | M_ZERO); if (tq->tq_threads == NULL) { printf("%s: no memory for %s threads\n", __func__, ktname); return (ENOMEM); } for (i = 0; i < count; i++) { if (count == 1) error = kthread_add(gtaskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname); else error = kthread_add(gtaskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s_%d", ktname, i); if (error) { /* should be ok to continue, taskqueue_free will dtrt */ printf("%s: kthread_add(%s): error %d", __func__, ktname, error); tq->tq_threads[i] = NULL; /* paranoid */ } else tq->tq_tcount++; } for (i = 0; i < count; i++) { if (tq->tq_threads[i] == NULL) continue; td = tq->tq_threads[i]; if (mask) { error = cpuset_setthread(td->td_tid, mask); /* * Failing to pin is rarely an actual fatal error; * it'll just affect performance. */ if (error) printf("%s: curthread=%llu: can't pin; " "error=%d\n", __func__, (unsigned long long) td->td_tid, error); } thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); thread_unlock(td); } return (0); } static int gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _gtaskqueue_start_threads(tqp, count, pri, NULL, name, ap); va_end(ap); return (error); } static inline void gtaskqueue_run_callback(struct gtaskqueue *tq, enum taskqueue_callback_type cb_type) { taskqueue_callback_fn tq_callback; TQ_ASSERT_UNLOCKED(tq); tq_callback = tq->tq_callbacks[cb_type]; if (tq_callback != NULL) tq_callback(tq->tq_cb_contexts[cb_type]); } static void gtaskqueue_thread_loop(void *arg) { struct gtaskqueue **tqp, *tq; tqp = arg; tq = *tqp; gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { /* XXX ? */ gtaskqueue_run_locked(tq); /* * Because taskqueue_run() can drop tq_mutex, we need to * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the * meantime, which means we missed a wakeup. */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); } gtaskqueue_run_locked(tq); /* * This thread is on its way out, so just drop the lock temporarily * in order to call the shutdown callback. This allows the callback * to look at the taskqueue, even just before it dies. */ TQ_UNLOCK(tq); gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); TQ_LOCK(tq); /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); kthread_exit(); } static void gtaskqueue_thread_enqueue(void *context) { struct gtaskqueue **tqp, *tq; tqp = context; tq = *tqp; wakeup_one(tq); } static struct gtaskqueue * gtaskqueue_create_fast(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _gtaskqueue_create(name, mflags, enqueue, context, MTX_SPIN, "fast_taskqueue"); } struct taskqgroup_cpu { LIST_HEAD(, grouptask) tgc_tasks; struct gtaskqueue *tgc_taskq; int tgc_cnt; int tgc_cpu; }; struct taskqgroup { struct taskqgroup_cpu tqg_queue[MAXCPU]; struct mtx tqg_lock; char * tqg_name; int tqg_adjusting; int tqg_stride; int tqg_cnt; }; struct taskq_bind_task { struct gtask bt_task; int bt_cpuid; }; static void taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu) { struct taskqgroup_cpu *qcpu; qcpu = &qgroup->tqg_queue[idx]; LIST_INIT(&qcpu->tgc_tasks); qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK, taskqueue_thread_enqueue, &qcpu->tgc_taskq); gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT, "%s_%d", qgroup->tqg_name, idx); qcpu->tgc_cpu = cpu; } static void taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx) { gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq); } /* * Find the taskq with least # of tasks that doesn't currently have any * other queues from the uniq identifier. */ static int taskqgroup_find(struct taskqgroup *qgroup, void *uniq) { struct grouptask *n; int i, idx, mincnt; int strict; mtx_assert(&qgroup->tqg_lock, MA_OWNED); if (qgroup->tqg_cnt == 0) return (0); idx = -1; mincnt = INT_MAX; /* * Two passes; First scan for a queue with the least tasks that * does not already service this uniq id. If that fails simply find * the queue with the least total tasks; */ for (strict = 1; mincnt == INT_MAX; strict = 0) { for (i = 0; i < qgroup->tqg_cnt; i++) { if (qgroup->tqg_queue[i].tgc_cnt > mincnt) continue; if (strict) { LIST_FOREACH(n, &qgroup->tqg_queue[i].tgc_tasks, gt_list) if (n->gt_uniq == uniq) break; if (n != NULL) continue; } mincnt = qgroup->tqg_queue[i].tgc_cnt; idx = i; } } if (idx == -1) panic("taskqgroup_find: Failed to pick a qid."); return (idx); } /* * smp_started is unusable since it is not set for UP kernels or even for * SMP kernels when there is 1 CPU. This is usually handled by adding a * (mp_ncpus == 1) test, but that would be broken here since we need to * to synchronize with the SI_SUB_SMP ordering. Even in the pure SMP case * smp_started only gives a fuzzy ordering relative to SI_SUB_SMP. * * So maintain our own flag. It must be set after all CPUs are started * and before SI_SUB_SMP:SI_ORDER_ANY so that the SYSINIT for delayed * adjustment is properly delayed. SI_ORDER_FOURTH is clearly before * SI_ORDER_ANY and unclearly after the CPUs are started. It would be * simpler for adjustment to pass a flag indicating if it is delayed. */ static int tqg_smp_started; static void tqg_record_smp_started(void *arg) { tqg_smp_started = 1; } SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH, tqg_record_smp_started, NULL); void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask, - void *uniq, int irq, char *name) + void *uniq, int irq, const char *name) { cpuset_t mask; int qid, error; gtask->gt_uniq = uniq; snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask"); gtask->gt_irq = irq; gtask->gt_cpu = -1; mtx_lock(&qgroup->tqg_lock); qid = taskqgroup_find(qgroup, uniq); qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; if (irq != -1 && tqg_smp_started) { gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu; CPU_ZERO(&mask); CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask); mtx_unlock(&qgroup->tqg_lock); error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); if (error) printf("%s: setaffinity failed for %s: %d\n", __func__, gtask->gt_name, error); } else mtx_unlock(&qgroup->tqg_lock); } static void taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { cpuset_t mask; int qid, cpu, error; mtx_lock(&qgroup->tqg_lock); qid = taskqgroup_find(qgroup, gtask->gt_uniq); cpu = qgroup->tqg_queue[qid].tgc_cpu; if (gtask->gt_irq != -1) { mtx_unlock(&qgroup->tqg_lock); CPU_ZERO(&mask); CPU_SET(cpu, &mask); error = intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask); mtx_lock(&qgroup->tqg_lock); if (error) printf("%s: %s setaffinity failed: %d\n", __func__, gtask->gt_name, error); } qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); } int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask, void *uniq, int cpu, int irq, char *name) { cpuset_t mask; int i, qid, error; qid = -1; gtask->gt_uniq = uniq; snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask"); gtask->gt_irq = irq; gtask->gt_cpu = cpu; mtx_lock(&qgroup->tqg_lock); if (tqg_smp_started) { for (i = 0; i < qgroup->tqg_cnt; i++) if (qgroup->tqg_queue[i].tgc_cpu == cpu) { qid = i; break; } if (qid == -1) { mtx_unlock(&qgroup->tqg_lock); printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu); return (EINVAL); } } else qid = 0; qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; cpu = qgroup->tqg_queue[qid].tgc_cpu; mtx_unlock(&qgroup->tqg_lock); CPU_ZERO(&mask); CPU_SET(cpu, &mask); if (irq != -1 && tqg_smp_started) { error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); if (error) printf("%s: setaffinity failed: %d\n", __func__, error); } return (0); } static int taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask) { cpuset_t mask; int i, qid, irq, cpu, error; qid = -1; irq = gtask->gt_irq; cpu = gtask->gt_cpu; MPASS(tqg_smp_started); mtx_lock(&qgroup->tqg_lock); for (i = 0; i < qgroup->tqg_cnt; i++) if (qgroup->tqg_queue[i].tgc_cpu == cpu) { qid = i; break; } if (qid == -1) { mtx_unlock(&qgroup->tqg_lock); printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu); return (EINVAL); } qgroup->tqg_queue[qid].tgc_cnt++; LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list); MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL); gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq; mtx_unlock(&qgroup->tqg_lock); CPU_ZERO(&mask); CPU_SET(cpu, &mask); if (irq != -1) { error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask); if (error) printf("%s: setaffinity failed: %d\n", __func__, error); } return (0); } void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask) { int i; mtx_lock(&qgroup->tqg_lock); for (i = 0; i < qgroup->tqg_cnt; i++) if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue) break; if (i == qgroup->tqg_cnt) panic("taskqgroup_detach: task %s not in group\n", gtask->gt_name); qgroup->tqg_queue[i].tgc_cnt--; LIST_REMOVE(gtask, gt_list); mtx_unlock(&qgroup->tqg_lock); gtask->gt_taskqueue = NULL; } static void taskqgroup_binder(void *ctx) { struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx; cpuset_t mask; int error; CPU_ZERO(&mask); CPU_SET(gtask->bt_cpuid, &mask); error = cpuset_setthread(curthread->td_tid, &mask); thread_lock(curthread); sched_bind(curthread, gtask->bt_cpuid); thread_unlock(curthread); if (error) printf("%s: setaffinity failed: %d\n", __func__, error); free(gtask, M_DEVBUF); } static void taskqgroup_bind(struct taskqgroup *qgroup) { struct taskq_bind_task *gtask; int i; /* * Bind taskqueue threads to specific CPUs, if they have been assigned * one. */ if (qgroup->tqg_cnt == 1) return; for (i = 0; i < qgroup->tqg_cnt; i++) { gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK); GTASK_INIT(>ask->bt_task, 0, 0, taskqgroup_binder, gtask); gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu; grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq, >ask->bt_task); } } static int _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) { LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL); struct grouptask *gtask; int i, k, old_cnt, old_cpu, cpu; mtx_assert(&qgroup->tqg_lock, MA_OWNED); if (cnt < 1 || cnt * stride > mp_ncpus || !tqg_smp_started) { printf("%s: failed cnt: %d stride: %d " "mp_ncpus: %d tqg_smp_started: %d\n", __func__, cnt, stride, mp_ncpus, tqg_smp_started); return (EINVAL); } if (qgroup->tqg_adjusting) { printf("%s failed: adjusting\n", __func__); return (EBUSY); } qgroup->tqg_adjusting = 1; old_cnt = qgroup->tqg_cnt; old_cpu = 0; if (old_cnt < cnt) old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu; mtx_unlock(&qgroup->tqg_lock); /* * Set up queue for tasks added before boot. */ if (old_cnt == 0) { LIST_SWAP(>ask_head, &qgroup->tqg_queue[0].tgc_tasks, grouptask, gt_list); qgroup->tqg_queue[0].tgc_cnt = 0; } /* * If new taskq threads have been added. */ cpu = old_cpu; for (i = old_cnt; i < cnt; i++) { taskqgroup_cpu_create(qgroup, i, cpu); for (k = 0; k < stride; k++) cpu = CPU_NEXT(cpu); } mtx_lock(&qgroup->tqg_lock); qgroup->tqg_cnt = cnt; qgroup->tqg_stride = stride; /* * Adjust drivers to use new taskqs. */ for (i = 0; i < old_cnt; i++) { while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) { LIST_REMOVE(gtask, gt_list); qgroup->tqg_queue[i].tgc_cnt--; LIST_INSERT_HEAD(>ask_head, gtask, gt_list); } } mtx_unlock(&qgroup->tqg_lock); while ((gtask = LIST_FIRST(>ask_head))) { LIST_REMOVE(gtask, gt_list); if (gtask->gt_cpu == -1) taskqgroup_attach_deferred(qgroup, gtask); else if (taskqgroup_attach_cpu_deferred(qgroup, gtask)) taskqgroup_attach_deferred(qgroup, gtask); } #ifdef INVARIANTS mtx_lock(&qgroup->tqg_lock); for (i = 0; i < qgroup->tqg_cnt; i++) { MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL); LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list) MPASS(gtask->gt_taskqueue != NULL); } mtx_unlock(&qgroup->tqg_lock); #endif /* * If taskq thread count has been reduced. */ for (i = cnt; i < old_cnt; i++) taskqgroup_cpu_remove(qgroup, i); taskqgroup_bind(qgroup); mtx_lock(&qgroup->tqg_lock); qgroup->tqg_adjusting = 0; return (0); } int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride) { int error; mtx_lock(&qgroup->tqg_lock); error = _taskqgroup_adjust(qgroup, cnt, stride); mtx_unlock(&qgroup->tqg_lock); return (error); } struct taskqgroup * taskqgroup_create(char *name) { struct taskqgroup *qgroup; qgroup = malloc(sizeof(*qgroup), M_GTASKQUEUE, M_WAITOK | M_ZERO); mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF); qgroup->tqg_name = name; LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks); return (qgroup); } void taskqgroup_destroy(struct taskqgroup *qgroup) { +} + +void +taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, + const char *name) +{ + + GROUPTASK_INIT(gtask, 0, fn, ctx); + taskqgroup_attach(qgroup_config, gtask, gtask, -1, name); } Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 333174) +++ head/sys/kern/subr_witness.c (revision 333175) @@ -1,3077 +1,3081 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ /* Define this to check for blessed mutexes */ #undef BLESSING #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. */ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; #ifdef BLESSING struct witness_blessed { const char *b_lock1; const char *b_lock2; }; #endif struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); #ifdef BLESSING static int blessed(struct witness *, struct witness *); #endif static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) __printflike(1, 2); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; #ifdef __i386__ static const char w_notallowed[] = "The sysctl is disabled on the arch\n"; #endif static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rw }, { "rtentry", &lock_class_mtx_sleep }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ + { "in_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, - { "in_multi_mtx", &lock_class_mtx_sleep }, + { "in_multi_list_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, + { "ifnet_rw", &lock_class_rw }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. */ + { "in6_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, - { "in6_multi_mtx", &lock_class_mtx_sleep }, + { "in6_multi_list_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, + { "ifnet_rw", &lock_class_rw }, { "if_addr_lock", &lock_class_rw }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udp", &lock_class_rw }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcp", &lock_class_rw }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_sx }, { "bpf interface lock", &lock_class_rw }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_rw }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * TCP log locks */ { "TCP ID tree", &lock_class_rw }, { "tcp log id bucket", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "TCP log expireq", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, { "sio", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif #ifdef __sparc64__ { "pcib_mtx", &lock_class_mtx_spin }, { "rtc_mtx", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #if defined(SMP) && defined(__sparc64__) { "ipi", &lock_class_mtx_spin }, #endif #ifdef __i386__ { "allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; #ifdef BLESSING /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; #endif /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * Calculate the size of early witness structures. */ int witness_startup_count(void) { int sz; sz = sizeof(struct witness) * witness_count; sz += sizeof(*w_rmatrix) * (witness_count + 1); sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) * (witness_count + 1); return (sz); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ void witness_startup(void *mem) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; uintptr_t p; int i; p = (uintptr_t)mem; w_data = (void *)p; p += sizeof(struct witness) * witness_count; w_rmatrix = (void *)p; p += sizeof(*w_rmatrix) * (witness_count + 1); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = (void *)p; p += sizeof(*w_rmatrix[i]) * (witness_count + 1); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. */ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || panicstr != NULL || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. */ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no anscestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || panicstr != NULL) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. */ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ #ifdef BLESSING /* * If the lock order is blessed, just bail. We don't * look for other lock order violations though, which * may be a bug. */ if (blessed(w, w1)) goto out; #endif /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if (((lock->lo_flags & LO_SLEEPABLE) != 0 && (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0)) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } else { witness_output(" 1st %p %s (%s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, lock2->li_lock->lo_witness->w_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock, lock->lo_name, w->w_name, fixup_filename(file), line); } witness_debugger(1, __func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. */ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || panicstr != NULL) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags = LI_EXCLUSIVE; else instance->li_flags = 0; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || panicstr != NULL) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up in * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. */ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we want to keep always an * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have * further objects in the list, so the list ownership needs to be * hand over to another object if the current head needs to be freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || panicstr != NULL) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || panicstr != NULL) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once that all verifies are passed about spinlocks ownership, * the thread is in a safe path and it can be unpinned. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and as long as * the flags cannot match for this locks class, * check if the first spinlock is the one curthread * should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; MPASS(description != NULL); if (witness_watch == -1 || panicstr != NULL) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; } else { w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. */ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or inderect descendant of @ancestor. */ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } #ifdef BLESSING static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } #endif static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) instance = find_instance(curthread->td_sleeplocks, lock); else if ((class->lc_flags & LC_SPINLOCK) != 0) instance = find_instance(PCPU_GET(spinlocks), lock); else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return; } switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS(alllocks, db_witness_list_all) DB_SHOW_COMMAND(witness, db_witness_display) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. */ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. */ *tmp_w2 = *w2; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); sbuf_printf(sb, "\n"); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); sbuf_printf(sb, "\n"); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. */ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND(badstacks, db_witness_badstacks) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; #ifdef __i386__ error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed)); return (error); #endif if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "\n"); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. */ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. */ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_zero(&data->wlod_stack); stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. */ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_zero(&st); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 333174) +++ head/sys/net/if.c (revision 333175) @@ -1,4467 +1,4477 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ case _IOC_NEWTYPE((cmd), struct ifgroupreq32): #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ case (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef VIMAGE static void if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifnet *ifp; struct ifaddr *ifa = NULL; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. */ TAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void if_grow(void) { int oldlim; u_int n; struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); idx = ifindex_alloc(); ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; /* XXX cannot hold IF_ADDR_WLOCK over called functions. */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; - struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); - TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) + while (!TAILQ_EMPTY(&ifp->if_multiaddrs)) { + ifma = TAILQ_FIRST(&ifp->if_multiaddrs); + TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_delmulti_locked(ifp, ifma, 1); + } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE int shutdown; shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; #endif IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ free(ifp->if_hw_addr, M_IFADDR); ifp->if_hw_addr = NULL; ifp->if_addr = NULL; /* We can now free link ifaddr. */ IF_ADDR_WLOCK(ifp); if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); ifp->if_index = ifindex_alloc(); ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: %s failed for interface %s: %u\n", __func__, otype, if_name(ifp), error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(const struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == NULL) || ((ifp = ifa->ifa_ifp) == NULL) || ((dst = rt_key(rt)) == NULL)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data; struct ifmediareq ifmr; #endif struct ifmediareq *ifmrp; struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE int shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET && so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifmrp = NULL; #ifdef COMPAT_FREEBSD32 switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. */ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; +#ifdef INET + IN_MULTI_LIST_UNLOCK_ASSERT(); +#endif +#ifdef INET6 + IN6_MULTI_LIST_UNLOCK_ASSERT(); +#endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; +#ifdef INET + IN_MULTI_LIST_UNLOCK_ASSERT(); +#endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } - - if (ifp != NULL) + if (ifp != NULL && detaching == 0) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) { va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/netinet/igmp.c =================================================================== --- head/sys/netinet/igmp.c (revision 333174) +++ head/sys/netinet/igmp.c (revision 333175) @@ -1,3671 +1,3649 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifsoftc *); static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); -static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, +static void igmp_v3_process_group_timers(struct in_multi_head *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: - * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. - * * IN_MULTI_LOCK covers in_multi. + * * IN_MULTI_LIST_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. */ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { struct igmp_ifinfo info; info.igi_version = igi->igi_version; info.igi_v1_timer = igi->igi_v1_timer; info.igi_v2_timer = igi->igi_v2_timer; info.igi_v3_timer = igi->igi_v3_timer; info.igi_flags = igi->igi_flags; info.igi_rv = igi->igi_rv; info.igi_qi = igi->igi_qi; info.igi_qri = igi->igi_qri; info.igi_uri = igi->igi_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = (char)IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. */ static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; - SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; - struct in_multi *inm, *tinm; - + struct in_multi *inm; + struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); + SLIST_INIT(&inm_free_tmp); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; - if (inm->inm_state == IGMP_LEAVING_MEMBER) { - SLIST_INSERT_HEAD(&igi->igi_relinmhead, - inm, inm_nrele); - } + if (inm->inm_state == IGMP_LEAVING_MEMBER) + inm_rele_locked(&inm_free_tmp, inm); inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); - /* - * Free the in_multi reference(s) for this IGMP lifecycle. - */ - SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, - tinm) { - SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); - inm_release_locked(inm); - } + inm_release_list_deferred(&inm_free_tmp); } - IGMP_UNLOCK(); + } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifsoftc *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); - - KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), - ("%s: there are dangling in_multi references", - __func__)); - free(igi, M_IGMP); return; } } } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. */ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "0x%08x: %s/%s timer=%d", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer); - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. */ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 0x%08x query on ifp %p(%s)", ntohl(igmpv3->igmp_group.s_addr), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + sizeof(struct in_addr) * nsrc; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; + struct in_multi_head inm_free_tmp; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; - IN_MULTI_LOCK(); + SLIST_INIT(&inm_free_tmp); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: - igmp_v3_process_group_timers(igi, &qrq, + igmp_v3_process_group_timers(&inm_free_tmp, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { - struct in_multi *tinm; - igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ - SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, - inm_nrele, tinm) { - SLIST_REMOVE_HEAD(&igi->igi_relinmhead, - inm_nrele); - inm_release_locked(inm); - } + inm_release_list_deferred(&inm_free_tmp); } } out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void -igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, +igmp_v3_process_group_timers(struct in_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; - SLIST_INSERT_HEAD(&igi->igi_relinmhead, - inm, inm_nrele); + inm_rele_locked(inmh, inm); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. */ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; - struct in_multi *inm, *tinm; + struct in_multi *inm; + struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); + SLIST_INIT(&inm_free_tmp); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ - SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); + inm_rele_locked(&inm_free_tmp, inm); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); - SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { - SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); - inm_release_locked(inm); - } + + inm_release_list_deferred(&inm_free_tmp); } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. */ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifsoftc *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. */ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; - IN_MULTI_LOCK_ASSERT(); - error = 0; - + IN_MULTI_LOCK_ASSERT(); /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. */ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && - inm->inm_state == IGMP_LEAVING_MEMBER) - inm_release_locked(inm); - + inm->inm_state == IGMP_LEAVING_MEMBER) { + MPASS(inm->inm_refcount > 1); + inm_rele_locked(NULL, inm); + } inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->inm_scq; mbufq_drain(mq); retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. */ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave 0x%08x on ifp %p(%s)", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving 0x%08x/%s with %d " "pending retransmissions.", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for 0x%08x/%s", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for 0x%08x/%s", __func__, igmp_rec_type_to_str(type), ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&igi->igi_gq) != 0) goto send; ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif #ifdef VIMAGE static void vnet_igmp_init(const void *unused __unused) { netisr_register_vnet(&igmp_nh); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { /* This can happen when we shutdown the entire network stack. */ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister_vnet(&igmp_nh); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_igmp_uninit, NULL); #endif #ifdef DDB DB_SHOW_COMMAND(igi_list, db_show_igi_list) { struct igmp_ifsoftc *igi, *tigi; LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; if (!have_addr) { db_printf("usage: show igi_list \n"); return; } igi_head = (struct _igi_list *)addr; LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { db_printf("igmp_ifsoftc %p:\n", igi); db_printf(" ifp %p\n", igi->igi_ifp); db_printf(" version %u\n", igi->igi_version); db_printf(" v1_timer %u\n", igi->igi_v1_timer); db_printf(" v2_timer %u\n", igi->igi_v2_timer); db_printf(" v3_timer %u\n", igi->igi_v3_timer); db_printf(" flags %#x\n", igi->igi_flags); db_printf(" rv %u\n", igi->igi_rv); db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); - /* SLIST_HEAD(,in_multi) igi_relinmhead */ /* struct mbufq igi_gq; */ db_printf("\n"); } } #endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); break; case MOD_UNLOAD: CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); Index: head/sys/netinet/igmp_var.h =================================================================== --- head/sys/netinet/igmp_var.h (revision 333174) +++ head/sys/netinet/igmp_var.h (revision 333175) @@ -1,233 +1,232 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 * $FreeBSD$ */ #ifndef _NETINET_IGMP_VAR_H_ #define _NETINET_IGMP_VAR_H_ /* * Internet Group Management Protocol (IGMP), * implementation-specific definitions. * * Written by Steve Deering, Stanford, May 1988. * * MULTICAST Revision: 3.5.1.3 */ /* * IGMPv3 protocol statistics. */ struct igmpstat { /* * Structure header (to insulate ABI changes). */ uint32_t igps_version; /* version of this structure */ uint32_t igps_len; /* length of this structure */ /* * Message statistics. */ uint64_t igps_rcv_total; /* total IGMP messages received */ uint64_t igps_rcv_tooshort; /* received with too few bytes */ uint64_t igps_rcv_badttl; /* received with ttl other than 1 */ uint64_t igps_rcv_badsum; /* received with bad checksum */ /* * Query statistics. */ uint64_t igps_rcv_v1v2_queries; /* received IGMPv1/IGMPv2 queries */ uint64_t igps_rcv_v3_queries; /* received IGMPv3 queries */ uint64_t igps_rcv_badqueries; /* received invalid queries */ uint64_t igps_rcv_gen_queries; /* received general queries */ uint64_t igps_rcv_group_queries;/* received group queries */ uint64_t igps_rcv_gsr_queries; /* received group-source queries */ uint64_t igps_drop_gsr_queries; /* dropped group-source queries */ /* * Report statistics. */ uint64_t igps_rcv_reports; /* received membership reports */ uint64_t igps_rcv_badreports; /* received invalid reports */ uint64_t igps_rcv_ourreports; /* received reports for our groups */ uint64_t igps_rcv_nora; /* received w/o Router Alert option */ uint64_t igps_snd_reports; /* sent membership reports */ /* * Padding for future additions. */ uint64_t __igps_pad[4]; }; #define IGPS_VERSION_3 3 /* as of FreeBSD 8.x */ #define IGPS_VERSION3_LEN 168 #ifdef CTASSERT CTASSERT(sizeof(struct igmpstat) == IGPS_VERSION3_LEN); #endif /* * Identifiers for IGMP sysctl nodes */ #define IGMPCTL_STATS 1 /* statistics (read-only) */ #define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) #define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */ /* * IGMP per-group states. */ #define IGMP_NOT_MEMBER 0 /* Can garbage collect in_multi */ #define IGMP_SILENT_MEMBER 1 /* Do not perform IGMP for group */ #define IGMP_REPORTING_MEMBER 2 /* IGMPv1/2/3 we are reporter */ #define IGMP_IDLE_MEMBER 3 /* IGMPv1/2 we reported last */ #define IGMP_LAZY_MEMBER 4 /* IGMPv1/2 other member reporting */ #define IGMP_SLEEPING_MEMBER 5 /* IGMPv1/2 start query response */ #define IGMP_AWAKENING_MEMBER 6 /* IGMPv1/2 group timer will start */ #define IGMP_G_QUERY_PENDING_MEMBER 7 /* IGMPv3 group query pending */ #define IGMP_SG_QUERY_PENDING_MEMBER 8 /* IGMPv3 source query pending */ #define IGMP_LEAVING_MEMBER 9 /* IGMPv3 dying gasp (pending last */ /* retransmission of INCLUDE {}) */ /* * IGMP version tag. */ #define IGMP_VERSION_NONE 0 /* Invalid */ #define IGMP_VERSION_1 1 #define IGMP_VERSION_2 2 #define IGMP_VERSION_3 3 /* Default */ /* * IGMPv3 protocol control variables. */ #define IGMP_RV_INIT 2 /* Robustness Variable */ #define IGMP_RV_MIN 1 #define IGMP_RV_MAX 7 #define IGMP_QI_INIT 125 /* Query Interval (s) */ #define IGMP_QI_MIN 1 #define IGMP_QI_MAX 255 #define IGMP_QRI_INIT 10 /* Query Response Interval (s) */ #define IGMP_QRI_MIN 1 #define IGMP_QRI_MAX 255 #define IGMP_URI_INIT 3 /* Unsolicited Report Interval (s) */ #define IGMP_URI_MIN 0 #define IGMP_URI_MAX 10 #define IGMP_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ #define IGMP_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ #define IGMP_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ #define IGMP_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ #define IGMP_RESPONSE_BURST_INTERVAL (PR_FASTHZ / 2) /* 500ms */ /* * IGMP-specific mbuf flags. */ #define M_IGMPV2 M_PROTO1 /* Packet is IGMPv2 */ #define M_IGMPV3_HDR M_PROTO2 /* Packet has IGMPv3 headers */ #define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ #define M_IGMP_LOOP M_PROTO4 /* transmit on loif, not real ifp */ /* * Default amount of leading space for IGMPv3 to allocate at the * beginning of its mbuf packet chains, to avoid fragmentation and * unnecessary allocation of leading mbufs. */ #define RAOPT_LEN 4 /* Length of IP Router Alert option */ #define IGMP_LEADINGSPACE \ (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report)) /* * Structure returned by net.inet.igmp.ifinfo sysctl. */ struct igmp_ifinfo { uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ uint32_t igi_flags; /* IGMP per-interface flags */ #define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ #define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ uint32_t igi_rv; /* IGMPv3 Robustness Variable */ uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ }; #ifdef _KERNEL #define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val) #define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1) /* * Subsystem lock macros. * The IGMP lock is only taken with IGMP. Currently it is system-wide. * VIMAGE: The lock could be pushed to per-VIMAGE granularity in future. */ #define IGMP_LOCK_INIT() mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF) #define IGMP_LOCK_DESTROY() mtx_destroy(&igmp_mtx) #define IGMP_LOCK() mtx_lock(&igmp_mtx) #define IGMP_LOCK_ASSERT() mtx_assert(&igmp_mtx, MA_OWNED) #define IGMP_UNLOCK() mtx_unlock(&igmp_mtx) #define IGMP_UNLOCK_ASSERT() mtx_assert(&igmp_mtx, MA_NOTOWNED) /* * Per-interface IGMP router version information. */ struct igmp_ifsoftc { LIST_ENTRY(igmp_ifsoftc) igi_link; struct ifnet *igi_ifp; /* pointer back to interface */ uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ uint32_t igi_flags; /* IGMP per-interface flags */ uint32_t igi_rv; /* IGMPv3 Robustness Variable */ uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ - SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ struct mbufq igi_gq; /* general query responses queue */ }; int igmp_change_state(struct in_multi *); void igmp_fasttimo(void); struct igmp_ifsoftc * igmp_domifattach(struct ifnet *); void igmp_domifdetach(struct ifnet *); void igmp_ifdetach(struct ifnet *); int igmp_input(struct mbuf **, int *, int); void igmp_slowtimo(void); SYSCTL_DECL(_net_inet_igmp); #endif /* _KERNEL */ #endif Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 333174) +++ head/sys/netinet/in.c (revision 333175) @@ -1,1513 +1,1508 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, (cmd == SIOCDIFADDR) ? false : true); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); - IN_MULTI_LOCK(); if (ii->ii_allhosts) { - (void)in_leavegroup_locked(ii->ii_allhosts, NULL); + (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } - IN_MULTI_UNLOCK(); } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. */ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifaliasreq ifr; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_RLOCK(ifp); */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* IF_ADDR_RUNLOCK(ifp); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } int in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; int found; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; break; } IF_ADDR_RUNLOCK(ifp); return (found); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { - + IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); + IN_MULTI_UNLOCK(); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { - LIST_HEAD(,in_multi) purgeinms; - struct in_multi *inm, *tinm; + struct in_multi_head purgeinms; + struct in_multi *inm; struct ifmultiaddr *ifma; - LIST_INIT(&purgeinms); - IN_MULTI_LOCK(); + SLIST_INIT(&purgeinms); + IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; - LIST_INSERT_HEAD(&purgeinms, inm, inm_link); + inm_rele_locked(&purgeinms, inm); } IF_ADDR_RUNLOCK(ifp); - LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { - LIST_REMOVE(inm, inm_link); - inm_release_locked(inm); - } + inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); - - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(struct llentry *lle) { LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by the datapath to indicate that * the entry was used. */ static void in_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; LLE_REQ_UNLOCK(lle); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); in_lltable_destroy_lle_unlocked(lle); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } /* cancel timer */ if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rt_addrinfo info; struct sockaddr_in rt_key, rt_mask; struct sockaddr rt_gateway; int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); bzero(&rt_key, sizeof(rt_key)); rt_key.sin_len = sizeof(rt_key); bzero(&rt_mask, sizeof(rt_mask)); rt_mask.sin_len = sizeof(rt_mask); bzero(&rt_gateway, sizeof(rt_gateway)); rt_gateway.sa_len = sizeof(rt_gateway); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); rt_flags = info.rti_flags; /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt_flags & RTF_GATEWAY) { if (!(rt_flags & RTF_HOST) || !info.rti_ifp || info.rti_ifp->if_type != IFT_ETHER || (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { rib_free_info(&info); return (EINVAL); } } rib_free_info(&info); /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; const struct sockaddr_in *l3sin; mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); sa = (const char *)&rt_key; addr = (const char *)l3addr; l3sin = (const struct sockaddr_in *)l3addr; lim = addr + l3sin->sin_len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC char addrbuf[INET_ADDRSTRLEN]; log(LOG_INFO, "IPv4 address: \"%s\" " "is not on the network\n", inet_ntoa_r(l3sin->sin_addr, addrbuf)); #endif return (EINVAL); } } } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { in_lltable_destroy_lle_unlocked(lle); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = in_lltable_mark_used; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/in_mcast.c =================================================================== --- head/sys/netinet/in_mcast.c (revision 333174) +++ head/sys/netinet/in_mcast.c (revision 333175) @@ -1,3007 +1,3081 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include +#include + #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: - * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * - * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly + * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ -struct mtx in_multi_mtx; -MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF); +struct mtx in_multi_list_mtx; +MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); +struct mtx in_multi_free_mtx; +MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); + +struct sx in_multi_sx; +SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); + /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_addmulti() * in_delmulti() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() * * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() * and in_delmulti(). */ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static int imo_grow(struct ip_moptions *); static size_t imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(const struct ip_moptions *, const size_t, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); +static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static void inp_freemoptions_internal(struct ip_moptions *); static void inp_gcmoptions(void *, int); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); static STAILQ_HEAD(, ip_moptions) imo_gc_list = STAILQ_HEAD_INITIALIZER(imo_gc_list); static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif +static struct grouptask free_gtask; +static struct in_multi_head inm_free_list; +static void inm_release_task(void *arg __unused); +static void inm_init(void) +{ + SLIST_INIT(&inm_free_list); + taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); +} + +SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, + inm_init, NULL); + + +void +inm_release_list_deferred(struct in_multi_head *inmh) +{ + + if (SLIST_EMPTY(inmh)) + return; + mtx_lock(&in_multi_free_mtx); + SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); + mtx_unlock(&in_multi_free_mtx); + GROUPTASK_ENQUEUE(&free_gtask); +} + +void +inm_release_deferred(struct in_multi *inm) +{ + struct in_multi_head tmp; + + IN_MULTI_LIST_LOCK_ASSERT(); + MPASS(inm->inm_refcount > 0); + if (--inm->inm_refcount == 0) { + SLIST_INIT(&tmp); + inm->inm_ifma->ifma_protospec = NULL; + SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); + inm_release_list_deferred(&tmp); + } +} + +static void +inm_release_task(void *arg __unused) +{ + struct in_multi_head inm_free_tmp; + struct in_multi *inm, *tinm; + + SLIST_INIT(&inm_free_tmp); + mtx_lock(&in_multi_free_mtx); + SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); + mtx_unlock(&in_multi_free_mtx); + IN_MULTI_LOCK(); + SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { + SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); + MPASS(inm); + inm_release(inm); + } + IN_MULTI_UNLOCK(); +} + /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. - * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. + * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET) { inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct in_multi *inm; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_RLOCK(ifp); inm = inm_lookup_locked(ifp, ina); IF_ADDR_RUNLOCK(ifp); return (inm); } /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int imo_grow(struct ip_moptions *imo) { struct in_multi **nmships; struct in_multi **omships; struct in_mfilter *nmfilters; struct in_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->imo_membership; omfilters = imo->imo_mfilters; oldmax = imo->imo_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IP_MAX_MEMBERSHIPS) { nmships = (struct in_multi **)realloc(omships, sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); nmfilters = (struct in_mfilter *)realloc(omfilters, sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->imo_max_memberships = newmax; imo->imo_membership = nmships; imo->imo_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IPMOPTS); if (nmfilters != NULL) free(nmfilters, M_INMFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_multi **pinm; int idx; int nmships; gsin = (const struct sockaddr_in *)group; /* The imo_membership array may be lazy allocated. */ if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) return (-1); nmships = imo->imo_num_memberships; pinm = &imo->imo_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(const struct ip_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip_msource find; struct in_mfilter *imf; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The imo_mfilters array may be lazy allocated. */ if (imo->imo_mfilters == NULL) return (NULL); imf = &imo->imo_mfilters[gidx]; /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = imo_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->imo_mfilters[gidx].imf_st[1]; ims = imo_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; - + IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); - ++inm->inm_refcount; + inm_acquire_locked(inm); *pinm = inm; - return (0); } - + IN_MULTI_LIST_UNLOCK(); + if (inm != NULL) + return (0); + memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ + IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif - ++inm->inm_refcount; + inm_acquire_locked(inm); *pinm = inm; - IF_ADDR_WUNLOCK(ifp); - return (0); + goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); + IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; - + out_locked: IF_ADDR_WUNLOCK(ifp); + IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ -void -inm_release_locked(struct in_multi *inm) +static void +inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; + struct ifnet *ifp; - IN_MULTI_LOCK_ASSERT(); - CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); - - if (--inm->inm_refcount > 0) { - CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__, - inm->inm_refcount); - return; - } - + MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; + ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); - KASSERT(ifma->ifma_protospec == inm, - ("%s: ifma_protospec != inm", __func__)); - ifma->ifma_protospec = NULL; - + if (ifp) + CURVNET_SET(ifp->if_vnet); inm_purge(inm); - free(inm, M_IPMADDR); if_delmulti_ifma(ifma); + if (ifp) + CURVNET_RESTORE(); } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; + IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } - + IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } -out_inm_release: + out_inm_release: + IN_MULTI_LIST_UNLOCK(); if (error) { + CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); - inm_release_locked(inm); + inm_release_deferred(inm); } else { *pinm = inm; } return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; error = 0; IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_UNLOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); + inm_release_deferred(inm); + IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); - inm_release_locked(inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Join an IPv4 multicast group in (*,G) exclusive mode. * The group must be a 224.0.0.0/24 link-scope group. * This KPI is for legacy kernel consumers only. */ struct in_multi * in_addmulti(struct in_addr *ap, struct ifnet *ifp) { struct in_multi *pinm; int error; #ifdef INVARIANTS char addrbuf[INET_ADDRSTRLEN]; #endif KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa_r(*ap, addrbuf))); error = in_joingroup(ifp, ap, NULL, &pinm); if (error != 0) pinm = NULL; return (pinm); } /* - * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. - * This KPI is for legacy kernel consumers only. - */ -void -in_delmulti(struct in_multi *inm) -{ - - (void)in_leavegroup(inm, NULL); -} -/*#endif*/ - -/* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } KASSERT(imo->imo_mfilters != NULL, ("%s: imo_mfilters not allocated", __func__)); imf = &imo->imo_mfilters[idx]; inm = imo->imo_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); - + IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); - + IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; struct in_multi **immp; struct in_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, M_INMFILTER, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; imo->imo_num_memberships = 0; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->imo_mfilters = imfp; INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imfp, M_INMFILTER); free(immp, M_IPMOPTS); free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } /* * Discard the IP multicast options (and source filters). To minimize * the amount of work done while holding locks such as the INP's * pcbinfo lock (which is used in the receive path), the free * operation is performed asynchronously in a separate task. * * SMPng: NOTE: assumes INP write lock is held. */ void inp_freemoptions(struct ip_moptions *imo) { + if (imo == NULL) + return; KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); taskqueue_enqueue(taskqueue_thread, &imo_gc_task); } static void inp_freemoptions_internal(struct ip_moptions *imo) { struct in_mfilter *imf; size_t idx, nmships; nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); (void)in_leavegroup(imo->imo_membership[idx], imf); if (imf) imf_purge(imf); } if (imo->imo_mfilters) free(imo->imo_mfilters, M_INMFILTER); free(imo->imo_membership, M_IPMOPTS); free(imo, M_IPMOPTS); } static void inp_gcmoptions(void *context, int pending) { struct ip_moptions *imo; - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); while (!STAILQ_EMPTY(&imo_gc_list)) { imo = STAILQ_FIRST(&imo_gc_list); STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); inp_freemoptions_internal(imo); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); } - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EINVAL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->imo_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; ifa_free(&ia->ia_ifa); } } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found. * * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; struct nhop4_basic nh4; uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); } else { fibnum = inp ? inp->inp_inc.inc_fibnum : 0; if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) ifp = nh4.nh_ifp; else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; break; } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Do argument switcharoo from ip_mreq into * ip_mreq_source to avoid using two instances. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->imo_num_memberships == imo->imo_max_memberships) { error = imo_grow(imo); if (error) goto out_inp_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->imo_num_memberships; imo->imo_membership[idx] = NULL; imo->imo_num_memberships++; KASSERT(imo->imo_mfilters != NULL, ("%s: imf_mfilters vector was not allocated", __func__)); imf = &imo->imo_mfilters[idx]; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_imo_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at IGMP layer. */ + in_pcbref(inp); + INP_WUNLOCK(inp); IN_MULTI_LOCK(); if (is_new) { error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); goto out_imo_free; } imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", - __func__); + __func__); + IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); goto out_in_multi_locked; } } out_in_multi_locked: IN_MULTI_UNLOCK(); - - INP_WLOCK_ASSERT(inp); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (ENXIO); if (error) { imf_rollback(imf); if (is_new) imf_purge(imf); else imf_reap(imf); } else { imf_commit(imf); } out_imo_free: if (error && is_new) { imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; size_t idx; int error, is_final; ifp = NULL; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) INADDR_TO_IFP(mreqs.imr_interface, ifp); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); /* * Find the membership in the membership array. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { imf_leave(imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); } } out_in_multi_locked: IN_MULTI_UNLOCK(); if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); if (is_final) { /* Remove the gap in the membership and filter array. */ for (++idx; idx < imo->imo_num_memberships; ++idx) { imo->imo_membership[idx-1] = imo->imo_membership[idx]; imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; } imo->imo_num_memberships--; } out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { ifp = ifnet_byindex(mreqn.imr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { INADDR_TO_IFP(addr, ifp); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); idx = imo_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->imo_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imo->imo_membership[idx]; imf = &imo->imo_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) return (retval); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { "un", "in", "ex" }; static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 333174) +++ head/sys/netinet/in_pcb.c (revision 333175) @@ -1,3093 +1,3112 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #ifdef TCPHPTS #include #endif #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #include static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ short inp_so_options(const struct inpcb *inp) { short so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; +#ifdef INET6 + struct ip6_moptions *im6o = NULL; +#endif +#ifdef INET + struct ip_moptions *imo = NULL; +#endif KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return; } #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); +#ifdef INET + imo = inp->inp_moptions; + inp->inp_moptions = NULL; +#endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif INP_LIST_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); - if (inp->in6p_moptions != NULL) - ip6_freemoptions(inp->in6p_moptions); + im6o = inp->in6p_moptions; + inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); -#ifdef INET - if (inp->inp_moptions != NULL) - inp_freemoptions(inp->inp_moptions); -#endif RO_INVALIDATE_CACHE(&inp->inp_route); inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); +#if defined(INET) && defined(INET6) + if (imo == NULL && im6o == NULL) + return; +#endif + INP_INFO_WUNLOCK(pcbinfo); +#ifdef INET6 + ip6_freemoptions(im6o); +#endif +#ifdef INET + inp_freemoptions(imo); +#endif + INP_INFO_WLOCK(pcbinfo); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { - in_delmulti(imo->imo_membership[i]); + IN_MULTI_LOCK_ASSERT(); + in_leavegroup_locked(imo->imo_membership[i], NULL); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; bool locked; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); else bzero(&xi->xi_socket, sizeof(struct xsocket)); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) return (EOPNOTSUPP); error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, }; int error; INP_WLOCK_ASSERT(inp); if (inp->inp_snd_tag != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); /* * At success increment the refcount on * the send tag's network interface: */ if (error == 0) if_ref(inp->inp_snd_tag->ifp); } return (error); } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; struct ifnet *ifp; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; ifp = mst->ifp; if (ifp == NULL) return; /* * If the device was detached while we still had reference(s) * on the ifp, we assume if_snd_tag_free() was replaced with * stubs. */ ifp->if_snd_tag_free(mst); /* release reference count on network interface */ if_rele(ifp); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; int error; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. */ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { struct socket *socket; bool did_upgrade; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #endif /* RATELIMIT */ Index: head/sys/netinet/in_var.h =================================================================== --- head/sys/netinet/in_var.h (revision 333174) +++ head/sys/netinet/in_var.h (revision 333175) @@ -1,401 +1,429 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ /* * Argument structure for SIOCAIFADDR. */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ struct sockaddr_in ifra_addr; struct sockaddr_in ifra_broadaddr; #define ifra_dstaddr ifra_broadaddr struct sockaddr_in ifra_mask; int ifra_vhid; }; #ifdef _KERNEL #include #include #include struct igmp_ifsoftc; struct in_multi; struct lltable; +SLIST_HEAD(in_multi_head, in_multi); /* * IPv4 per-interface state. */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; /* * Interface address, Internet version. One of these structures * is allocated for each Internet address on an interface. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ struct in_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags /* ia_subnet{,mask} in host order */ u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ struct callout ia_garp_timer; /* timer for retransmitting GARPs */ int ia_garp_count; /* count of retransmitted GARPs */ }; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) #define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt /* * Hash table for IP addresses. */ TAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) #define V_in_ifaddrhead VNET(in_ifaddrhead) #define V_in_ifaddrhmask VNET(in_ifaddrhmask) #define INADDR_NHASH_LOG2 9 #define INADDR_NHASH (1 << INADDR_NHASH_LOG2) #define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) extern struct rmlock in_ifaddr_lock; #define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) #define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) #define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) #define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) #define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) #define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) * corresponding to one of our IP addresses (in_addr). */ #define INADDR_TO_IFADDR(addr, ia) \ /* struct in_addr addr; */ \ /* struct in_ifaddr *ia; */ \ do { \ \ LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ break; \ } while (0) /* * Macro for finding the interface (ifnet structure) corresponding to one * of our IP addresses. */ #define INADDR_TO_IFP(addr, ifp) \ /* struct in_addr addr; */ \ /* struct ifnet *ifp; */ \ { \ struct in_ifaddr *ia; \ \ INADDR_TO_IFADDR(addr, ia); \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ } /* * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ #define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ /* struct rm_priotracker *t; */ \ do { \ IN_IFADDR_RLOCK((t)); \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ ifa_ref(&(ia)->ia_ifa); \ IN_IFADDR_RUNLOCK((t)); \ } while (0) /* * Legacy IPv4 IGMP per-link structure. */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ SLIST_ENTRY(router_info) rti_list; }; /* * IPv4 multicast IGMP-layer source entry. */ struct ip_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ struct ims_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } ims_st[2]; /* state at t0, t1 */ uint8_t ims_stp; /* pending query */ }; /* * IPv4 multicast PCB-layer source entry. */ struct in_msource { RB_ENTRY(ip_msource) ims_link; /* RB tree links */ in_addr_t ims_haddr; /* host byte order */ uint8_t imsl_st[2]; /* state before/at commit */ }; RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ static __inline int ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) { if (a->ims_haddr < b->ims_haddr) return (-1); if (a->ims_haddr == b->ims_haddr) return (0); return (1); } RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); /* * IPv4 multicast PCB-layer group filter descriptor. */ struct in_mfilter { struct ip_msource_tree imf_sources; /* source list for (S,G) */ u_long imf_nsrc; /* # of source entries */ uint8_t imf_st[2]; /* state before/at commit */ }; /* * IPv4 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When IGMPv3 is active, inm_timer is the response to group query timer. * The state-change timer inm_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: inm_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in_multi { LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ void *inm_rti; /* unused, legacy field */ u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ uint16_t inm_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing IGMPv3 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct inm_st { uint16_t iss_fmode; /* IGMP filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } inm_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, uint8_t t) { t = !!t; if (inm->inm_st[t].iss_ex > 0 && inm->inm_st[t].iss_ex == ims->ims_st[t].ex) return (MCAST_EXCLUDE); else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); #endif /* * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ -extern struct mtx in_multi_mtx; -#define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) -#define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) -#define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) -#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) +extern struct mtx in_multi_list_mtx; +extern struct sx in_multi_sx; +#define IN_MULTI_LIST_LOCK() mtx_lock(&in_multi_list_mtx) +#define IN_MULTI_LIST_UNLOCK() mtx_unlock(&in_multi_list_mtx) +#define IN_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_OWNED) +#define IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED) + +#define IN_MULTI_LOCK() sx_xlock(&in_multi_sx) +#define IN_MULTI_UNLOCK() sx_xunlock(&in_multi_sx) +#define IN_MULTI_LOCK_ASSERT() sx_assert(&in_multi_sx, SA_XLOCKED) +#define IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED) + /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); ++inm->inm_refcount; } +static __inline void +inm_acquire(struct in_multi *inm) +{ + IN_MULTI_LIST_LOCK(); + inm_acquire_locked(inm); + IN_MULTI_LIST_UNLOCK(); +} + +static __inline void +inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm) +{ + MPASS(inm->inm_refcount > 0); + IN_MULTI_LIST_LOCK_ASSERT(); + + if (--inm->inm_refcount == 0) { + MPASS(inmh != NULL); + inm->inm_ifma->ifma_protospec = NULL; + SLIST_INSERT_HEAD(inmh, inm, inm_nrele); + } +} + /* * Return values for imo_multi_filter(). */ #define MCAST_PASS 0 /* Pass */ #define MCAST_NOTGMEMBER 1 /* This host not a member of group */ #define MCAST_NOTSMEMBER 2 /* This host excluded source */ #define MCAST_MUTED 3 /* [deprecated] */ struct rtentry; struct route; struct ip_moptions; struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); -void inm_release(struct in_multi *); -void inm_release_locked(struct in_multi *); +void inm_release_deferred(struct in_multi *); +void inm_release_list_deferred(struct in_multi_head *); struct in_multi * - in_addmulti(struct in_addr *, struct ifnet *); -void in_delmulti(struct in_multi *); +in_addmulti(struct in_addr *, struct ifnet *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void in_ifscrub_all(void); void ip_input(struct mbuf *); void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_tryforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); #endif /* _KERNEL */ /* INET6 stuff */ #include #endif /* _NETINET_IN_VAR_H_ */ Index: head/sys/netinet/ip_carp.c =================================================================== --- head/sys/netinet/ip_carp.c (revision 333174) +++ head/sys/netinet/ip_carp.c (revision 333175) @@ -1,2237 +1,2237 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Michael Shalayeff. * Copyright (c) 2003 Ryan McBride. * Copyright (c) 2011 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #include #include #include #endif #include static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ struct sockaddr_dl sc_addr; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ #endif struct mtx sc_mtx; int sc_vhid; int sc_advskew; int sc_advbase; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; int sc_suppress; int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 int sc_init_counter; uint64_t sc_counter; /* authentication */ #define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; struct carp_if { #ifdef INET int cif_naddrs; #endif #ifdef INET6 int cif_naddrs6; #endif TAILQ_HEAD(, carp_softc) cif_vrs; #ifdef INET struct ip_moptions cif_imo; #endif #ifdef INET6 struct ip6_moptions cif_im6o; #endif struct ifnet *cif_ifp; struct mtx cif_mtx; uint32_t cif_flags; #define CIF_PROMISC 0x00000001 }; #define CARP_INET 0 #define CARP_INET6 1 static int proto_reg[] = {-1, -1}; /* * Brief design of carp(4). * * Any carp-capable ifnet may have a list of carp softcs hanging off * its ifp->if_carp pointer. Each softc represents one unique virtual * host id, or vhid. The softc has a back pointer to the ifnet. All * softcs are joined in a global list, which has quite limited use. * * Any interface address that takes part in CARP negotiation has a * pointer to the softc of its vhid, ifa->ifa_carp. That could be either * AF_INET or AF_INET6 address. * * Although, one can get the softc's backpointer to ifnet and traverse * through its ifp->if_addrhead queue to find all interface addresses * involved in CARP, we keep a growable array of ifaddr pointers. This * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that * do calls into the network stack, thus avoiding LORs. * * Locking: * * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), * callout-driven events and ioctl()s. * * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx. * To traverse the global list we use the mutex carp_mtx. * * Known issues with locking: * * - Sending ad, we put the pointer to the softc in an mtag, and no reference * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ /* Accept incoming CARP packets. */ static VNET_DEFINE(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) /* Preempt slower nodes. */ static VNET_DEFINE(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ static VNET_DEFINE(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ static VNET_DEFINE(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_log), 0, "CARP log level"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_demote_adj_sysctl, "I", "Adjust demotion factor (skew of advskew)"); SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_ifdown_adj), 0, "Interface down demotion factor adjustment"); VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); VNET_PCPUSTAT_SYSINIT(carpstats); VNET_PCPUSTAT_SYSUNINIT(carpstats); #define CARPSTATS_ADD(name, val) \ counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ sizeof(uint64_t)], (val)) #define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) #define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) #define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ NULL, MTX_DEF) #define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) #define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) #define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) #define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CIF_FREE(cif) do { \ CIF_LOCK(cif); \ if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ carp_free_if(cif); \ else \ CIF_UNLOCK(cif); \ } while (0) #define CARP_LOG(...) do { \ if (V_carp_log > 0) \ log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) do { \ if (V_carp_log > 1) \ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) #define IFNET_FOREACH_IFA(ifp, ifa) \ IF_ADDR_LOCK_ASSERT(ifp); \ TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ if ((ifa)->ifa_carp != NULL) #define CARP_FOREACH_IFA(sc, ifa) \ CARP_LOCK_ASSERT(sc); \ for (int _i = 0; \ _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ ((ifa) = sc->sc_ifas[_i]) != NULL; \ ++_i) #define IFNET_FOREACH_CARP(ifp, sc) \ KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) || \ sx_xlocked(&carp_sx), ("cif_vrs not locked")); \ TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) #define DEMOTE_ADVSKEW(sc) \ (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \ CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion)) static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); static struct carp_softc *carp_alloc(struct ifnet *); static void carp_destroy(struct carp_softc *); static struct carp_if *carp_alloc_if(struct ifnet *); static void carp_free_if(struct carp_if *); static void carp_set_state(struct carp_softc *, int, const char* reason); static void carp_sc_state(struct carp_softc *); static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *, const char* reason); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); static void carp_addroute(struct carp_softc *); static void carp_ifa_addroute(struct ifaddr *); static void carp_delroute(struct carp_softc *); static void carp_ifa_delroute(struct ifaddr *); static void carp_send_ad_all(void *, int); static void carp_demote_adj(int, char *); static LIST_HEAD(, carp_softc) carp_list; static struct mtx carp_mtx; static struct sx carp_sx; static struct task carp_sendall_task = TASK_INITIALIZER(0, carp_send_ad_all, NULL); static void carp_hmac_prepare(struct carp_softc *sc) { uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET struct in_addr last, cur, in; #endif #ifdef INET6 struct in6_addr last6, cur6, in6; #endif CARP_LOCK_ASSERT(sc); /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); #ifdef INET cur.s_addr = 0; do { found = 0; last = cur; cur.s_addr = 0xffffffff; CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && ntohl(in.s_addr) < ntohl(cur.s_addr)) { cur.s_addr = in.s_addr; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); #endif /* INET */ #ifdef INET6 memset(&cur6, 0, sizeof(cur6)); do { found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; if (ifa->ifa_addr->sa_family == AF_INET6 && memcmp(&in6, &last6, sizeof(in6)) > 0 && memcmp(&in6, &cur6, sizeof(in6)) < 0) { cur6 = in6; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); #endif /* INET6 */ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; } static void carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; CARP_LOCK_ASSERT(sc); /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); SHA1Final(md, &sha1ctx); /* outer hash */ SHA1Init(&sha1ctx); SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sha1ctx, md, 20); SHA1Final(md, &sha1ctx); } static int carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ #ifdef INET int carp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; iplen = *offp; *mp = NULL; CARPSTATS_INC(carps_ipackets); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); CARP_DEBUG("%s: pullup failed\n", __func__); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ch = (struct carp_header *)((char *)ip + iplen); /* * verify that the received packet length is * equal to the CARP header */ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if ((m = m_pullup(m, len)) == NULL) { CARPSTATS_INC(carps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; if (in_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iplen; carp_input_c(m, ch, AF_INET); return (IPPROTO_DONE); } #endif #ifdef INET6 int carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct carp_header *ch; u_int len; CARPSTATS_INC(carps_ipackets6); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); CARP_DEBUG("%s: packet received on non-carp interface: %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that we have a complete carp packet */ len = m->m_len; IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } /* verify the CARP checksum */ m->m_data += *offp; if (in_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= *offp; carp_input_c(m, ch, AF_INET6); return (IPPROTO_DONE); } #endif /* INET6 */ /* * This routine should not be necessary at all, but some switches * (VMWare ESX vswitches) can echo our own packets back at us, * and we must ignore them or they will cause us to drop out of * MASTER mode. * * We cannot catch all cases of network loops. Instead, what we * do here is catch any packet that arrives with a carp header * with a VHID of 0, that comes from an address that is our own. * These packets are by definition "from us" (even if they are from * a misconfigured host that is pretending to be us). * * The VHID test is outside this mini-function. */ static int carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af) { #ifdef INET struct ip *ip4; struct in_addr in4; #endif #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; #endif switch (af) { #ifdef INET case AF_INET: ip4 = mtod(m, struct ip *); in4 = ifatoia(ifa)->ia_addr.sin_addr; return (in4.s_addr == ip4->ip_src.s_addr); #endif #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); in6 = ifatoia6(ifa)->ia_addr.sin6_addr; return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0); #endif default: break; } return (0); } static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa, *match; struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; int error; /* * Verify that the VHID is valid on the receiving interface. * * There should be just one match. If there are none * the VHID is not valid and we drop the packet. If * there are multiple VHID matches, take just the first * one, for compatibility with previous code. While we're * scanning, check for obvious loops in the network topology * (these should never happen, and as noted above, we may * miss real loops; this is just a double-check). */ IF_ADDR_RLOCK(ifp); error = 0; match = NULL; IFNET_FOREACH_IFA(ifp, ifa) { if (match == NULL && ifa->ifa_carp != NULL && ifa->ifa_addr->sa_family == af && ifa->ifa_carp->sc_vhid == ch->carp_vhid) match = ifa; if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af)) error = ELOOP; } ifa = error ? NULL : match; if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); if (ifa == NULL) { if (error == ELOOP) { CARP_DEBUG("dropping looped packet on interface %s\n", ifp->if_xname); CARPSTATS_INC(carps_badif); /* ??? */ } else { CARPSTATS_INC(carps_badvhid); } m_freem(m); return; } /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); ifa_free(ifa); m_freem(m); return; } sc = ifa->ifa_carp; CARP_LOCK(sc); ifa_free(ifa); if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, sc->sc_vhid, ifp->if_xname); goto out; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = 0; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. */ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "more frequent advertisement received"); carp_setrun(sc, 0); carp_delroute(sc); } break; case BACKUP: /* * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. */ if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "preempting a slower master"); break; } /* * If the master is going to advertise at such a low frequency * that he's guaranteed to time out, we'd might as well just * treat him as timed out now. */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "master will time out"); break; } /* * Otherwise, we reset the counter and wait for the next * advertisement. */ carp_setrun(sc, af); break; } out: CARP_UNLOCK(sc); m_freem(m); } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); sc->sc_counter = sc->sc_counter << 32; sc->sc_counter += arc4random(); } else sc->sc_counter++; ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT)) == NULL) { m_freem(m); CARPSTATS_INC(carps_onomem); return (ENOMEM); } bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (0); } /* * To avoid LORs and possible recursions this function shouldn't * be called directly, but scheduled via taskqueue. */ static void carp_send_ad_all(void *ctx __unused, int pending __unused) { struct carp_softc *sc; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) if (sc->sc_state == MASTER) { CARP_LOCK(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } /* Send a periodic advertisement, executed in callout context. */ static void carp_send_ad(void *v) { struct carp_softc *sc = v; CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } static void carp_send_ad_error(struct carp_softc *sc, int error) { if (error) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send error %d on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, error, sc->sc_carpdev->if_xname); carp_demote_adj(V_carp_senderr_adj, msg); } sc->sc_sendad_success = 0; } else { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS && ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { static const char fmt[] = "send ok on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, sc->sc_carpdev->if_xname); carp_demote_adj(-V_carp_senderr_adj, msg); sc->sc_sendad_errors = 0; } else sc->sc_sendad_errors = 0; } } /* * Pick the best ifaddr on the given ifp for sending CARP * advertisements. * * "Best" here is defined by ifa_preferred(). This function is much * much like ifaof_ifpforaddr() except that we just use ifa_preferred(). * * (This could be simplified to return the actual address, except that * it has a different format in AF_INET and AF_INET6.) */ static struct ifaddr * carp_best_ifa(int af, struct ifnet *ifp) { struct ifaddr *ifa, *best; if (af >= AF_MAX) return (NULL); best = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; } IF_ADDR_RUNLOCK(ifp); if (best != NULL) ifa_ref(best); return (best); } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; int len, advskew; CARP_LOCK_ASSERT(sc); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; /* XXXGL: OpenBSD picks first ifaddr with needed family. */ #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(len); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip_fillid(ip); ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; /* set the source address */ ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); ifa_free(ifa); } else /* This should never happen with IPv6. */ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ resched: callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } static void carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_addroute(ifa); } static void carp_ifa_addroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: in_addprefix(ifatoia(ifa), RTF_UP); ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); break; #endif #ifdef INET6 case AF_INET6: ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_add_ifa_lle(ifatoia6(ifa)); break; #endif } } static void carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_delroute(ifa); } static void carp_ifa_delroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); in_scrubprefix(ifatoia(ifa), LLE_STATIC); break; #endif #ifdef INET6 case AF_INET6: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_rem_ifa_lle(ifatoia6(ifa), 1); break; #endif } } int carp_master(struct ifaddr *ifa) { struct carp_softc *sc = ifa->ifa_carp; return (sc->sc_state == MASTER); } #ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; struct in_addr addr; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr)); } } int carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { *enaddr = LLADDR(&sc->sc_addr); return (1); } return (0); } #endif #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } /* * Returns ifa in case it's a carp address and it is MASTER, or if the address * matches and is not a carp address. Returns NULL otherwise. */ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { struct ifaddr *ifa; ifa = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) continue; if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) ifa = NULL; else ifa_ref(ifa); break; } IF_ADDR_RUNLOCK(ifp); return (ifa); } caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; IF_ADDR_RUNLOCK(ifp); mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); if (mtag == NULL) /* Better a bit than nothing. */ return (LLADDR(&sc->sc_addr)); bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (LLADDR(&sc->sc_addr)); } IF_ADDR_RUNLOCK(ifp); return (NULL); } #endif /* INET6 */ int carp_forus(struct ifnet *ifp, u_char *dhost) { struct carp_softc *sc; uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (0); CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN)) { CARP_UNLOCK(sc); CIF_UNLOCK(ifp->if_carp); return (1); } CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); return (0); } /* Master down timeout event, executed in callout context. */ static void carp_master_down(void *v) { struct carp_softc *sc = v; CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); if (sc->sc_state == BACKUP) { carp_master_down_locked(sc, "master timed out"); } CURVNET_RESTORE(); CARP_UNLOCK(sc); } static void carp_master_down_locked(struct carp_softc *sc, const char *reason) { CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER, reason); carp_send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); #endif carp_setrun(sc, 0); carp_addroute(sc); break; case INIT: case MASTER: #ifdef INVARIANTS panic("carp: VHID %u@%s: master_down event in %s state\n", sc->sc_vhid, sc->sc_carpdev->if_xname, sc->sc_state ? "MASTER" : "INIT"); #endif break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. */ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; CARP_LOCK_ASSERT(sc); if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)) return; switch (sc->sc_state) { case INIT: carp_set_state(sc, BACKUP, "initialization complete"); carp_setrun(sc, 0); break; case BACKUP: callout_stop(&sc->sc_ad_tmo); tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); #endif break; } break; case MASTER: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); break; } } /* * Setup multicast structures. */ static int carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { struct ifnet *ifp = cif->cif_ifp; int error = 0; switch (sa) { #ifdef INET case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; struct in_addr addr; if (imo->imo_membership) return (0); imo->imo_membership = (struct in_multi **)malloc( (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, M_WAITOK); imo->imo_mfilters = NULL; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, &imo->imo_membership[0])) != 0) { free(imo->imo_membership, M_CARP); break; } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_addr in6; struct in6_multi *in6m; if (im6o->im6o_membership) return (0); im6o->im6o_membership = (struct in6_multi **)malloc( (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, M_ZERO | M_WAITOK); im6o->im6o_mfilters = NULL; im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { free(im6o->im6o_membership, M_CARP); break; } in6m = NULL; - if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) { free(im6o->im6o_membership, M_CARP); break; } im6o->im6o_membership[0] = in6m; im6o->im6o_num_memberships++; /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { - in6_mc_leave(im6o->im6o_membership[0], NULL); + in6_leavegroup(im6o->im6o_membership[0], NULL); free(im6o->im6o_membership, M_CARP); break; } in6m = NULL; - if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { - in6_mc_leave(im6o->im6o_membership[0], NULL); + if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) { + in6_leavegroup(im6o->im6o_membership[0], NULL); free(im6o->im6o_membership, M_CARP); break; } im6o->im6o_membership[1] = in6m; im6o->im6o_num_memberships++; break; } #endif } return (error); } /* * Free multicast structures. */ static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { sx_assert(&carp_sx, SA_XLOCKED); switch (sa) { #ifdef INET case AF_INET: if (cif->cif_naddrs == 0) { struct ip_moptions *imo = &cif->cif_imo; in_leavegroup(imo->imo_membership[0], NULL); KASSERT(imo->imo_mfilters == NULL, ("%s: imo_mfilters != NULL", __func__)); free(imo->imo_membership, M_CARP); imo->imo_membership = NULL; } break; #endif #ifdef INET6 case AF_INET6: if (cif->cif_naddrs6 == 0) { struct ip6_moptions *im6o = &cif->cif_im6o; - in6_mc_leave(im6o->im6o_membership[0], NULL); - in6_mc_leave(im6o->im6o_membership[1], NULL); + in6_leavegroup(im6o->im6o_membership[0], NULL); + in6_leavegroup(im6o->im6o_membership[1], NULL); KASSERT(im6o->im6o_mfilters == NULL, ("%s: im6o_mfilters != NULL", __func__)); free(im6o->im6o_membership, M_CARP); im6o->im6o_membership = NULL; } break; #endif } } int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &sc, sizeof(sc)); /* Set the source MAC address to the Virtual Router MAC Address. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_BRIDGE: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = sc->sc_vhid; } break; default: printf("%s: carp is not supported for the %d interface type\n", ifp->if_xname, ifp->if_type); return (EOPNOTSUPP); } return (0); } static struct carp_softc* carp_alloc(struct ifnet *ifp) { struct carp_softc *sc; struct carp_if *cif; sx_assert(&carp_sx, SA_XLOCKED); if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); sc->sc_advbase = CARP_DFLTINTV; sc->sc_vhid = -1; /* required setting */ sc->sc_init_counter = 1; sc->sc_state = INIT; sc->sc_ifasiz = sizeof(struct ifaddr *); sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); sc->sc_carpdev = ifp; CARP_LOCK_INIT(sc); #ifdef INET callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif #ifdef INET6 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); CIF_LOCK(cif); TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carp_list, sc, sc_next); mtx_unlock(&carp_mtx); return (sc); } static void carp_grow_ifas(struct carp_softc *sc) { struct ifaddr **new; new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); CARP_LOCK(sc); bcopy(sc->sc_ifas, new, sc->sc_ifasiz); free(sc->sc_ifas, M_CARP); sc->sc_ifas = new; sc->sc_ifasiz *= 2; CARP_UNLOCK(sc); } static void carp_destroy(struct carp_softc *sc) { struct ifnet *ifp = sc->sc_carpdev; struct carp_if *cif = ifp->if_carp; sx_assert(&carp_sx, SA_XLOCKED); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); CARP_UNLOCK(sc); CIF_LOCK(cif); TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); callout_drain(&sc->sc_ad_tmo); #ifdef INET callout_drain(&sc->sc_md_tmo); #endif #ifdef INET6 callout_drain(&sc->sc_md6_tmo); #endif CARP_LOCK_DESTROY(sc); free(sc->sc_ifas, M_CARP); free(sc, M_CARP); } static struct carp_if* carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; int error; cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if ((error = ifpromisc(ifp, 1)) != 0) printf("%s: ifpromisc(%s) failed: %d\n", __func__, ifp->if_xname, error); else cif->cif_flags |= CIF_PROMISC; CIF_LOCK_INIT(cif); cif->cif_ifp = ifp; TAILQ_INIT(&cif->cif_vrs); IF_ADDR_WLOCK(ifp); ifp->if_carp = cif; if_ref(ifp); IF_ADDR_WUNLOCK(ifp); return (cif); } static void carp_free_if(struct carp_if *cif) { struct ifnet *ifp = cif->cif_ifp; CIF_LOCK_ASSERT(cif); KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", __func__)); IF_ADDR_WLOCK(ifp); ifp->if_carp = NULL; IF_ADDR_WUNLOCK(ifp); CIF_LOCK_DESTROY(cif); if (cif->cif_flags & CIF_PROMISC) ifpromisc(ifp, 0); if_rele(ifp); free(cif, M_CARP); } static void carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) { CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; carpr->carpr_advbase = sc->sc_advbase; carpr->carpr_advskew = sc->sc_advskew; if (priv) bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); else bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); CARP_UNLOCK(sc); } int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; struct ifnet *ifp; struct carp_softc *sc = NULL; int error = 0, locked = 0; if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr))) return (error); ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) return (ENXIO); switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: break; default: error = EOPNOTSUPP; goto out; } if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } sx_xlock(&carp_sx); switch (cmd) { case SIOCSVH: if ((error = priv_check(td, PRIV_NETINET_CARP))) break; if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { error = EINVAL; break; } if (ifp->if_carp) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; } if (sc == NULL) { sc = carp_alloc(ifp); CARP_LOCK(sc); sc->sc_vhid = carpr.carpr_vhid; LLADDR(&sc->sc_addr)[0] = 0; LLADDR(&sc->sc_addr)[1] = 0; LLADDR(&sc->sc_addr)[2] = 0x5e; LLADDR(&sc->sc_addr)[3] = 0; LLADDR(&sc->sc_addr)[4] = 1; LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; } else CARP_LOCK(sc); locked = 1; if (carpr.carpr_advbase > 0) { if (carpr.carpr_advbase > 255 || carpr.carpr_advbase < CARP_DFLTINTV) { error = EINVAL; break; } sc->sc_advbase = carpr.carpr_advbase; } if (carpr.carpr_advskew >= 255) { error = EINVAL; break; } sc->sc_advskew = carpr.carpr_advskew; if (carpr.carpr_key[0] != '\0') { bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); carp_hmac_prepare(sc); } if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { switch (carpr.carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "user requested via ifconfig"); carp_setrun(sc, 0); carp_delroute(sc); break; case MASTER: carp_master_down_locked(sc, "user requested via ifconfig"); break; default: break; } } break; case SIOCGVH: { int priveleged; if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { error = EINVAL; break; } if (carpr.carpr_count < 1) { error = EMSGSIZE; break; } if (ifp->if_carp == NULL) { error = ENOENT; break; } priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); if (carpr.carpr_vhid != 0) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; if (sc == NULL) { error = ENOENT; break; } carp_carprcp(&carpr, sc, priveleged); error = copyout(&carpr, ifr_data_get_ptr(ifr), sizeof(carpr)); } else { int i, count; count = 0; IFNET_FOREACH_CARP(ifp, sc) count++; if (count > carpr.carpr_count) { CIF_UNLOCK(ifp->if_carp); error = EMSGSIZE; break; } i = 0; IFNET_FOREACH_CARP(ifp, sc) { carp_carprcp(&carpr, sc, priveleged); carpr.carpr_count = count; error = copyout(&carpr, (caddr_t)ifr_data_get_ptr(ifr) + (i * sizeof(carpr)), sizeof(carpr)); if (error) { CIF_UNLOCK(ifp->if_carp); break; } i++; } } break; } default: error = EINVAL; } sx_xunlock(&carp_sx); out: if (locked) CARP_UNLOCK(sc); if_rele(ifp); return (error); } static int carp_get_vhid(struct ifaddr *ifa) { if (ifa == NULL || ifa->ifa_carp == NULL) return (0); return (ifa->ifa_carp->sc_vhid); } int carp_attach(struct ifaddr *ifa, int vhid) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc; int index, error; KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif break; default: return (EPROTOTYPE); } sx_xlock(&carp_sx); if (ifp->if_carp == NULL) { sx_xunlock(&carp_sx); return (ENOPROTOOPT); } IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == vhid) break; if (sc == NULL) { sx_xunlock(&carp_sx); return (ENOENT); } error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); if (error) { CIF_FREE(cif); sx_xunlock(&carp_sx); return (error); } index = sc->sc_naddrs + sc->sc_naddrs6 + 1; if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) carp_grow_ifas(sc); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs++; sc->sc_naddrs++; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6++; sc->sc_naddrs6++; break; #endif } ifa_ref(ifa); CARP_LOCK(sc); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); sx_xunlock(&carp_sx); return (0); } void carp_detach(struct ifaddr *ifa, bool keep_cif) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc = ifa->ifa_carp; int i, index; KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); sx_xlock(&carp_sx); CARP_LOCK(sc); /* Shift array. */ index = sc->sc_naddrs + sc->sc_naddrs6; for (i = 0; i < index; i++) if (sc->sc_ifas[i] == ifa) break; KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); for (; i < index - 1; i++) sc->sc_ifas[i] = sc->sc_ifas[i+1]; sc->sc_ifas[index - 1] = NULL; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs--; sc->sc_naddrs--; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6--; sc->sc_naddrs6--; break; #endif } carp_ifa_delroute(ifa); carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); ifa->ifa_carp = NULL; ifa_free(ifa); carp_hmac_prepare(sc); carp_sc_state(sc); if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) carp_destroy(sc); else CARP_UNLOCK(sc); if (!keep_cif) CIF_FREE(cif); sx_xunlock(&carp_sx); } static void carp_set_state(struct carp_softc *sc, int state, const char *reason) { CARP_LOCK_ASSERT(sc); if (sc->sc_state != state) { const char *carp_states[] = { CARP_STATES }; char subsys[IFNAMSIZ+5]; snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, sc->sc_carpdev->if_xname); CARP_LOG("%s: %s -> %s (%s)\n", subsys, carp_states[sc->sc_state], carp_states[state], reason); sc->sc_state = state; devctl_notify("CARP", subsys, carp_states[state], NULL); } } static void carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); carp_sc_state(sc); CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); } static void carp_sc_state(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP)) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); #endif #ifdef INET6 callout_stop(&sc->sc_md6_tmo); #endif carp_set_state(sc, INIT, "hardware interface down"); carp_setrun(sc, 0); if (!sc->sc_suppress) carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { carp_set_state(sc, INIT, "hardware interface up"); carp_setrun(sc, 0); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } } static void carp_demote_adj(int adj, char *reason) { atomic_add_int(&V_carp_demotion, adj); CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); } static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_demotion; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); carp_demote_adj(new, "sysctl"); return (0); } #ifdef INET extern struct domain inetdomain; static struct protosw in_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif #ifdef INET6 extern struct domain inet6domain; static struct protosw in6_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; #endif static void carp_mod_cleanup(void) { #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET] = -1; } carp_iamatch_p = NULL; #endif #ifdef INET6 if (proto_reg[CARP_INET6] == 0) { (void)ip6proto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET6] = -1; } carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif carp_ioctl_p = NULL; carp_attach_p = NULL; carp_detach_p = NULL; carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; carp_demote_adj_p = NULL; carp_master_p = NULL; mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); sx_destroy(&carp_sx); } static int carp_mod_load(void) { int err; mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); sx_init(&carp_sx, "carp_sx"); LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; carp_linkstate_p = carp_linkstate; carp_ioctl_p = carp_ioctl; carp_attach_p = carp_attach; carp_detach_p = carp_detach; carp_demote_adj_p = carp_demote_adj; carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); return (proto_reg[CARP_INET6]); } err = ip6proto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); return (err); } #endif #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); return (proto_reg[CARP_INET]); } err = ipproto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); return (err); } #endif return (0); } static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: mtx_lock(&carp_mtx); if (LIST_EMPTY(&carp_list)) carp_mod_cleanup(); else { mtx_unlock(&carp_mtx); return (EBUSY); } break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); Index: head/sys/netinet6/in6.c =================================================================== --- head/sys/netinet6/in6.c (revision 333174) +++ head/sys/netinet6/in6.c (revision 333175) @@ -1,2524 +1,2551 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * struct in6_ifreq and struct ifreq must be type punnable for common members * of ifr_ifru to allow accessors to be shared. */ _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) == offsetof(struct ifreq, ifr_ifru), "struct in6_ifreq and struct ifreq are not type punnable"); VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; int fibnum; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ia62ifa(ia)->ifa_ifp->if_fib; /* Announce arrival of local address to this FIB. */ rt_newaddrmsg_fib(cmd, &ia->ia_ifa, 0, &rt, fibnum); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) { if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, true); ifa_free(&ia->ia_ifa); } if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa, false); goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_addrcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } nd6_prefix_rele(pr); /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_addrcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr != NULL && pr->ndpr_addrcnt == 0) { ND6_WLOCK(); nd6_prefix_unlink(pr, NULL); ND6_WUNLOCK(); nd6_prefix_del(pr); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } +static struct in6_multi_mship * +in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr, + int *errorp, int delay) +{ + struct in6_multi_mship *imm; + int error; + + imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); + if (imm == NULL) { + *errorp = ENOBUFS; + return (NULL); + } + + delay = (delay * PR_FASTHZ) / hz; + + error = in6_joingroup(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); + if (error) { + *errorp = error; + free(imm, M_IP6MADDR); + return (NULL); + } + + return (imm); +} /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } - imm = in6_joingroup(ifp, &mltaddr, &error, delay); + imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ - imm = in6_joingroup(ifp, &mltaddr, &error, 0); + imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ - imm = in6_joingroup(ifp, &mltaddr, &error, delay); + imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { - imm = in6_joingroup(ifp, &mltaddr, &error, delay); + imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ - imm = in6_joingroup(ifp, &mltaddr, &error, 0); + imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa, false); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); - in6_leavegroup(imm); + if (imm->i6mm_maddr != NULL) + in6_leavegroup(imm->i6mm_maddr, NULL); + free(imm, M_IP6MADDR); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6 ? RTF_HOST : 0)); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_addrcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_addrcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) return (error); } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) return (error); ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the interface address corresponding to a given IPv6 address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionally, users may try to assign addresses * before the interface becomes up (or running). * This function returns EAGAIN in that case. * The caller should mark "tentative" on the address instead of * performing DAD immediately. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (EAGAIN); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_BRIDGE: /* bridge(4) only does Ethernet-like links */ case IFT_INFINIBAND: return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in6_lltable_destroy_lle_unlocked(struct llentry *lle) { LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); in6_lltable_destroy_lle_unlocked(lle); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6; struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; int error; char ip6buf[INET6_ADDRSTRLEN]; int fibnum; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); sin6 = (const struct sockaddr_in6 *)l3addr; in6_splitscope(&sin6->sin6_addr, &dst, &scopeid); fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib; error = fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6); if (error != 0 || (nh6.nh_flags & NHF_GATEWAY) || nh6.nh_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { ifa_free(ifa); return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &sin6->sin6_addr)); return EINVAL; } return 0; } /* * Called by the datapath to indicate that the entry was used. */ static void in6_lltable_mark_used(struct llentry *lle) { LLE_REQ_LOCK(lle); lle->r_skip_req = 0; /* * Set the hit time so the callback function * can determine the remaining time before * transiting to the DELAY state. */ lle->lle_hittime = time_uptime; LLE_REQ_UNLOCK(lle); } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { in6_lltable_destroy_lle_unlocked(lle); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", flags)); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } if (lle->la_expire != 0) ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; if (lle->ln_router != 0) ndpc.rtm.rtm_flags |= RTF_GATEWAY; ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked; /* Store state in rmx_weight value */ ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; llt->llt_mark_used = in6_lltable_mark_used; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return ifp->if_mtu; return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ifp, ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: head/sys/netinet6/in6_ifattach.c =================================================================== --- head/sys/netinet6/in6_ifattach.c (revision 333174) +++ head/sys/netinet6/in6_ifattach.c (revision 333175) @@ -1,905 +1,901 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DEFINE(unsigned long, in6_maxmtu) = 0; #ifdef IP6_AUTO_LINKLOCAL VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL; #else VNET_DEFINE(int, ip6_auto_linklocal) = 1; /* enabled by default */ #endif VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch); #define V_in6_tmpaddrtimer_ch VNET(in6_tmpaddrtimer_ch) VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *); static int in6_ifattach_loopback(struct ifnet *); static void in6_purgemaddrs(struct ifnet *); #define EUI64_GBIT 0x01 #define EUI64_UBIT 0x02 #define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0) #define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT) #define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6)) #define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT) #define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6)) #define IFID_LOCAL(in6) (!EUI64_LOCAL(in6)) #define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6)) /* * Generate a last-resort interface identifier, when the machine has no * IEEE802/EUI64 address sources. * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. * * in6 - upper 64bits are preserved */ static int get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6) { MD5_CTX ctxt; struct prison *pr; u_int8_t digest[16]; int hostnamelen; pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); hostnamelen = strlen(pr->pr_hostname); #if 0 /* we need at least several letters as seed for ifid */ if (hostnamelen < 3) { mtx_unlock(&pr->pr_mtx); return -1; } #endif /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, pr->pr_hostname, hostnamelen); mtx_unlock(&pr->pr_mtx); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ bcopy(digest, &in6->s6_addr[8], 8); /* make sure to set "u" bit to local, and "g" bit to individual. */ in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */ in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */ /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); return 0; } static int generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret) { MD5_CTX ctxt; u_int8_t seed[16], digest[16], nullbuf[8]; u_int32_t val32; /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; for (i = 0; i < 2; i++) { val32 = arc4random(); bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32)); } } else bcopy(seed0, seed, 8); /* copy the right-most 64-bits of the given address */ /* XXX assumption on the size of IFID */ bcopy(seed1, &seed[8], 8); if (0) { /* for debugging purposes only */ int i; printf("generate_tmp_ifid: new randomized ID from: "); for (i = 0; i < 16; i++) printf("%02x", seed[i]); printf(" "); } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, seed, sizeof(seed)); MD5Final(digest, &ctxt); /* * RFC 3041 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ bcopy(digest, ret, 8); ret[0] &= ~EUI64_UBIT; /* * XXX: we'd like to ensure that the generated value is not zero * for simplicity. If the caclculated digest happens to be zero, * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { nd6log((LOG_INFO, "generate_tmp_ifid: computed MD5 value is zero.\n")); val32 = arc4random(); val32 = 1 + (val32 % (0xffffffff - 1)); } /* * RFC 3041 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); if (0) { /* for debugging purposes only */ int i; printf("to: "); for (i = 0; i < 16; i++) printf("%02x", digest[i]); printf("\n"); } return 0; } /* * Get interface identifier for the specified interface. * XXX assumes single sockaddr_dl (AF_LINK address) per an interface * * in6 - upper 64bits are preserved */ int in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6) { struct ifaddr *ifa; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) continue; if (sdl->sdl_alen == 0) continue; goto found; } IF_ADDR_RUNLOCK(ifp); return -1; found: IF_ADDR_LOCK_ASSERT(ifp); addr = LLADDR(sdl); addrlen = sdl->sdl_alen; /* get EUI64 */ switch (ifp->if_type) { case IFT_BRIDGE: case IFT_ETHER: case IFT_L2VLAN: case IFT_ATM: case IFT_IEEE1394: /* IEEE802/EUI64 cases - what others? */ /* IEEE1394 uses 16byte length address starting with EUI64 */ if (addrlen > 8) addrlen = 8; /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) { IF_ADDR_RUNLOCK(ifp); return -1; } /* * check for invalid MAC address - on bsdi, we see it a lot * since wildboar configures all-zero MAC on pccard before * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } if (bcmp(addr, allone, addrlen) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } /* make EUI64 address */ if (addrlen == 8) bcopy(addr, &in6->s6_addr[8], 8); else if (addrlen == 6) { in6->s6_addr[8] = addr[0]; in6->s6_addr[9] = addr[1]; in6->s6_addr[10] = addr[2]; in6->s6_addr[11] = 0xff; in6->s6_addr[12] = 0xfe; in6->s6_addr[13] = addr[3]; in6->s6_addr[14] = addr[4]; in6->s6_addr[15] = addr[5]; } break; case IFT_GIF: case IFT_STF: /* * RFC2893 says: "SHOULD use IPv4 address as ifid source". * however, IPv4 address is not very suitable as unique * identifier source (can be renumbered). * we don't do this. */ IF_ADDR_RUNLOCK(ifp); return -1; default: IF_ADDR_RUNLOCK(ifp); return -1; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) { IF_ADDR_RUNLOCK(ifp); return -1; } /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); /* * sanity check: ifid must not be all zero, avoid conflict with * subnet router anycast */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { IF_ADDR_RUNLOCK(ifp); return -1; } IF_ADDR_RUNLOCK(ifp); return 0; } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. * * altifp - secondary EUI64 source */ static int get_ifid(struct ifnet *ifp0, struct ifnet *altifp, struct in6_addr *in6) { struct ifnet *ifp; /* first, try to get it from the interface itself */ if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. this basically is for ATM PVC */ if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; } /* next, try to get it from some other hardware interface */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp == ifp0) continue; if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* * to borrow ifid from other interface, ifid needs to be * globally unique */ if (IFID_UNIVERSAL(in6)) { nd6log((LOG_DEBUG, "%s: borrow interface identifier from %s\n", if_name(ifp0), if_name(ifp))); IFNET_RUNLOCK_NOSLEEP(); goto success; } } IFNET_RUNLOCK_NOSLEEP(); /* last resort: get from random number source */ if (get_rand_ifid(ifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: interface identifier generated by random number\n", if_name(ifp0))); goto success; } printf("%s: failed to get interface identifier\n", if_name(ifp0)); return -1; success: nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10], in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13], in6->s6_addr[14], in6->s6_addr[15])); return 0; } /* * altifp - secondary EUI64 source */ static int in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; struct in6_aliasreq ifra; struct nd_prefixctl pr0; struct nd_prefix *pr; int error; /* * configure link-local address. */ in6_prepare_ifra(&ifra, NULL, &in6mask64); ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1); } else { if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, "%s: no ifid available\n", if_name(ifp))); return (-1); } } if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) return (-1); /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * Now call in6_update_ifa() to do a bunch of procedures to configure * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ if ((error = in6_update_ifa(ifp, &ifra, NULL, IN6_IFAUPDATE_DADDELAY)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just * suppress it. (jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", if_name(ifp), error)); return (-1); } ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ KASSERT(ia != NULL, ("%s: ia == NULL, ifp=%p", __func__, ifp)); ifa_free(&ia->ia_ifa); /* * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); pr0.ndpr_prefix = ifra.ifra_addr; /* apply the mask for safety. (nd6_prelist_add will apply it again) */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64); /* * Initialize parameters. The link-local prefix must always be * on-link, and its lifetimes never expire. */ pr0.ndpr_raf_onlink = 1; pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. * For example, if we first remove the (only) existing link-local * address, and then reconfigure another one, the prefix is still * valid with referring to the old link-local address. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0) return (error); } else nd6_prefix_rele(pr); return 0; } /* * ifp - must be IFT_LOOP */ static int in6_ifattach_loopback(struct ifnet *ifp) { struct in6_aliasreq ifra; int error; in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128); /* * Always initialize ia_dstaddr (= broadcast address) to loopback * address. Follows IPv4 practice - see in_ifinit(). */ ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_dstaddr.sin6_family = AF_INET6; ifra.ifra_dstaddr.sin6_addr = in6addr_loopback; /* the loopback address should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; /* * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. */ if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) { nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", if_name(ifp), error)); return (-1); } return 0; } /* * compute NI group address, based on the current hostname setting. * see RFC 4620. * * when ifp == NULL, the caller is responsible for filling scopeid. * * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620. */ static int in6_nigroup0(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6, int oldmcprefix) { struct prison *pr; const char *p; u_char *q; MD5_CTX ctxt; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ /* * If no name is given and namelen is -1, * we try to do the hostname lookup ourselves. */ if (!name && namelen == -1) { pr = curthread->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); name = pr->pr_hostname; namelen = strlen(name); } else pr = NULL; if (!name || !namelen) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; if (p == name || p - name > sizeof(n) - 1) { if (pr != NULL) mtx_unlock(&pr->pr_mtx); return -1; /* label too long */ } l = p - name; strncpy(n, name, l); if (pr != NULL) mtx_unlock(&pr->pr_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') *q = *q - 'A' + 'a'; } /* generate 16 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); MD5Update(&ctxt, &l, sizeof(l)); MD5Update(&ctxt, n, l); MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; if (oldmcprefix == 0) { in6->s6_addr8[12] = 0xff; /* Copy the first 24 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr8[13], 3); } else { /* Copy the first 32 bits of 128-bit hash into the address. */ bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); } if (in6_setscope(in6, ifp, NULL)) return (-1); /* XXX: should not fail */ return 0; } int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 0)); } int in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { return (in6_nigroup0(ifp, name, namelen, in6, 1)); } /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. * XXX multiple link-local address case * * altifp - secondary EUI64 source */ void in6_ifattach(struct ifnet *ifp, struct ifnet *altifp) { struct in6_ifaddr *ia; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * quirks based on interface type */ switch (ifp->if_type) { case IFT_STF: /* * 6to4 interface is a very special kind of beast. * no multicast, no linklocal. RFC2529 specifies how to make * linklocals for 6to4 interface, but there's no use and * it is rather harmful to have one. */ ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL; break; default: break; } /* * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { nd6log((LOG_INFO, "in6_ifattach: " "%s is not multicast capable, IPv6 not enabled\n", if_name(ifp))); return; } /* * assign loopback address for loopback interface. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* * check that loopback address doesn't exist yet. */ ia = in6ifa_ifwithaddr(&in6addr_loopback, 0); if (ia == NULL) in6_ifattach_loopback(ifp); else ifa_free(&ia->ia_ifa); } /* * assign a link-local address, if there's none. */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) { ia = in6ifa_ifpforlinklocal(ifp, 0); if (ia == NULL) in6_ifattach_linklocal(ifp, altifp); else ifa_free(&ia->ia_ifa); } /* update dynamically. */ if (V_in6_maxmtu < ifp->if_mtu) V_in6_maxmtu = ifp->if_mtu; } /* * NOTE: in6_ifdetach() does not support loopback if at this moment. * * When shutting down a VNET we clean up layers top-down. In that case * upper layer protocols (ulp) are cleaned up already and locks are destroyed * and we must not call into these cleanup functions anymore, thus purgeulp * is set to 0 in that case by in6_ifdetach_destroy(). * The normal case of destroying a (cloned) interface still needs to cleanup * everything related to the interface and will have purgeulp set to 1. */ static void _in6_ifdetach(struct ifnet *ifp, int purgeulp) { struct ifaddr *ifa, *next; if (ifp->if_afdata[AF_INET6] == NULL) return; /* * nuke any of IPv6 addresses we have * XXX: all addresses should be already removed */ TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } if (purgeulp) { in6_pcbpurgeif0(&V_udbinfo, ifp); in6_pcbpurgeif0(&V_ulitecbinfo, ifp); in6_pcbpurgeif0(&V_ripcbinfo, ifp); } /* leave from all multicast groups joined */ in6_purgemaddrs(ifp); /* * Remove neighbor management table. * Enabling the nd6_purge will panic on vmove for interfaces on VNET * teardown as the IPv6 layer is cleaned up already and the locks * are destroyed. */ if (purgeulp) nd6_purge(ifp); } void in6_ifdetach(struct ifnet *ifp) { _in6_ifdetach(ifp, 1); } void in6_ifdetach_destroy(struct ifnet *ifp) { _in6_ifdetach(ifp, 0); } int in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf, const u_int8_t *baseid, int generate) { u_int8_t nullbuf[8]; struct nd_ifinfo *ndi = ND_IFINFO(ifp); bzero(nullbuf, sizeof(nullbuf)); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) { /* we've never created a random ID. Create a new one. */ generate = 1; } if (generate) { bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1)); /* generate_tmp_ifid will update seedn and buf */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } bcopy(ndi->randomid, retbuf, 8); return (0); } void in6_tmpaddrtimer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *ndi; u_int8_t nullbuf[8]; struct ifnet *ifp; callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); bzero(nullbuf, sizeof(nullbuf)); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; ndi = ND_IFINFO(ifp); if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* * We've been generating a random ID on this interface. * Create a new one. */ (void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1, ndi->randomid); } } CURVNET_RESTORE(); } static void in6_purgemaddrs(struct ifnet *ifp) { - LIST_HEAD(,in6_multi) purgeinms; - struct in6_multi *inm, *tinm; + struct in6_multi_head purgeinms; + struct in6_multi *inm; struct ifmultiaddr *ifma; - LIST_INIT(&purgeinms); + SLIST_INIT(&purgeinms); IN6_MULTI_LOCK(); - + IN6_MULTI_LIST_LOCK(); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; - LIST_INSERT_HEAD(&purgeinms, inm, in6m_entry); + in6m_rele_locked(&purgeinms, inm); } IF_ADDR_RUNLOCK(ifp); - - LIST_FOREACH_SAFE(inm, &purgeinms, in6m_entry, tinm) { - LIST_REMOVE(inm, in6m_entry); - in6m_release_locked(inm); - } mld_ifdetach(ifp); - + IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); + in6m_release_list_deferred(&purgeinms); } void in6_ifattach_destroy(void) { callout_drain(&V_in6_tmpaddrtimer_ch); } static void in6_ifattach_init(void *dummy) { /* Timer for regeneranation of temporary addresses randomize ID. */ callout_init(&V_in6_tmpaddrtimer_ch, 0); callout_reset(&V_in6_tmpaddrtimer_ch, (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor - V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet); } /* * Cheat. * This must be after route_init(), which is now SI_ORDER_THIRD. */ SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, in6_ifattach_init, NULL); Index: head/sys/netinet6/in6_mcast.c =================================================================== --- head/sys/netinet6/in6_mcast.c (revision 333174) +++ head/sys/netinet6/in6_mcast.c (revision 333175) @@ -1,2833 +1,2898 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv6 multicast socket, group, and socket option processing module. * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include +#include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include + #include +#include #include +#include +#include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in6 sin6; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter", "IPv6 multicast PCB-layer source filter"); -static MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); +MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options"); static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource", "IPv6 multicast MLD-layer source filter"); RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: * - Lock order is: Giant, INP_WLOCK, IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. * * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly * any need for in6_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ -struct mtx in6_multi_mtx; -MTX_SYSINIT(in6_multi_mtx, &in6_multi_mtx, "in6_multi_mtx", MTX_DEF); +struct mtx in6_multi_list_mtx; +MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF); +struct mtx in6_multi_free_mtx; +MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF); + +struct sx in6_multi_sx; +SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx"); + + + static void im6f_commit(struct in6_mfilter *); static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **); static struct in6_msource * im6f_graft(struct in6_mfilter *, const uint8_t, const struct sockaddr_in6 *); static void im6f_leave(struct in6_mfilter *); static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); static int im6o_grow(struct ip6_moptions *); static size_t im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * im6o_match_source(const struct ip6_moptions *, const size_t, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); -static int in6_mc_get(struct ifnet *, const struct in6_addr *, +static int in6_getmulti(struct ifnet *, const struct in6_addr *, struct in6_multi **); static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims); #ifdef KTR static int in6m_is_ifp_detached(const struct in6_multi *); #endif static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); static void in6m_purge(struct in6_multi *); static void in6m_reap(struct in6_multi *); static struct ip6_moptions * in6p_findmoptions(struct inpcb *); static int in6p_get_source_filters(struct inpcb *, struct sockopt *); static int in6p_join_group(struct inpcb *, struct sockopt *); static int in6p_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in6 *); static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); static int in6p_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv6 multicast"); static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0, "Max source filters per socket"); /* TODO Virtualize this switch. */ int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline in6m_is_ifp_detached(const struct in6_multi *inm) { struct ifnet *ifp; KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that network-layer notion of ifp is the * same as that of link-layer. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in6_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void im6f_init(struct in6_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in6_mfilter)); RB_INIT(&imf->im6f_sources); imf->im6f_st[0] = st0; imf->im6f_st[1] = st1; } /* * Resize the ip6_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. */ static int im6o_grow(struct ip6_moptions *imo) { struct in6_multi **nmships; struct in6_multi **omships; struct in6_mfilter *nmfilters; struct in6_mfilter *omfilters; size_t idx; size_t newmax; size_t oldmax; nmships = NULL; nmfilters = NULL; omships = imo->im6o_membership; omfilters = imo->im6o_mfilters; oldmax = imo->im6o_max_memberships; newmax = ((oldmax + 1) * 2) - 1; if (newmax <= IPV6_MAX_MEMBERSHIPS) { nmships = (struct in6_multi **)realloc(omships, sizeof(struct in6_multi *) * newmax, M_IP6MOPTS, M_NOWAIT); nmfilters = (struct in6_mfilter *)realloc(omfilters, sizeof(struct in6_mfilter) * newmax, M_IN6MFILTER, M_NOWAIT); if (nmships != NULL && nmfilters != NULL) { /* Initialize newly allocated source filter heads. */ for (idx = oldmax; idx < newmax; idx++) { im6f_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); } imo->im6o_max_memberships = newmax; imo->im6o_membership = nmships; imo->im6o_mfilters = nmfilters; } } if (nmships == NULL || nmfilters == NULL) { if (nmships != NULL) free(nmships, M_IP6MOPTS); if (nmfilters != NULL) free(nmfilters, M_IN6MFILTER); return (ETOOMANYREFS); } return (0); } /* * Find an IPv6 multicast group entry for this ip6_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static size_t im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; struct in6_multi **pinm; int idx; int nmships; gsin6 = (const struct sockaddr_in6 *)group; /* The im6o_membership array may be lazy allocated. */ if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) return (-1); nmships = imo->im6o_num_memberships; pinm = &imo->im6o_membership[0]; for (idx = 0; idx < nmships; idx++, pinm++) { if (*pinm == NULL) continue; if ((ifp == NULL || ((*pinm)->in6m_ifp == ifp)) && IN6_ARE_ADDR_EQUAL(&(*pinm)->in6m_addr, &gsin6->sin6_addr)) { break; } } if (idx >= nmships) idx = -1; return (idx); } /* * Find an IPv6 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * XXX TODO: The scope ID, if present in src, is stripped before * any comparison. We SHOULD enforce scope/zone checks where the source * filter entry has a link scope. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in6_msource * im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, const struct sockaddr *src) { struct ip6_msource find; struct in6_mfilter *imf; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); KASSERT(gidx != -1 && gidx < imo->im6o_num_memberships, ("%s: invalid index %d\n", __func__, (int)gidx)); /* The im6o_mfilters array may be lazy allocated. */ if (imo->im6o_mfilters == NULL) return (NULL); imf = &imo->im6o_mfilters[gidx]; psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); return ((struct in6_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { size_t gidx; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); gidx = im6o_match_group(imo, ifp, group); if (gidx == -1) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ mode = imo->im6o_mfilters[gidx].im6f_st[1]; ims = im6o_match_source(imo, gidx, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in6_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN6_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int -in6_mc_get(struct ifnet *ifp, const struct in6_addr *group, +in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; int error; error = 0; /* * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK; * if_addmulti() takes this mutex itself, so we must drop and * re-acquire around the call. */ IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); - inm = in6m_lookup_locked(ifp, group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, inm->in6m_refcount)); - ++inm->in6m_refcount; + in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } memset(&gsin6, 0, sizeof(gsin6)); gsin6.sin6_family = AF_INET6; gsin6.sin6_len = sizeof(struct sockaddr_in6); gsin6.sin6_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ + IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); if (error != 0) return (error); + IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet6 is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in6_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET6, ("%s: ifma not AF_INET6", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp || !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)) panic("%s: ifma %p is inconsistent with %p (%p)", __func__, ifma, inm, group); #endif - ++inm->in6m_refcount; + in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in6_multi record is needed; allocate and initialize it. * We DO NOT perform an MLD join as the in6_ layer may need to * push an initial source list down to MLD to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. * Pending state-changes per group are subject to a bounds check. */ inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { + IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->in6m_addr = *group; inm->in6m_ifp = ifp; inm->in6m_mli = MLD_IFINFO(ifp); inm->in6m_ifma = ifma; inm->in6m_refcount = 1; inm->in6m_state = MLD_NOT_MEMBER; mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES); inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->in6m_srcs); ifma->ifma_protospec = inm; *pinm = inm; -out_locked: + out_locked: + IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); return (error); } /* * Drop a reference to an in6_multi record. * * If the refcount drops to 0, free the in6_multi record and * delete the underlying link-layer membership. */ -void -in6m_release_locked(struct in6_multi *inm) +static void +in6m_release(struct in6_multi *inm) { struct ifmultiaddr *ifma; + struct ifnet *ifp; - IN6_MULTI_LOCK_ASSERT(); - CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount); - if (--inm->in6m_refcount > 0) { - CTR2(KTR_MLD, "%s: refcount is now %d", __func__, - inm->in6m_refcount); - return; - } - + MPASS(inm->in6m_refcount == 0); CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm); ifma = inm->in6m_ifma; + ifp = inm->in6m_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma); - KASSERT(ifma->ifma_protospec == inm, - ("%s: ifma_protospec != inm", __func__)); - ifma->ifma_protospec = NULL; + KASSERT(ifma->ifma_protospec == NULL, + ("%s: ifma_protospec != NULL", __func__)); + if (ifp) + CURVNET_SET(ifp->if_vnet); in6m_purge(inm); - free(inm, M_IP6MADDR); if_delmulti_ifma(ifma); + if (ifp) + CURVNET_RESTORE(); } +static struct grouptask free_gtask; +static struct in6_multi_head in6m_free_list; +static void in6m_release_task(void *arg __unused); +static void in6m_init(void) +{ + SLIST_INIT(&in6m_free_list); + taskqgroup_config_gtask_init(NULL, &free_gtask, in6m_release_task, "in6m release task"); +} + +SYSINIT(in6m_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, + in6m_init, NULL); + + +void +in6m_release_list_deferred(struct in6_multi_head *inmh) +{ + if (SLIST_EMPTY(inmh)) + return; + mtx_lock(&in6_multi_free_mtx); + SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele); + mtx_unlock(&in6_multi_free_mtx); + GROUPTASK_ENQUEUE(&free_gtask); +} + +void +in6m_release_deferred(struct in6_multi *inm) +{ + struct in6_multi_head tmp; + struct ifnet *ifp; + struct ifaddr *ifa; + struct in6_ifaddr *ifa6; + struct in6_multi_mship *imm; + + IN6_MULTI_LIST_LOCK_ASSERT(); + KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm)); + if (--inm->in6m_refcount == 0) { + ifp = inm->in6m_ifp; + IF_ADDR_LOCK_ASSERT(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + ifa6 = (void *)ifa; + LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) { + if (inm == imm->i6mm_maddr) + imm->i6mm_maddr = NULL; + } + } + SLIST_INIT(&tmp); + inm->in6m_ifma->ifma_protospec = NULL; + SLIST_INSERT_HEAD(&tmp, inm, in6m_nrele); + in6m_release_list_deferred(&tmp); + } +} + +static void +in6m_release_task(void *arg __unused) +{ + struct in6_multi_head in6m_free_tmp; + struct in6_multi *inm, *tinm; + + SLIST_INIT(&in6m_free_tmp); + mtx_lock(&in6_multi_free_mtx); + SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele); + mtx_unlock(&in6_multi_free_mtx); + IN6_MULTI_LOCK(); + SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) { + SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele); + in6m_release(inm); + } + IN6_MULTI_UNLOCK(); +} + /* * Clear recorded source entries for a group. * Used by the MLD code. Caller must hold the IN6_MULTI lock. * FIXME: Should reap. */ void in6m_clear_recorded(struct in6_multi *inm) { struct ip6_msource *ims; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { if (ims->im6s_stp) { ims->im6s_stp = 0; --inm->in6m_st[1].iss_rec; } } KASSERT(inm->in6m_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group MLDv2 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet6.mld.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) { struct ip6_msource find; struct ip6_msource *ims, *nims; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims && ims->im6s_stp) return (0); if (ims == NULL) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->im6s_addr = find.im6s_addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->im6s_stp; ++inm->in6m_st[1].iss_rec; return (1); } /* * Return a pointer to an in6_msource owned by an in6_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * addr is the source address. * * SMPng: May be called with locks held; malloc must not block. */ static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **plims) { struct ip6_msource find; struct ip6_msource *ims, *nims; struct in6_msource *lims; int error; error = 0; ims = NULL; lims = NULL; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); lims = (struct in6_msource *)ims; if (lims == NULL) { if (imf->im6f_nsrc == in6_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in6_msource *)nims; lims->im6s_addr = find.im6s_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in6_msource * im6f_graft(struct in6_mfilter *imf, const uint8_t st1, const struct sockaddr_in6 *psin) { struct ip6_msource *nims; struct in6_msource *lims; nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in6_msource *)nims; lims->im6s_addr = psin->sin6_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; lims->im6sl_st[1] = st1; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) { struct ip6_msource find; struct ip6_msource *ims; struct in6_msource *lims; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void im6f_rollback(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) { /* no change at t1 */ continue; } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->im6sl_st[1] = lims->im6sl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } imf->im6f_st[1] = imf->im6f_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void im6f_leave(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; } imf->im6f_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void im6f_commit(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[0] = lims->im6sl_st[1]; } imf->im6f_st[0] = imf->im6f_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void im6f_reap(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && (lims->im6sl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_MLD, "%s: free lims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } } /* * Purge socket-layer filter set. */ static void im6f_purge(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * addr is the IPv6 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims) { struct ip6_msource find; struct ip6_msource *ims, *nims; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims == NULL && !noalloc) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->im6s_addr = *addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; CTR3(KTR_MLD, "%s: allocated %s as %p", __func__, ip6_sprintf(ip6tbuf, addr), ims); } *pims = ims; return (0); } /* * Merge socket-layer source into MLD-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; ip6_sprintf(ip6tbuf, &lims->im6s_addr); #endif if (lims->im6sl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex -= n; } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in -= n; } if (lims->im6sl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex += n; } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in += n; } } /* * Atomically update the global in6_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct ip6_msource *ims, *nims; struct in6_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; + IN6_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); ++schanged; if (error) break; im6s_merge(nims, lims, 0); } if (error) { struct ip6_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims); if (bims == NULL) continue; im6s_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->im6f_st[0] == imf->im6f_st[1] && imf->im6f_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->im6f_st[0] != imf->im6f_st[1]) { CTR3(KTR_MLD, "%s: imf transition %d to %d", __func__, imf->im6f_st[0], imf->im6f_st[1]); if (imf->im6f_st[0] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__); --inm->in6m_st[1].iss_ex; } else if (imf->im6f_st[0] == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } if (imf->im6f_st[1] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__); inm->in6m_st[1].iss_ex++; } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__); inm->in6m_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the MLD lifecycle for this group should finish. */ if (inm->in6m_st[1].iss_ex > 0) { CTR1(KTR_MLD, "%s: transition to EX", __func__); inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->in6m_st[1].iss_in > 0) { CTR1(KTR_MLD, "%s: transition to IN", __func__); inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_MLD, "%s: transition to UNDEF", __func__); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->im6f_st[1] != MCAST_EXCLUDE) || (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__); --inm->in6m_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__); inm->in6m_st[1].iss_asm++; } CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm); in6m_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_MLD, "%s: sources changed; reaping", __func__); in6m_reap(inm); } return (error); } /* * Mark an in6_multi's filter set deltas as committed. * Called by MLD after a state change has been enqueued. */ void in6m_commit(struct in6_multi *inm) { struct ip6_msource *ims; CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm); CTR1(KTR_MLD, "%s: pre commit:", __func__); in6m_print(inm); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { ims->im6s_st[0] = ims->im6s_st[1]; } inm->in6m_st[0] = inm->in6m_st[1]; } /* * Reap unreferenced nodes from an in6_multi's filter set. */ static void in6m_reap(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || ims->im6s_stp != 0) continue; CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } } /* * Purge all source nodes from an in6_multi's filter set. */ static void in6m_purge(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } /* Free state-change requests that might be queued. */ mbufq_drain(&inm->in6m_scq); } /* * Join a multicast address w/o sources. * KAME compatibility entry point. * * SMPng: Assume no mc locks held by caller. */ -struct in6_multi_mship * -in6_joingroup(struct ifnet *ifp, struct in6_addr *mcaddr, - int *errorp, int delay) -{ - struct in6_multi_mship *imm; - int error; - - imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT); - if (imm == NULL) { - *errorp = ENOBUFS; - return (NULL); - } - - delay = (delay * PR_FASTHZ) / hz; - - error = in6_mc_join(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); - if (error) { - *errorp = error; - free(imm, M_IP6MADDR); - return (NULL); - } - - return (imm); -} - -/* - * Leave a multicast address w/o sources. - * KAME compatibility entry point. - * - * SMPng: Assume no mc locks held by caller. - */ int -in6_leavegroup(struct in6_multi_mship *imm) -{ - - if (imm->i6mm_maddr != NULL) - in6_mc_leave(imm->i6mm_maddr, NULL); - free(imm, M_IP6MADDR); - return 0; -} - -/* - * Join a multicast group; unlocked entry point. - * - * SMPng: XXX: in6_mc_join() is called from in6_control() when upper - * locks are not held. Fortunately, ifp is unlikely to have been detached - * at this point, so we assume it's OK to recurse. - */ -int -in6_mc_join(struct ifnet *ifp, const struct in6_addr *mcaddr, +in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { int error; IN6_MULTI_LOCK(); - error = in6_mc_join_locked(ifp, mcaddr, imf, pinm, delay); + error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay); IN6_MULTI_UNLOCK(); - return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the MLD downcall fails, the group is not joined, and an error * code is returned. */ int -in6_mc_join_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, +in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { struct in6_mfilter timf; struct in6_multi *inm; + struct ifmultiaddr *ifma; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif #ifdef INVARIANTS /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. */ KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr), ("%s: not a multicast address", __func__)); if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } #endif IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__, ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp)); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } - - error = in6_mc_get(ifp, mcaddr, &inm); + error = in6_getmulti(ifp, mcaddr, &inm); if (error) { - CTR1(KTR_MLD, "%s: in6_mc_get() failure", __func__); + CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__); return (error); } + IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); goto out_in6m_release; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, delay); if (error) { CTR1(KTR_MLD, "%s: failed to update source", __func__); goto out_in6m_release; } out_in6m_release: if (error) { CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); - in6m_release_locked(inm); + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_protospec == inm) { + ifma->ifma_protospec = NULL; + break; + } + } + in6m_release_deferred(inm); + IF_ADDR_RUNLOCK(ifp); } else { *pinm = inm; } - + IN6_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int -in6_mc_leave(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) +in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { int error; IN6_MULTI_LOCK(); - error = in6_mc_leave_locked(inm, imf); + error = in6_leavegroup_locked(inm, imf); IN6_MULTI_UNLOCK(); - return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as in6m_release(*) as this function also * makes a state change downcall into MLD. */ int -in6_mc_leave_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) +in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct in6_mfilter timf; + struct ifnet *ifp; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif error = 0; IN6_MULTI_LOCK_ASSERT(); CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr), (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at MLD layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ + + ifp = inm->in6m_ifp; + IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); - in6m_release_locked(inm); + if (ifp) + IF_ADDR_RLOCK(ifp); + in6m_release_deferred(inm); + if (ifp) + IF_ADDR_RUNLOCK(ifp); + IN6_MULTI_LIST_UNLOCK(); return (error); } + /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An MLD downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; size_t idx; uint16_t fmode; int error, doblock; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } KASSERT(imo->im6o_mfilters != NULL, ("%s: im6o_mfilters not allocated", __func__)); imf = &imo->im6o_mfilters[idx]; inm = imo->im6o_membership[idx]; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->im6f_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = im6o_match_source(imo, idx, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_MLD, "%s: %s source", __func__, "block"); ims = im6f_graft(imf, fmode, &ssa->sin6); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); error = im6f_prune(imf, &ssa->sin6); } if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_im6f_rollback; } /* * Begin state merge transaction at MLD layer. */ - IN6_MULTI_LOCK(); - + IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; struct in6_multi **immp; struct in6_mfilter *imfp; size_t idx; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) return (inp->in6p_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); immp = malloc(sizeof(*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS, M_WAITOK | M_ZERO); imfp = malloc(sizeof(struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS, M_IN6MFILTER, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; imo->im6o_num_memberships = 0; imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; imo->im6o_membership = immp; /* Initialize per-group source filters. */ for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++) im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); imo->im6o_mfilters = imfp; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { free(imfp, M_IN6MFILTER); free(immp, M_IP6MOPTS); free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } inp->in6p_moptions = imo; return (imo); } /* * Discard the IPv6 multicast options (and source filters). * * SMPng: NOTE: assumes INP write lock is held. */ void ip6_freemoptions(struct ip6_moptions *imo) { struct in6_mfilter *imf; size_t idx, nmships; - KASSERT(imo != NULL, ("%s: ip6_moptions is NULL", __func__)); + if (imo == NULL) + return; nmships = imo->im6o_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->im6o_mfilters ? &imo->im6o_mfilters[idx] : NULL; if (imf) im6f_leave(imf); /* XXX this will thrash the lock(s) */ - (void)in6_mc_leave(imo->im6o_membership[idx], imf); + (void)in6_leavegroup(imo->im6o_membership[idx], imf); if (imf) im6f_purge(imf); } if (imo->im6o_mfilters) free(imo->im6o_mfilters, M_IN6MFILTER); free(imo->im6o_membership, M_IP6MOPTS); free(imo, M_IP6MOPTS); } /* * Atomically get source filters on a socket for an IPv6 multicast group. * Called with INP lock held; returns with lock released. */ static int in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip6_moptions *imo; struct in6_mfilter *imf; struct ip6_msource *ims; struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t idx, nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->in6p_moptions; KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); INP_WLOCK(inp); /* * Lookup group on the socket. */ idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } imf = &imo->im6o_mfilters[idx]; /* * Ignore memberships which are in limbo. */ if (imf->im6f_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->im6f_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == MCAST_UNDEFINED || lims->im6sl_st[0] != imf->im6f_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in6 *)ptss; psin->sin6_family = AF_INET6; psin->sin6_len = sizeof(struct sockaddr_in6); psin->sin6_addr = lims->im6s_addr; psin->sin6_port = 0; --nsrcs; ++ptss; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; u_int optval; INP_WLOCK(inp); im6o = inp->in6p_moptions; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) { optval = 0; } else { optval = im6o->im6o_multicast_ifp->if_index; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = V_ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = in6_mcast_loop; /* XXX VIMAGE */ else optval = im6o->im6o_multicast_loop; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MSFILTER: if (im6o == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = in6p_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the address of an IPv6 group. * * This routine exists to support legacy IPv6 multicast applications. * * If inp is non-NULL, use this socket's current FIB number for any * required FIB lookup. Look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * If the FIB lookup fails, return NULL. * * FUTURE: Support multiple forwarding tables for IPv6. * * Returns NULL if no ifp could be found. */ static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *in6p, const struct sockaddr_in6 *gsin6) { struct nhop6_basic nh6; struct in6_addr dst; uint32_t scopeid; uint32_t fibnum; KASSERT(in6p->inp_vflag & INP_IPV6, ("%s: not INP_IPV6 inpcb", __func__)); KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); fibnum = in6p ? in6p->inp_inc.inc_fibnum : RT_DEFAULT_FIB; if (fib6_lookup_nh_basic(fibnum, &dst, scopeid, 0, 0, &nh6) != 0) return (NULL); return (nh6.nh_ifp); } /* * Join an IPv6 multicast group, possibly with a source. * * FIXME: The KAME use of the unspecified address (::) * to join *all* multicast groups is currently unsupported. */ static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; size_t idx; int error, is_new; ifp = NULL; imf = NULL; lims = NULL; error = 0; is_new = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything into struct group_source_req. * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * Ignore passed-in scope ID. */ switch (sopt->sopt_name) { case IPV6_JOIN_GROUP: { struct ipv6_mreq mreq; error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; if (mreq.ipv6mr_interface == 0) { ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { if (V_if_index < mreq.ipv6mr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(mreq.ipv6mr_interface); } CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p", __func__, mreq.ipv6mr_interface, ifp); } break; case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); ssa->sin6.sin6_port = 0; ssa->sin6.sin6_scope_id = 0; } if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) return (EADDRNOTAVAIL); ifp = ifnet_byindex(gsr.gsr_interface); break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; /* * Always set the scope zone ID on memberships created from userland. * Use the passed-in ifp to do this. * XXX The in6_setscope() return value is meaningless. * XXX SCOPE6_LOCK() is taken by in6_setscope(). */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1) { is_new = 1; } else { inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->im6f_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in6_msource is transactioned just as for anything * else in SSM -- but note naive use of in6m_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = im6o_match_source(imo, idx, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_in6p_locked; } } else { /* * MCAST_JOIN_GROUP alone, on any existing membership, * is rejected, to stop the same inpcb tying up * multiple refs to the in_multi. * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; goto out_in6p_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); if (is_new) { if (imo->im6o_num_memberships == imo->im6o_max_memberships) { error = im6o_grow(imo); if (error) goto out_in6p_locked; } /* * Allocate the new slot upfront so we can deal with * grafting the new source filter in same code path * as for join-source on existing membership. */ idx = imo->im6o_num_memberships; imo->im6o_membership[idx] = NULL; imo->im6o_num_memberships++; KASSERT(imo->im6o_mfilters != NULL, ("%s: im6f_mfilters vector was not allocated", __func__)); imf = &imo->im6o_mfilters[idx]; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6); if (lims == NULL) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_im6o_free; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); } } /* * Begin state merge transaction at MLD layer. */ + in_pcbref(inp); + INP_WUNLOCK(inp); IN6_MULTI_LOCK(); if (is_new) { - error = in6_mc_join_locked(ifp, &gsa->sin6.sin6_addr, imf, + error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, &inm, 0); if (error) { IN6_MULTI_UNLOCK(); goto out_im6o_free; } imo->im6o_membership[idx] = inm; } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); + IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } + IN6_MULTI_LIST_UNLOCK(); } IN6_MULTI_UNLOCK(); - INP_WLOCK_ASSERT(inp); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (ENXIO); if (error) { im6f_rollback(imf); if (is_new) im6f_purge(imf); else im6f_reap(imf); } else { im6f_commit(imf); } out_im6o_free: if (error && is_new) { imo->im6o_membership[idx] = NULL; --imo->im6o_num_memberships; } out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Leave an IPv6 multicast group on an inpcb, possibly with a source. */ static int in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct ipv6_mreq mreq; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; size_t idx; int error, is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; ifindex = 0; error = 0; is_final = 1; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything passed in up into a struct group_source_req * as that is easier to process. * Note: Any embedded scope ID in the multicast group passed * in by userland is ignored, the interface index is the recommended * mechanism to specify an interface; see below. */ switch (sopt->sopt_name) { case IPV6_LEAVE_GROUP: error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = mreq.ipv6mr_interface; break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); } gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = gsr.gsr_interface; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * Validate interface index if provided. If no interface index * was provided separately, attempt to look the membership up * from the default scope as a last resort to disambiguate * the membership we are being asked to leave. * XXX SCOPE6 lock potentially taken here. */ if (ifindex != 0) { if (V_if_index < ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); } else { error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone); if (error) return (EADDRNOTAVAIL); /* * Some badly behaved applications don't pass an ifindex * or a scope ID, which is an API violation. In this case, * perform a lookup as per a v6 join. * * XXX For now, stomp on zone ID for the corner case. * This is not the 'KAME way', but we need to see the ifp * directly until such time as this implementation is * refactored, assuming the scope IDs are the way to go. */ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]); if (ifindex == 0) { CTR2(KTR_MLD, "%s: warning: no ifindex, looking up " "ifp for group %s.", __func__, ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr)); ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { ifp = ifnet_byindex(ifindex); } if (ifp == NULL) return (EADDRNOTAVAIL); } CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); /* * Find the membership in the membership array. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; if (ssa->ss.ss_family != AF_UNSPEC) is_final = 0; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { im6f_leave(imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } ims = im6o_match_source(imo, idx, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } CTR2(KTR_MLD, "%s: %s source", __func__, "block"); error = im6f_prune(imf, &ssa->sin6); if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_in6p_locked; } } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LOCK(); if (is_final) { /* * Give up the multicast address record to which * the membership points. */ - (void)in6_mc_leave_locked(inm, imf); + (void)in6_leavegroup_locked(inm, imf); } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); + IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } + IN6_MULTI_LIST_UNLOCK(); } IN6_MULTI_UNLOCK(); if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); if (is_final) { /* Remove the gap in the membership array. */ for (++idx; idx < imo->im6o_num_memberships; ++idx) { imo->im6o_membership[idx-1] = imo->im6o_membership[idx]; imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx]; } imo->im6o_num_memberships--; } out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Select the interface for transmitting IPv6 multicast datagrams. * * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn * may be passed to this socket option. An address of in6addr_any or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct ifnet *ifp; struct ip6_moptions *imo; u_int ifindex; int error; if (sopt->sopt_valsize != sizeof(u_int)) return (EINVAL); error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); if (error) return (error); if (V_if_index < ifindex) return (EINVAL); if (ifindex == 0) ifp = NULL; else { ifp = ifnet_byindex(ifindex); if (ifp == NULL) return (EINVAL); if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); } imo = in6p_findmoptions(inp); imo->im6o_multicast_ifp = ifp; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv6 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; size_t idx; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); if (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); gsa->sin6.sin6_port = 0; /* ignore port */ if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) return (EADDRNOTAVAIL); ifp = ifnet_byindex(msfr.msfr_ifindex); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); idx = im6o_match_group(imo, ifp, &gsa->sa); if (idx == -1 || imo->im6o_mfilters == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imo->im6o_membership[idx]; imf = &imo->im6o_mfilters[idx]; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->im6f_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_MLD, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as im6f_leave() * will set it to INCLUDE. */ im6f_leave(imf); imf->im6f_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in6 *)pkss; if (psin->sin6_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (psin->sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; break; } if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) { error = EINVAL; break; } /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&psin->sin6_addr); error = im6f_get_source(imf, psin, &lims); if (error) break; lims->im6sl_st[1] = imf->im6f_st[1]; } free(kss, M_TEMP); } if (error) goto out_im6f_rollback; INP_WLOCK_ASSERT(inp); - IN6_MULTI_LOCK(); + IN6_MULTI_LIST_LOCK(); /* * Begin state merge transaction at MLD layer. */ CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv6 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. */ int ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; error = 0; /* * If socket is neither of type SOCK_RAW or SOCK_DGRAM, * or is a divert socket, reject it. */ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || (inp->inp_socket->so_proto->pr_type != SOCK_RAW && inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: error = in6p_set_multicast_if(inp, sopt); break; case IPV6_MULTICAST_HOPS: { int hlim; if (sopt->sopt_valsize != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); if (error) break; if (hlim < -1 || hlim > 255) { error = EINVAL; break; } else if (hlim == -1) { hlim = V_ip6_defmcasthlim; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_hlim = hlim; INP_WUNLOCK(inp); break; } case IPV6_MULTICAST_LOOP: { u_int loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (sopt->sopt_valsize != sizeof(u_int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); if (error) break; if (loop > 1) { error = EINVAL; break; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_loop = loop; INP_WUNLOCK(inp); break; } case IPV6_JOIN_GROUP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = in6p_join_group(inp, sopt); break; case IPV6_LEAVE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = in6p_leave_group(inp, sopt); break; case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = in6p_block_unblock_source(inp, sopt); break; case IPV6_MSFILTER: error = in6p_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose MLD's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in6_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; struct ip6_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); /* int: ifindex + 4 * 32 bits of IPv6 address */ if (namelen != 5) return (EINVAL); ifindex = name[0]; if (ifindex <= 0 || ifindex > V_if_index) { CTR2(KTR_MLD, "%s: ifindex %u out of range", __func__, ifindex); return (ENOENT); } memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { CTR2(KTR_MLD, "%s: group %s is not multicast", __func__, ip6_sprintf(ip6tbuf, &mcaddr)); return (EINVAL); } ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR2(KTR_MLD, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } /* * Internal MLD lookups require that scope/zone ID is set. */ (void)in6_setscope(&mcaddr, ifp, NULL); retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr))); if (retval) return (retval); IN6_MULTI_LOCK(); - + IN6_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { CTR2(KTR_MLD, "%s: visit node %p", __func__, ims); /* * Only copy-out sources which are in-mode. */ if (fmode != im6s_get_mode(inm, ims, 1)) { CTR1(KTR_MLD, "%s: skip non-in-mode", __func__); continue; } src = ims->im6s_addr; retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); if (retval != 0) break; } } IF_ADDR_RUNLOCK(ifp); + IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (retval); } #ifdef KTR static const char *in6m_modestrs[] = { "un", "in", "ex" }; static const char * in6m_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (in6m_modestrs[mode]); return ("??"); } static const char *in6m_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * in6m_state_str(const int state) { if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) return (in6m_statestrs[state]); return ("??"); } /* * Dump an in6_multi structure to the console. */ void in6m_print(const struct in6_multi *inm) { int t; char ip6tbuf[INET6_ADDRSTRLEN]; if ((ktr_mask & KTR_MLD) == 0) return; printf("%s: --- begin in6m %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp), inm->in6m_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->in6m_timer, in6m_state_str(inm->in6m_state), inm->in6m_refcount, mbufq_len(&inm->in6m_scq)); printf("mli %p nsrc %lu sctimer %u scrv %u\n", inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer, inm->in6m_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, in6m_mode_str(inm->in6m_st[t].iss_fmode), inm->in6m_st[t].iss_asm, inm->in6m_st[t].iss_ex, inm->in6m_st[t].iss_in, inm->in6m_st[t].iss_rec); } printf("%s: --- end in6m %p ---\n", __func__, inm); } #else /* !KTR */ void in6m_print(const struct in6_multi *inm) { } #endif /* KTR */ Index: head/sys/netinet6/in6_pcb.c =================================================================== --- head/sys/netinet6/in6_pcb.c (revision 333174) +++ head/sys/netinet6/in6_pcb.c (revision 333175) @@ -1,1315 +1,1314 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); int in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int error, lookupflags = 0; int reuseport = (so->so_options & SO_REUSEPORT); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in6_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); } else { sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * family check. */ if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if ((error = prison_local_ip6(cred, &sin6->sin6_addr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); lport = sin6->sin6_port; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL && (inp->inp_flags & INP_BINDANY) == 0) { return (EADDRNOTAVAIL); } /* * XXX: bind to an anycast address might accidentally * cause sending a packet with anycast source address. * We should allow to bind to a deprecated address, since * the application dares to use it. */ if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { ifa_free(ifa); return (EADDRNOTAVAIL); } if (ifa != NULL) ifa_free(ifa); } if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } #endif } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { tw = intotw(t); if (tw == NULL) return (EADDRINUSE); if ((reuseport & tw->tw_so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || ((inp->inp_vflag & INP_IPV6PROTO) == (t->inp_vflag & INP_IPV6PROTO)))) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_vflag & INP_IPV6PROTO) != 0)) return (EADDRINUSE); } #endif } inp->in6p_laddr = sin6->sin6_addr; } if (lport == 0) { if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; return (error); } } else { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } } return (0); } /* * Transform old in6_pcbconnect() into an inner subroutine for new * in6_pcbconnect(): Do some validity-checking on the remote * address (in mbuf 'nam') and then determine local host address * (i.e., which interface) to use to access that remote host. * * This preserves definition of in6_pcbconnect(), while supporting a * slightly different version for T/TCP. (This is more than * a bit of a kludge, but cleaning up the internal interfaces would * have forced minor changes in every protocol). */ static int in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in6_addr *plocal_addr6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; int scope_ambiguous = 0; struct in6_addr in6a; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); /* XXXRW: why? */ if (nam->sa_len != sizeof (*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if (!TAILQ_EMPTY(&V_in6_ifaddrhead)) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0) return (error); error = in6_selectsrc_socket(sin6, inp->in6p_outputopts, inp, inp->inp_cred, scope_ambiguous, &in6a, NULL); if (error) return (error); /* * Do not update this earlier, in case we return with an error. * * XXX: this in6_selectsrc_socket result might replace the bound local * address with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ *plocal_addr6 = in6a; /* * Don't do pcblookup call here; return interface in * plocal_addr6 * and exit to caller, that will do the lookup. */ return (0); } /* * Outer subroutine: * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr addr6; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return (error); if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, cred); if (error) return (error); } inp->in6p_laddr = addr6; } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; if (inp->inp_flags & IN6P_AUTOFLOWLABEL) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash_mbuf(inp, m); return (0); } int in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in6_pcbconnect_mbuf(inp, nam, cred, NULL)); } void in6_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); } struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK); bzero(sin6, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; sin6->sin6_addr = *addr_p; (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ return (struct sockaddr *)sin6; } struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = port; sin.sin_addr = *addr_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); in6_sin_2_v4mapsin6(&sin, sin6_p); return (struct sockaddr *)sin6_p; } int in6_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->in6p_laddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->in6p_faddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getsockaddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif { /* scope issues will be handled in in6_getsockaddr(). */ error = in6_getsockaddr(so, nam); } return error; } int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getpeeraddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif /* scope issues will be handled in in6_getpeeraddr(). */ error = in6_getpeeraddr(so, nam); return error; } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; int errno; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return; sa6_dst = (struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6_rtchange to invalidate the route cache. * Dead host indications: also use in6_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. */ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); if ((inp->inp_vflag & INP_IPV6) == 0) { INP_WUNLOCK(inp); continue; } /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && cmdarg != NULL) ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, *(u_int32_t *)cmdarg); /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. */ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr) || inp->inp_socket == 0 || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { INP_WUNLOCK(inp); continue; } do_notify: if (notify) { if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; int matchwild = 3, wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_WLOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_lport == lport) { /* Found. */ if (cred == NULL || prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) continue; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) wildcard++; if (!IN6_IS_ADDR_UNSPECIFIED( &inp->in6p_laddr)) { if (IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; else if (!IN6_ARE_ADDR_EQUAL( &inp->in6p_laddr, laddr)) continue; } else { if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *in6p; struct ip6_moptions *im6o; int i, gap; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { /* * Unselect the outgoing ifp for multicast if it * is being detached. */ if (im6o->im6o_multicast_ifp == ifp) im6o->im6o_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ gap = 0; for (i = 0; i < im6o->im6o_num_memberships; i++) { if (im6o->im6o_membership[i]->in6m_ifp == ifp) { - in6_mc_leave(im6o->im6o_membership[i], - NULL); + in6_leavegroup(im6o->im6o_membership[i], NULL); gap++; } else if (gap != 0) { im6o->im6o_membership[i - gap] = im6o->im6o_membership[i]; } } im6o->im6o_num_memberships -= gap; } INP_WUNLOCK(in6p); } INP_INFO_WUNLOCK(pcbinfo); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in6_losing(struct inpcb *in6p) { if (in6p->inp_route6.ro_rt) { RTFREE(in6p->inp_route6.ro_rt); in6p->inp_route6.ro_rt = (struct rtentry *)NULL; } if (in6p->inp_route.ro_lle) LLE_FREE(in6p->inp_route.ro_lle); /* zeros ro_lle */ return; } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ struct inpcb * in6_rtchange(struct inpcb *inp, int errno) { if (inp->inp_route6.ro_rt) { RTFREE(inp->inp_route6.ro_rt); inp->inp_route6.ro_rt = (struct rtentry *)NULL; } if (inp->inp_route.ro_lle) LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */ return inp; } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } /* * Then look for a wildcard match in the pcbgroup. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[ INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking buf", __func__); if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list. */ static struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) return (inp); else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ /* * Not found. */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; bool locked; INP_HASH_RLOCK(pcbinfo); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (!locked) in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in6_pcbgroup.c. */ struct inpcb * in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && M_HASHTYPE_TEST(m, M_HASHTYPE_NONE) == 0) { pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int srcordst) { struct ip6_hdr *ip; ip = mtod(m, struct ip6_hdr *); bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_addr = srcordst ? ip->ip6_dst : ip->ip6_src; (void)sa6_recoverscope(sin6); /* XXX: should catch errors... */ return; } Index: head/sys/netinet6/in6_var.h =================================================================== --- head/sys/netinet6/in6_var.h (revision 333174) +++ head/sys/netinet6/in6_var.h (revision 333175) @@ -1,834 +1,879 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_var.h,v 1.56 2001/03/29 05:34:31 itojun Exp $ */ /*- * Copyright (c) 1985, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_var.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET6_IN6_VAR_H_ #define _NETINET6_IN6_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Interface address, Internet version. One of these structures * is allocated for each interface with an Internet address. * The ifaddr structure contains the protocol-independent part * of the structure and is assumed to be first. */ /* * pltime/vltime are just for future reference (required to implements 2 * hour rule for hosts). they should never be modified by nd6_timeout or * anywhere else. * userland -> kernel: accept pltime/vltime * kernel -> userland: throw up everything * in kernel: modify preferred/expire only */ struct in6_addrlifetime { time_t ia6t_expire; /* valid lifetime expiration time */ time_t ia6t_preferred; /* preferred lifetime expiration time */ u_int32_t ia6t_vltime; /* valid lifetime */ u_int32_t ia6t_pltime; /* prefix lifetime */ }; struct nd_ifinfo; struct scope6_id; struct lltable; struct mld_ifsoftc; +struct in6_multi; struct in6_ifextra { counter_u64_t *in6_ifstat; counter_u64_t *icmp6_ifstat; struct nd_ifinfo *nd_ifinfo; struct scope6_id *scope6_id; struct lltable *lltable; struct mld_ifsoftc *mld_ifinfo; }; #define LLTABLE6(ifp) (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable) #ifdef _KERNEL + +SLIST_HEAD(in6_multi_head, in6_multi); +MALLOC_DECLARE(M_IP6MADDR); + struct in6_ifaddr { struct ifaddr ia_ifa; /* protocol-independent info */ #define ia_ifp ia_ifa.ifa_ifp #define ia_flags ia_ifa.ifa_flags struct sockaddr_in6 ia_addr; /* interface address */ struct sockaddr_in6 ia_net; /* network number of interface */ struct sockaddr_in6 ia_dstaddr; /* space for destination addr */ struct sockaddr_in6 ia_prefixmask; /* prefix mask */ u_int32_t ia_plen; /* prefix length */ TAILQ_ENTRY(in6_ifaddr) ia_link; /* list of IPv6 addresses */ int ia6_flags; struct in6_addrlifetime ia6_lifetime; time_t ia6_createtime; /* the creation time of this address, which is * currently used for temporary addresses only. */ time_t ia6_updatetime; /* back pointer to the ND prefix (for autoconfigured addresses only) */ struct nd_prefix *ia6_ndpr; /* multicast addresses joined from the kernel */ LIST_HEAD(, in6_multi_mship) ia6_memberships; /* entry in bucket of inet6 addresses */ LIST_ENTRY(in6_ifaddr) ia6_hash; }; /* List of in6_ifaddr's. */ TAILQ_HEAD(in6_ifaddrhead, in6_ifaddr); LIST_HEAD(in6_ifaddrlisthead, in6_ifaddr); #endif /* _KERNEL */ /* control structure to manage address selection policy */ struct in6_addrpolicy { struct sockaddr_in6 addr; /* prefix address */ struct sockaddr_in6 addrmask; /* prefix mask */ int preced; /* precedence */ int label; /* matching label */ u_quad_t use; /* statistics */ }; /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). */ struct in6_ifstat { uint64_t ifs6_in_receive; /* # of total input datagram */ uint64_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */ uint64_t ifs6_in_toobig; /* # of datagrams exceeded MTU */ uint64_t ifs6_in_noroute; /* # of datagrams with no route */ uint64_t ifs6_in_addrerr; /* # of datagrams with invalid dst */ uint64_t ifs6_in_protounknown; /* # of datagrams with unknown proto */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_truncated; /* # of truncated datagrams */ uint64_t ifs6_in_discard; /* # of discarded datagrams */ /* NOTE: fragment timeout is not here */ uint64_t ifs6_in_deliver; /* # of datagrams delivered to ULP */ /* NOTE: increment on final dst if */ uint64_t ifs6_out_forward; /* # of datagrams forwarded */ /* NOTE: increment on outgoing if */ uint64_t ifs6_out_request; /* # of outgoing datagrams from ULP */ /* NOTE: does not include forwrads */ uint64_t ifs6_out_discard; /* # of discarded datagrams */ uint64_t ifs6_out_fragok; /* # of datagrams fragmented */ uint64_t ifs6_out_fragfail; /* # of datagrams failed on fragment */ uint64_t ifs6_out_fragcreat; /* # of fragment datagrams */ /* NOTE: this is # after fragment */ uint64_t ifs6_reass_reqd; /* # of incoming fragmented packets */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_ok; /* # of reassembled packets */ /* NOTE: this is # after reass */ /* NOTE: increment on final dst if */ uint64_t ifs6_reass_fail; /* # of reass failures */ /* NOTE: may not be packet count */ /* NOTE: increment on final dst if */ uint64_t ifs6_in_mcast; /* # of inbound multicast datagrams */ uint64_t ifs6_out_mcast; /* # of outbound multicast datagrams */ }; /* * ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry. * XXX: I'm not sure if this file is the right place for this structure... */ struct icmp6_ifstat { /* * Input statistics */ /* ipv6IfIcmpInMsgs, total # of input messages */ uint64_t ifs6_in_msg; /* ipv6IfIcmpInErrors, # of input error messages */ uint64_t ifs6_in_error; /* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */ uint64_t ifs6_in_dstunreach; /* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */ uint64_t ifs6_in_adminprohib; /* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */ uint64_t ifs6_in_timeexceed; /* ipv6IfIcmpInParmProblems, # of input parameter problem errors */ uint64_t ifs6_in_paramprob; /* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */ uint64_t ifs6_in_pkttoobig; /* ipv6IfIcmpInEchos, # of input echo requests */ uint64_t ifs6_in_echo; /* ipv6IfIcmpInEchoReplies, # of input echo replies */ uint64_t ifs6_in_echoreply; /* ipv6IfIcmpInRouterSolicits, # of input router solicitations */ uint64_t ifs6_in_routersolicit; /* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */ uint64_t ifs6_in_routeradvert; /* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */ uint64_t ifs6_in_neighborsolicit; /* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */ uint64_t ifs6_in_neighboradvert; /* ipv6IfIcmpInRedirects, # of input redirects */ uint64_t ifs6_in_redirect; /* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */ uint64_t ifs6_in_mldquery; /* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */ uint64_t ifs6_in_mldreport; /* ipv6IfIcmpInGroupMembReductions, # of input MLD done */ uint64_t ifs6_in_mlddone; /* * Output statistics. We should solve unresolved routing problem... */ /* ipv6IfIcmpOutMsgs, total # of output messages */ uint64_t ifs6_out_msg; /* ipv6IfIcmpOutErrors, # of output error messages */ uint64_t ifs6_out_error; /* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */ uint64_t ifs6_out_dstunreach; /* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */ uint64_t ifs6_out_adminprohib; /* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */ uint64_t ifs6_out_timeexceed; /* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */ uint64_t ifs6_out_paramprob; /* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */ uint64_t ifs6_out_pkttoobig; /* ipv6IfIcmpOutEchos, # of output echo requests */ uint64_t ifs6_out_echo; /* ipv6IfIcmpOutEchoReplies, # of output echo replies */ uint64_t ifs6_out_echoreply; /* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */ uint64_t ifs6_out_routersolicit; /* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */ uint64_t ifs6_out_routeradvert; /* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */ uint64_t ifs6_out_neighborsolicit; /* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */ uint64_t ifs6_out_neighboradvert; /* ipv6IfIcmpOutRedirects, # of output redirects */ uint64_t ifs6_out_redirect; /* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */ uint64_t ifs6_out_mldquery; /* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */ uint64_t ifs6_out_mldreport; /* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */ uint64_t ifs6_out_mlddone; }; struct in6_ifreq { char ifr_name[IFNAMSIZ]; union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; int ifru_flags; int ifru_flags6; int ifru_metric; caddr_t ifru_data; struct in6_addrlifetime ifru_lifetime; struct in6_ifstat ifru_stat; struct icmp6_ifstat ifru_icmp6stat; u_int32_t ifru_scope_id[16]; } ifr_ifru; }; struct in6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; int ifra_vhid; }; /* pre-10.x compat */ struct oin6_aliasreq { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; struct sockaddr_in6 ifra_dstaddr; struct sockaddr_in6 ifra_prefixmask; int ifra_flags; struct in6_addrlifetime ifra_lifetime; }; /* prefix type macro */ #define IN6_PREFIX_ND 1 #define IN6_PREFIX_RR 2 /* * prefix related flags passed between kernel(NDP related part) and * user land command(ifconfig) and daemon(rtadvd). */ struct in6_prflags { struct prf_ra { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } prf_ra; u_char prf_reserved1; u_short prf_reserved2; /* want to put this on 4byte offset */ struct prf_rr { u_char decrvalid : 1; u_char decrprefd : 1; u_char reserved : 6; } prf_rr; u_char prf_reserved3; u_short prf_reserved4; }; struct in6_prefixreq { char ipr_name[IFNAMSIZ]; u_char ipr_origin; u_char ipr_plen; u_int32_t ipr_vltime; u_int32_t ipr_pltime; struct in6_prflags ipr_flags; struct sockaddr_in6 ipr_prefix; }; #define PR_ORIG_RA 0 #define PR_ORIG_RR 1 #define PR_ORIG_STATIC 2 #define PR_ORIG_KERNEL 3 #define ipr_raf_onlink ipr_flags.prf_ra.onlink #define ipr_raf_auto ipr_flags.prf_ra.autonomous #define ipr_statef_onlink ipr_flags.prf_state.onlink #define ipr_rrf_decrvalid ipr_flags.prf_rr.decrvalid #define ipr_rrf_decrprefd ipr_flags.prf_rr.decrprefd struct in6_rrenumreq { char irr_name[IFNAMSIZ]; u_char irr_origin; u_char irr_m_len; /* match len for matchprefix */ u_char irr_m_minlen; /* minlen for matching prefix */ u_char irr_m_maxlen; /* maxlen for matching prefix */ u_char irr_u_uselen; /* uselen for adding prefix */ u_char irr_u_keeplen; /* keeplen from matching prefix */ struct irr_raflagmask { u_char onlink : 1; u_char autonomous : 1; u_char reserved : 6; } irr_raflagmask; u_int32_t irr_vltime; u_int32_t irr_pltime; struct in6_prflags irr_flags; struct sockaddr_in6 irr_matchprefix; struct sockaddr_in6 irr_useprefix; }; #define irr_raf_mask_onlink irr_raflagmask.onlink #define irr_raf_mask_auto irr_raflagmask.autonomous #define irr_raf_mask_reserved irr_raflagmask.reserved #define irr_raf_onlink irr_flags.prf_ra.onlink #define irr_raf_auto irr_flags.prf_ra.autonomous #define irr_statef_onlink irr_flags.prf_state.onlink #define irr_rrf irr_flags.prf_rr #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 */ #define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr)) #define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr)) #define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr)) #define IA6_SIN6(ia) (&((ia)->ia_addr)) #define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr)) #define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr) #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) #ifdef _KERNEL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) #define IN6_MASK_ADDR(a, m) do { \ (a)->s6_addr32[0] &= (m)->s6_addr32[0]; \ (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \ (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \ (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \ } while (0) #endif #define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq) #define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq) #ifdef _KERNEL /* * SIOCSxxx ioctls should be unused (see comments in in6.c), but * we do not shift numbers for binary compatibility. */ #define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq) #define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq) #endif #define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq) #define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq) #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) #define OSIOCAIFADDR_IN6 _IOW('i', 26, struct oin6_aliasreq) #define SIOCAIFADDR_IN6 _IOW('i', 27, struct in6_aliasreq) #define SIOCSIFPHYADDR_IN6 _IOW('i', 70, struct in6_aliasreq) #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 71, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 72, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) #ifdef _KERNEL #define OSIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #endif #define SIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ndireq) #define SIOCSIFINFO_IN6 _IOWR('i', 109, struct in6_ndireq) #define SIOCSNDFLUSH_IN6 _IOWR('i', 77, struct in6_ifreq) #define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo) #define SIOCSPFXFLUSH_IN6 _IOWR('i', 79, struct in6_ifreq) #define SIOCSRTRFLUSH_IN6 _IOWR('i', 80, struct in6_ifreq) #define SIOCGIFALIFETIME_IN6 _IOWR('i', 81, struct in6_ifreq) #define SIOCGIFSTAT_IN6 _IOWR('i', 83, struct in6_ifreq) #define SIOCGIFSTAT_ICMP6 _IOWR('i', 84, struct in6_ifreq) #define SIOCSDEFIFACE_IN6 _IOWR('i', 85, struct in6_ndifreq) #define SIOCGDEFIFACE_IN6 _IOWR('i', 86, struct in6_ndifreq) #define SIOCSIFINFO_FLAGS _IOWR('i', 87, struct in6_ndireq) /* XXX */ #define SIOCSSCOPE6 _IOW('i', 88, struct in6_ifreq) #define SIOCGSCOPE6 _IOWR('i', 89, struct in6_ifreq) #define SIOCGSCOPE6DEF _IOWR('i', 90, struct in6_ifreq) #define SIOCSIFPREFIX_IN6 _IOW('i', 100, struct in6_prefixreq) /* set */ #define SIOCGIFPREFIX_IN6 _IOWR('i', 101, struct in6_prefixreq) /* get */ #define SIOCDIFPREFIX_IN6 _IOW('i', 102, struct in6_prefixreq) /* del */ #define SIOCAIFPREFIX_IN6 _IOW('i', 103, struct in6_rrenumreq) /* add */ #define SIOCCIFPREFIX_IN6 _IOW('i', 104, \ struct in6_rrenumreq) /* change */ #define SIOCSGIFPREFIX_IN6 _IOW('i', 105, \ struct in6_rrenumreq) /* set global */ #define SIOCGETSGCNT_IN6 _IOWR('u', 106, \ struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ #define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) #define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #define IN6_IFF_ANYCAST 0x01 /* anycast address */ #define IN6_IFF_TENTATIVE 0x02 /* tentative address */ #define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */ #define IN6_IFF_DETACHED 0x08 /* may be detached from the link */ #define IN6_IFF_DEPRECATED 0x10 /* deprecated address */ #define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address * (obsolete) */ #define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */ #define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */ #define IN6_IFF_PREFER_SOURCE 0x0100 /* preferred address for SAS */ /* do not input/output */ #define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED) #ifdef _KERNEL #define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b)) #define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b)) #endif #ifdef _KERNEL VNET_DECLARE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DECLARE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DECLARE(u_long, in6_ifaddrhmask); #define V_in6_ifaddrhead VNET(in6_ifaddrhead) #define V_in6_ifaddrhashtbl VNET(in6_ifaddrhashtbl) #define V_in6_ifaddrhmask VNET(in6_ifaddrhmask) #define IN6ADDR_NHASH_LOG2 8 #define IN6ADDR_NHASH (1 << IN6ADDR_NHASH_LOG2) #define IN6ADDR_HASHVAL(x) (in6_addrhash(x)) #define IN6ADDR_HASH(x) \ (&V_in6_ifaddrhashtbl[IN6ADDR_HASHVAL(x) & V_in6_ifaddrhmask]) static __inline uint32_t in6_addrhash(const struct in6_addr *in6) { uint32_t x; x = in6->s6_addr32[0] ^ in6->s6_addr32[1] ^ in6->s6_addr32[2] ^ in6->s6_addr32[3]; return (fnv_32_buf(&x, sizeof(x), FNV1_32_INIT)); } extern struct rmlock in6_ifaddr_lock; #define IN6_IFADDR_LOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_LOCKED) #define IN6_IFADDR_RLOCK(t) rm_rlock(&in6_ifaddr_lock, (t)) #define IN6_IFADDR_RLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_RLOCKED) #define IN6_IFADDR_RUNLOCK(t) rm_runlock(&in6_ifaddr_lock, (t)) #define IN6_IFADDR_WLOCK() rm_wlock(&in6_ifaddr_lock) #define IN6_IFADDR_WLOCK_ASSERT() rm_assert(&in6_ifaddr_lock, RA_WLOCKED) #define IN6_IFADDR_WUNLOCK() rm_wunlock(&in6_ifaddr_lock) #define in6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ counter_u64_add(((struct in6_ifextra *) \ ((ifp)->if_afdata[AF_INET6]))->in6_ifstat[ \ offsetof(struct in6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) extern u_char inet6ctlerrmap[]; VNET_DECLARE(unsigned long, in6_maxmtu); #define V_in6_maxmtu VNET(in6_maxmtu) #endif /* _KERNEL */ /* * IPv6 multicast MLD-layer source entry. */ struct ip6_msource { RB_ENTRY(ip6_msource) im6s_link; /* RB tree links */ struct in6_addr im6s_addr; struct im6s_st { uint16_t ex; /* # of exclusive members */ uint16_t in; /* # of inclusive members */ } im6s_st[2]; /* state at t0, t1 */ uint8_t im6s_stp; /* pending query */ }; RB_HEAD(ip6_msource_tree, ip6_msource); /* * IPv6 multicast PCB-layer source entry. * * NOTE: overlapping use of struct ip6_msource fields at start. */ struct in6_msource { RB_ENTRY(ip6_msource) im6s_link; /* Common field */ struct in6_addr im6s_addr; /* Common field */ uint8_t im6sl_st[2]; /* state before/at commit */ }; #ifdef _KERNEL /* * IPv6 source tree comparison function. * * An ordered predicate is necessary; bcmp() is not documented to return * an indication of order, memcmp() is, and is an ISO C99 requirement. */ static __inline int ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b) { return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr))); } RB_PROTOTYPE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * IPv6 multicast PCB-layer group filter descriptor. */ struct in6_mfilter { struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ u_long im6f_nsrc; /* # of source entries */ uint8_t im6f_st[2]; /* state before/at commit */ }; /* * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { struct in6_multi *i6mm_maddr; LIST_ENTRY(in6_multi_mship) i6mm_chain; }; /* * IPv6 group descriptor. * * For every entry on an ifnet's if_multiaddrs list which represents * an IP multicast group, there is one of these structures. * * If any source filters are present, then a node will exist in the RB-tree * to permit fast lookup by source whenever an operation takes place. * This permits pre-order traversal when we issue reports. * Source filter trees are kept separately from the socket layer to * greatly simplify locking. * * When MLDv2 is active, in6m_timer is the response to group query timer. * The state-change timer in6m_sctimer is separate; whenever state changes * for the group the state change record is generated and transmitted, * and kept if retransmissions are necessary. * * FUTURE: in6m_link is now only used when groups are being purged * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but * because it is at the very start of the struct, we can't do this * w/o breaking the ABI for ifmcstat. */ struct in6_multi { - LIST_ENTRY(in6_multi) in6m_entry; /* list glue */ struct in6_addr in6m_addr; /* IPv6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ struct ifmultiaddr *in6m_ifma; /* back pointer to ifmultiaddr */ u_int in6m_refcount; /* reference count */ u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ /* New fields for MLDv2 follow. */ struct mld_ifsoftc *in6m_mli; /* MLD info */ SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ struct ip6_msource_tree in6m_srcs; /* tree of sources */ u_long in6m_nsrc; /* # of tree entries */ struct mbufq in6m_scq; /* queue of pending * state-change packets */ struct timeval in6m_lastgsrtv; /* last G-S-R query */ uint16_t in6m_sctimer; /* state-change timer */ uint16_t in6m_scrv; /* state-change rexmit count */ /* * SSM state counters which track state at T0 (the time the last * state-change report's RV timer went to zero) and T1 * (time of pending report, i.e. now). * Used for computing MLDv2 state-change reports. Several refcounts * are maintained here to optimize for common use-cases. */ struct in6m_st { uint16_t iss_fmode; /* MLD filter mode */ uint16_t iss_asm; /* # of ASM listeners */ uint16_t iss_ex; /* # of exclusive members */ uint16_t iss_in; /* # of inclusive members */ uint16_t iss_rec; /* # of recorded sources */ } in6m_st[2]; /* state at t0, t1 */ }; /* * Helper function to derive the filter mode on a source entry * from its internal counters. Predicates are: * A source is only excluded if all listeners exclude it. * A source is only included if no listeners exclude it, * and at least one listener includes it. * May be used by ifmcstat(8). */ static __inline uint8_t im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims, uint8_t t) { t = !!t; if (inm->in6m_st[t].iss_ex > 0 && inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex) return (MCAST_EXCLUDE); else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0) return (MCAST_INCLUDE); return (MCAST_UNDEFINED); } /* * Lock macros for IPv6 layer multicast address lists. IPv6 lock goes * before link layer multicast locks in the lock order. In most cases, * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ -extern struct mtx in6_multi_mtx; -#define IN6_MULTI_LOCK() mtx_lock(&in6_multi_mtx) -#define IN6_MULTI_UNLOCK() mtx_unlock(&in6_multi_mtx) -#define IN6_MULTI_LOCK_ASSERT() mtx_assert(&in6_multi_mtx, MA_OWNED) -#define IN6_MULTI_UNLOCK_ASSERT() mtx_assert(&in6_multi_mtx, MA_NOTOWNED) +extern struct mtx in6_multi_list_mtx; +extern struct sx in6_multi_sx; +#define IN6_MULTI_LIST_LOCK() mtx_lock(&in6_multi_list_mtx) +#define IN6_MULTI_LIST_UNLOCK() mtx_unlock(&in6_multi_list_mtx) +#define IN6_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_OWNED) +#define IN6_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in6_multi_list_mtx, MA_NOTOWNED) + +#define IN6_MULTI_LOCK() sx_xlock(&in6_multi_sx) +#define IN6_MULTI_UNLOCK() sx_xunlock(&in6_multi_sx) +#define IN6_MULTI_LOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XLOCKED) +#define IN6_MULTI_UNLOCK_ASSERT() sx_assert(&in6_multi_sx, SA_XUNLOCKED) + /* * Look up an in6_multi record for an IPv6 multicast address * on the interface ifp. * If no record found, return NULL. * * SMPng: The IN6_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. */ static __inline struct in6_multi * in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct ifmultiaddr *ifma; struct in6_multi *inm; - IN6_MULTI_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family == AF_INET6) { inm = (struct in6_multi *)ifma->ifma_protospec; + if (inm == NULL) + continue; if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, mcaddr)) break; inm = NULL; } } return (inm); } /* * Wrapper for in6m_lookup_locked(). * * SMPng: Assumes that neithr the IN6_MULTI_LOCK() or IF_ADDR_LOCK() are held. */ static __inline struct in6_multi * in6m_lookup(struct ifnet *ifp, const struct in6_addr *mcaddr) { struct in6_multi *inm; - IN6_MULTI_LOCK(); + IN6_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, mcaddr); IF_ADDR_RUNLOCK(ifp); - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); return (inm); } /* Acquire an in6_multi record. */ static __inline void in6m_acquire_locked(struct in6_multi *inm) { - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); ++inm->in6m_refcount; } +static __inline void +in6m_acquire(struct in6_multi *inm) +{ + IN6_MULTI_LIST_LOCK(); + in6m_acquire_locked(inm); + IN6_MULTI_LIST_UNLOCK(); +} + +static __inline void +in6m_rele_locked(struct in6_multi_head *inmh, struct in6_multi *inm) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct in6_ifaddr *ifa6; + struct in6_multi_mship *imm; + + KASSERT(inm->in6m_refcount > 0, ("refcount == %d inm: %p", inm->in6m_refcount, inm)); + IN6_MULTI_LIST_LOCK_ASSERT(); + + if (--inm->in6m_refcount == 0) { + ifp = inm->in6m_ifp; + IF_ADDR_LOCK_ASSERT(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + + ifa6 = (void *)ifa; + LIST_FOREACH(imm, &ifa6->ia6_memberships, i6mm_chain) { + if (inm == imm->i6mm_maddr) + imm->i6mm_maddr = NULL; + } + } + inm->in6m_ifma->ifma_protospec = NULL; + SLIST_INSERT_HEAD(inmh, inm, in6m_nrele); + } +} + struct ip6_moptions; struct sockopt; /* Multicast KPIs. */ int im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); -int in6_mc_join(struct ifnet *, const struct in6_addr *, +int in6_joingroup(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); -int in6_mc_join_locked(struct ifnet *, const struct in6_addr *, +int in6_joingroup_locked(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); -int in6_mc_leave(struct in6_multi *, struct in6_mfilter *); -int in6_mc_leave_locked(struct in6_multi *, struct in6_mfilter *); +int in6_leavegroup(struct in6_multi *, struct in6_mfilter *); +int in6_leavegroup_locked(struct in6_multi *, struct in6_mfilter *); void in6m_clear_recorded(struct in6_multi *); void in6m_commit(struct in6_multi *); void in6m_print(const struct in6_multi *); int in6m_record_source(struct in6_multi *, const struct in6_addr *); -void in6m_release_locked(struct in6_multi *); +void in6m_release_deferred(struct in6_multi *); +void in6m_release_list_deferred(struct in6_multi_head *); void ip6_freemoptions(struct ip6_moptions *); int ip6_getmoptions(struct inpcb *, struct sockopt *); int ip6_setmoptions(struct inpcb *, struct sockopt *); - -/* Legacy KAME multicast KPIs. */ -struct in6_multi_mship * - in6_joingroup(struct ifnet *, struct in6_addr *, int *, int); -int in6_leavegroup(struct in6_multi_mship *); /* flags to in6_update_ifa */ #define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ int in6_mask2len(struct in6_addr *, u_char *); int in6_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); void in6_prepare_ifra(struct in6_aliasreq *, const struct in6_addr *, const struct in6_addr *); void in6_purgeaddr(struct ifaddr *); int in6if_do_dad(struct ifnet *); void in6_savemkludge(struct in6_ifaddr *); void *in6_domifattach(struct ifnet *); void in6_domifdetach(struct ifnet *, void *); int in6_domifmtu(struct ifnet *); void in6_setmaxmtu(void); int in6_if2idlen(struct ifnet *); struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int); struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, const struct in6_addr *); struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t); struct in6_ifaddr *in6ifa_llaonifp(struct ifnet *); int in6_addr2zoneid(struct ifnet *, struct in6_addr *, u_int32_t *); int in6_matchlen(struct in6_addr *, struct in6_addr *); int in6_are_prefix_equal(struct in6_addr *, struct in6_addr *, int); void in6_prefixlen2mask(struct in6_addr *, int); int in6_prefix_ioctl(struct socket *, u_long, caddr_t, struct ifnet *); int in6_prefix_add_ifid(int, struct in6_ifaddr *); void in6_prefix_remove_ifid(int, struct in6_ifaddr *); void in6_purgeprefix(struct ifnet *); int in6_is_addr_deprecated(struct sockaddr_in6 *); int in6_src_ioctl(u_long, caddr_t); void in6_newaddrmsg(struct in6_ifaddr *, int); /* * Extended API for IPv6 FIB support. */ struct mbuf *ip6_tryforward(struct mbuf *); void in6_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int in6_rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); void in6_rtalloc(struct route_in6 *, u_int); void in6_rtalloc_ign(struct route_in6 *, u_long, u_int); struct rtentry *in6_rtalloc1(struct sockaddr *, int, u_long, u_int); #endif /* _KERNEL */ #endif /* _NETINET6_IN6_VAR_H_ */ Index: head/sys/netinet6/mld6.c =================================================================== --- head/sys/netinet6/mld6.c (revision 333174) +++ head/sys/netinet6/mld6.c (revision 333175) @@ -1,3317 +1,3305 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static struct mld_ifsoftc * mli_alloc_locked(struct ifnet *); static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); static void mld_fasttimo_vnet(void); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifsoftc *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); -static void mld_v1_process_group_timer(struct mld_ifsoftc *, +static void mld_v1_process_group_timer(struct in6_multi_head *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifsoftc *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifsoftc *); static void mld_v2_dispatch_general_query(struct mld_ifsoftc *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct mbufq *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); -static void mld_v2_process_group_timers(struct mld_ifsoftc *, +static void mld_v2_process_group_timers(struct in6_multi_head *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, struct mld_ifsoftc *mli, int, struct mbuf *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0}; static VNET_DEFINE(LIST_HEAD(, mld_ifsoftc), mli_head); static VNET_DEFINE(int, interface_timers_running6); static VNET_DEFINE(int, state_change_timers_running6); static VNET_DEFINE(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. */ SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. */ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, ("%s: called when curvnet was not restored: cuvnet %p m ptr %p", __func__, curvnet, m->m_pkthdr.PH_loc.ptr)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifsoftc *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); + IN6_MULTI_LIST_LOCK(); MLD_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { struct mld_ifinfo info; info.mli_version = mli->mli_version; info.mli_v1_timer = mli->mli_v1_timer; info.mli_v2_timer = mli->mli_v2_timer; info.mli_flags = mli->mli_flags; info.mli_rv = mli->mli_rv; info.mli_qi = mli->mli_qi; info.mli_qri = mli->mli_qri; info.mli_uri = mli->mli_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: MLD_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct mbufq *mq, int limit) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ struct mld_ifsoftc * mld_domifattach(struct ifnet *ifp) { struct mld_ifsoftc *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli = mli_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_UNLOCK(); return (mli); } /* * VIMAGE: assume curvnet set by caller. */ static struct mld_ifsoftc * mli_alloc_locked(/*const*/ struct ifnet *ifp) { struct mld_ifsoftc *mli; MLD_LOCK_ASSERT(); mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_NOWAIT|M_ZERO); if (mli == NULL) goto out; mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; - SLIST_INIT(&mli->mli_relinmhead); mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); CTR2(KTR_MLD, "allocate mld_ifsoftc for ifp %p(%s)", ifp, if_name(ifp)); out: return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp) { struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; - struct in6_multi *inm, *tinm; + struct in6_multi *inm; + struct in6_multi_head inmh; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); - IN6_MULTI_LOCK_ASSERT(); + SLIST_INIT(&inmh); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); if (mli->mli_version == MLD_VERSION_2) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (inm->in6m_state == MLD_LEAVING_MEMBER) { - SLIST_INSERT_HEAD(&mli->mli_relinmhead, - inm, in6m_nrele); + in6m_rele_locked(&inmh, inm); + ifma->ifma_protospec = NULL; } in6m_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); - SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, - tinm) { - SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); - in6m_release_locked(inm); - } } MLD_UNLOCK(); + in6m_release_list_deferred(&inmh); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(const struct ifnet *ifp) { struct mld_ifsoftc *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. */ mbufq_drain(&mli->mli_gq); LIST_REMOVE(mli, mli_link); - KASSERT(SLIST_EMPTY(&mli->mli_relinmhead), - ("%s: there are dangling in_multi references", - __func__)); - free(mli, M_MLD); return; } } } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } - IN6_MULTI_LOCK(); + IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. */ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; IF_ADDR_RLOCK(ifp); if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. */ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that the query header has been pulled up to sizeof(mldv2_query). * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, const int off, const int icmp6len) { struct mld_ifsoftc *mli; struct mldv2_query *mld; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } - IN6_MULTI_LOCK(); + IN6_MULTI_LIST_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. * The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) { IF_ADDR_RUNLOCK(ifp); goto out_locked; } if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); IF_ADDR_RUNLOCK(ifp); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); IF_ADDR_RUNLOCK(ifp); } out_locked: MLD_UNLOCK(); - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, int timer, struct mbuf *m0, const int off) { struct mldv2_query *mld; int retval; uint16_t nsrc; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct mbuf *m; uint8_t *sp; int i, nrecorded; int soff; m = m0; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { sp = mtod(m, uint8_t *) + soff; retval = in6m_record_source(inm, (const struct in6_addr *)sp); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); if (soff >= m->m_len) { soff = soff - m->m_len; m = m->m_next; if (m == NULL) break; } } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::. * * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); - IN6_MULTI_LOCK(); + IN6_MULTI_LIST_LOCK(); MLD_LOCK(); IF_ADDR_RLOCK(ifp); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifsoftc *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. * Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mld_hdr *mld; int mldlen; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); /* Pullup to appropriate size. */ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); if (mld == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. */ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(void) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; + struct in6_multi_head inmh; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; - IN6_MULTI_LOCK(); + SLIST_INIT(&inmh); + IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. */ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_FASTHZ); mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS); mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (mli->mli_version) { case MLD_VERSION_1: - mld_v1_process_group_timer(mli, inm); + mld_v1_process_group_timer(&inmh, inm); break; case MLD_VERSION_2: - mld_v2_process_group_timers(mli, &qrq, + mld_v2_process_group_timers(&inmh, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ - SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, - in6m_nrele, tinm) { - SLIST_REMOVE_HEAD(&mli->mli_relinmhead, + SLIST_FOREACH_SAFE(inm, &inmh, in6m_nrele, tinm) { + SLIST_REMOVE_HEAD(&inmh, in6m_nrele); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); /* * Free the in_multi reference(s) for * this lifecycle. */ - SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, - in6m_nrele, tinm) { - SLIST_REMOVE_HEAD(&mli->mli_relinmhead, - in6m_nrele); - in6m_release_locked(inm); - } + in6m_release_list_deferred(&inmh); break; } } out_locked: MLD_UNLOCK(); - IN6_MULTI_UNLOCK(); + IN6_MULTI_LIST_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void -mld_v1_process_group_timer(struct mld_ifsoftc *mli, struct in6_multi *inm) +mld_v1_process_group_timer(struct in6_multi_head *inmh, struct in6_multi *inm) { int report_timer_expired; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; - SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, - in6m_nrele); + in6m_rele_locked(inmh, inm); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void -mld_v2_process_group_timers(struct mld_ifsoftc *mli, +mld_v2_process_group_timers(struct in6_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; - SLIST_INSERT_HEAD(&mli->mli_relinmhead, - inm, in6m_nrele); + in6m_rele_locked(inmh, inm); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifsoftc *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= PR_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; - struct in6_multi *inm, *tinm; + struct in6_multi *inm; + struct in6_multi_head inmh; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); - IN6_MULTI_LOCK_ASSERT(); + SLIST_INIT(&inmh); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. */ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. */ - SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, - in6m_nrele); + in6m_rele_locked(&inmh, inm); + ifma->ifma_protospec = NULL; /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ mbufq_drain(&inm->in6m_scq); break; } } IF_ADDR_RUNLOCK(ifp); - SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { - SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); - in6m_release_locked(inm); - } + in6m_release_list_deferred(&inmh); } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifsoftc *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifsoftc *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ M_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLd module * however makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifsoftc *mli; struct ifnet *ifp; int error; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of PR_FASTHZ). */ static int mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, const int delay) { struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link. * A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) - in6m_release_locked(inm); + in6m_release_deferred(inm); inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->in6m_scq; mbufq_drain(mq); retval = mld_v2_enqueue_group_record(mq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. */ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } mbufq_drain(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli) { int syncstates; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif syncstates = 1; CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif mld_v1_transmit_report(inm, MLD_LISTENER_DONE); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ mbufq_drain(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; syncstates = 0; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(mq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to scq %p)", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval; - IN6_MULTI_LOCK_ASSERT(); + IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); /* * Check that there are some packets queued. If so, send them first. * For large number of groups the reply to general query can take * many packets, we should finish sending them before starting of * queuing the new reply. */ if (mbufq_len(&mli->mli_gq) != 0) goto send; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&mli->mli_gq) != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } ICMP6STAT_INC(icp6s_outhist[type]); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs); m->m_pkthdr.PH_vt.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; } SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); MLD_LOCK_DESTROY(); } SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { /* This can happen if we shutdown the network stack. */ CTR1(KTR_MLD, "%s: tearing down", __func__); } VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); Index: head/sys/netinet6/mld6_var.h =================================================================== --- head/sys/netinet6/mld6_var.h (revision 333174) +++ head/sys/netinet6/mld6_var.h (revision 333175) @@ -1,179 +1,178 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NETINET6_MLD6_VAR_H_ #define _NETINET6_MLD6_VAR_H_ /* * Multicast Listener Discovery (MLD) * implementation-specific definitions. */ /* * MLD per-group states. */ #define MLD_NOT_MEMBER 0 /* Can garbage collect group */ #define MLD_SILENT_MEMBER 1 /* Do not perform MLD for group */ #define MLD_REPORTING_MEMBER 2 /* MLDv1 we are reporter */ #define MLD_IDLE_MEMBER 3 /* MLDv1 we reported last */ #define MLD_LAZY_MEMBER 4 /* MLDv1 other member reporting */ #define MLD_SLEEPING_MEMBER 5 /* MLDv1 start query response */ #define MLD_AWAKENING_MEMBER 6 /* MLDv1 group timer will start */ #define MLD_G_QUERY_PENDING_MEMBER 7 /* MLDv2 group query pending */ #define MLD_SG_QUERY_PENDING_MEMBER 8 /* MLDv2 source query pending */ #define MLD_LEAVING_MEMBER 9 /* MLDv2 dying gasp (pending last */ /* retransmission of INCLUDE {}) */ /* * MLD version tag. */ #define MLD_VERSION_NONE 0 /* Invalid */ #define MLD_VERSION_1 1 #define MLD_VERSION_2 2 /* Default */ /* * MLDv2 protocol control variables. */ #define MLD_RV_INIT 2 /* Robustness Variable */ #define MLD_RV_MIN 1 #define MLD_RV_MAX 7 #define MLD_QI_INIT 125 /* Query Interval (s) */ #define MLD_QI_MIN 1 #define MLD_QI_MAX 255 #define MLD_QRI_INIT 10 /* Query Response Interval (s) */ #define MLD_QRI_MIN 1 #define MLD_QRI_MAX 255 #define MLD_URI_INIT 3 /* Unsolicited Report Interval (s) */ #define MLD_URI_MIN 0 #define MLD_URI_MAX 10 #define MLD_MAX_GS_SOURCES 256 /* # of sources in rx GS query */ #define MLD_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ #define MLD_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ #define MLD_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ #define MLD_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ #define MLD_RESPONSE_BURST_INTERVAL (PR_FASTHZ / 2) /* 500ms */ /* * MLD-specific mbuf flags. */ #define M_MLDV1 M_PROTO1 /* Packet is MLDv1 */ #define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ /* * Leading space for MLDv2 reports inside MTU. * * NOTE: This differs from IGMPv3 significantly. KAME IPv6 requires * that a fully formed mbuf chain *without* the Router Alert option * is passed to ip6_output(), however we must account for it in the * MTU if we need to split an MLDv2 report into several packets. * * We now put the MLDv2 report header in the initial mbuf containing * the IPv6 header. */ #define MLD_MTUSPACE (sizeof(struct ip6_hdr) + sizeof(struct mld_raopt) + \ sizeof(struct icmp6_hdr)) /* * Structure returned by net.inet6.mld.ifinfo. */ struct mld_ifinfo { uint32_t mli_version; /* MLDv1 Host Compatibility Mode */ uint32_t mli_v1_timer; /* MLDv1 Querier Present timer (s) */ uint32_t mli_v2_timer; /* MLDv2 General Query (interface) timer (s)*/ uint32_t mli_flags; /* MLD per-interface flags */ #define MLIF_SILENT 0x00000001 /* Do not use MLD on this ifp */ #define MLIF_USEALLOW 0x00000002 /* Use ALLOW/BLOCK for joins/leaves */ uint32_t mli_rv; /* MLDv2 Robustness Variable */ uint32_t mli_qi; /* MLDv2 Query Interval (s) */ uint32_t mli_qri; /* MLDv2 Query Response Interval (s) */ uint32_t mli_uri; /* MLDv2 Unsolicited Report Interval (s) */ }; #ifdef _KERNEL /* * Per-link MLD state. */ struct mld_ifsoftc { LIST_ENTRY(mld_ifsoftc) mli_link; struct ifnet *mli_ifp; /* interface this instance belongs to */ uint32_t mli_version; /* MLDv1 Host Compatibility Mode */ uint32_t mli_v1_timer; /* MLDv1 Querier Present timer (s) */ uint32_t mli_v2_timer; /* MLDv2 General Query (interface) timer (s)*/ uint32_t mli_flags; /* MLD per-interface flags */ uint32_t mli_rv; /* MLDv2 Robustness Variable */ uint32_t mli_qi; /* MLDv2 Query Interval (s) */ uint32_t mli_qri; /* MLDv2 Query Response Interval (s) */ uint32_t mli_uri; /* MLDv2 Unsolicited Report Interval (s) */ - SLIST_HEAD(,in6_multi) mli_relinmhead; /* released groups */ struct mbufq mli_gq; /* queue of general query responses */ }; #define MLD_RANDOM_DELAY(X) (arc4random() % (X) + 1) #define MLD_MAX_STATE_CHANGES 24 /* Max pending changes per group */ /* * Subsystem lock macros. * The MLD lock is only taken with MLD. Currently it is system-wide. * VIMAGE: The lock could be pushed to per-VIMAGE granularity in future. */ #define MLD_LOCK_INIT() mtx_init(&mld_mtx, "mld_mtx", NULL, MTX_DEF) #define MLD_LOCK_DESTROY() mtx_destroy(&mld_mtx) #define MLD_LOCK() mtx_lock(&mld_mtx) #define MLD_LOCK_ASSERT() mtx_assert(&mld_mtx, MA_OWNED) #define MLD_UNLOCK() mtx_unlock(&mld_mtx) #define MLD_UNLOCK_ASSERT() mtx_assert(&mld_mtx, MA_NOTOWNED) /* * Per-link MLD context. */ #define MLD_IFINFO(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->mld_ifinfo) int mld_change_state(struct in6_multi *, const int); struct mld_ifsoftc * mld_domifattach(struct ifnet *); void mld_domifdetach(struct ifnet *); void mld_fasttimo(void); void mld_ifdetach(struct ifnet *); int mld_input(struct mbuf *, int, int); void mld_slowtimo(void); #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_mld); #endif #endif /* _KERNEL */ #endif /* _NETINET6_MLD6_VAR_H_ */ Index: head/sys/sys/gtaskqueue.h =================================================================== --- head/sys/sys/gtaskqueue.h (revision 333174) +++ head/sys/sys/gtaskqueue.h (revision 333175) @@ -1,110 +1,112 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Jeffrey Roberson * Copyright (c) 2016 Matthew Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_GTASKQUEUE_H_ #define _SYS_GTASKQUEUE_H_ #include #ifndef _KERNEL #error "no user-serviceable parts inside" #endif struct gtaskqueue; typedef void (*gtaskqueue_enqueue_fn)(void *context); /* * Taskqueue groups. Manages dynamic thread groups and irq binding for * device and other tasks. */ void gtaskqueue_block(struct gtaskqueue *queue); void gtaskqueue_unblock(struct gtaskqueue *queue); int gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask); void gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *task); void gtaskqueue_drain_all(struct gtaskqueue *queue); int grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *task); void taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *grptask, - void *uniq, int irq, char *name); + void *uniq, int irq, const char *name); int taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *grptask, void *uniq, int cpu, int irq, char *name); void taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask); struct taskqgroup *taskqgroup_create(char *name); void taskqgroup_destroy(struct taskqgroup *qgroup); int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride); +void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn, + const char *name); #define TASK_ENQUEUED 0x1 #define TASK_SKIP_WAKEUP 0x2 #define GTASK_INIT(task, flags, priority, func, context) do { \ (task)->ta_flags = flags; \ (task)->ta_priority = (priority); \ (task)->ta_func = (func); \ (task)->ta_context = (context); \ } while (0) #define GROUPTASK_INIT(gtask, priority, func, context) \ GTASK_INIT(&(gtask)->gt_task, TASK_SKIP_WAKEUP, priority, func, context) #define GROUPTASK_ENQUEUE(gtask) \ grouptaskqueue_enqueue((gtask)->gt_taskqueue, &(gtask)->gt_task) #define TASKQGROUP_DECLARE(name) \ extern struct taskqgroup *qgroup_##name #define TASKQGROUP_DEFINE(name, cnt, stride) \ \ struct taskqgroup *qgroup_##name; \ \ static void \ taskqgroup_define_##name(void *arg) \ { \ qgroup_##name = taskqgroup_create(#name); \ } \ \ SYSINIT(taskqgroup_##name, SI_SUB_TASKQ, SI_ORDER_FIRST, \ taskqgroup_define_##name, NULL); \ \ static void \ taskqgroup_adjust_##name(void *arg) \ { \ taskqgroup_adjust(qgroup_##name, (cnt), (stride)); \ } \ \ SYSINIT(taskqgroup_adj_##name, SI_SUB_SMP, SI_ORDER_ANY, \ taskqgroup_adjust_##name, NULL) TASKQGROUP_DECLARE(net); TASKQGROUP_DECLARE(softirq); #endif /* !_SYS_GTASKQUEUE_H_ */