Index: head/share/man/man9/timeout.9 =================================================================== --- head/share/man/man9/timeout.9 (revision 290804) +++ head/share/man/man9/timeout.9 (revision 290805) @@ -1,846 +1,846 @@ .\" $NetBSD: timeout.9,v 1.2 1996/06/23 22:32:34 pk Exp $ .\" .\" Copyright (c) 1996 The NetBSD Foundation, Inc. .\" All rights reserved. .\" .\" This code is derived from software contributed to The NetBSD Foundation .\" by Paul Kranenburg. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS .\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" .Dd September 14, 2015 .Dt TIMEOUT 9 .Os .Sh NAME .Nm callout_active , .Nm callout_deactivate , .Nm callout_async_drain , .Nm callout_drain , .Nm callout_handle_init , .Nm callout_init , .Nm callout_init_mtx , .Nm callout_init_rm , .Nm callout_init_rw , .Nm callout_pending , .Nm callout_reset , .Nm callout_reset_curcpu , .Nm callout_reset_on , .Nm callout_reset_sbt , .Nm callout_reset_sbt_curcpu , .Nm callout_reset_sbt_on , .Nm callout_schedule , .Nm callout_schedule_curcpu , .Nm callout_schedule_on , .Nm callout_schedule_sbt , .Nm callout_schedule_sbt_curcpu , .Nm callout_schedule_sbt_on , .Nm callout_stop , .Nm timeout , .Nm untimeout .Nd execute a function after a specified length of time .Sh SYNOPSIS .In sys/types.h .In sys/systm.h .Bd -literal typedef void timeout_t (void *); .Ed .Ft int .Fn callout_active "struct callout *c" .Ft void .Fn callout_deactivate "struct callout *c" .Ft int .Fn callout_async_drain "struct callout *c" "timeout_t *drain" .Ft int .Fn callout_drain "struct callout *c" .Ft void .Fn callout_handle_init "struct callout_handle *handle" .Bd -literal struct callout_handle handle = CALLOUT_HANDLE_INITIALIZER(&handle); .Ed .Ft void .Fn callout_init "struct callout *c" "int mpsafe" .Ft void .Fn callout_init_mtx "struct callout *c" "struct mtx *mtx" "int flags" .Ft void .Fn callout_init_rm "struct callout *c" "struct rmlock *rm" "int flags" .Ft void .Fn callout_init_rw "struct callout *c" "struct rwlock *rw" "int flags" .Ft int .Fn callout_pending "struct callout *c" .Ft int .Fn callout_reset "struct callout *c" "int ticks" "timeout_t *func" "void *arg" .Ft int .Fn callout_reset_curcpu "struct callout *c" "int ticks" "timeout_t *func" \ "void *arg" .Ft int .Fn callout_reset_on "struct callout *c" "int ticks" "timeout_t *func" \ "void *arg" "int cpu" .Ft int .Fn callout_reset_sbt "struct callout *c" "sbintime_t sbt" \ "sbintime_t pr" "timeout_t *func" "void *arg" "int flags" .Ft int .Fn callout_reset_sbt_curcpu "struct callout *c" "sbintime_t sbt" \ "sbintime_t pr" "timeout_t *func" "void *arg" "int flags" .Ft int .Fn callout_reset_sbt_on "struct callout *c" "sbintime_t sbt" \ "sbintime_t pr" "timeout_t *func" "void *arg" "int cpu" "int flags" .Ft int .Fn callout_schedule "struct callout *c" "int ticks" .Ft int .Fn callout_schedule_curcpu "struct callout *c" "int ticks" .Ft int .Fn callout_schedule_on "struct callout *c" "int ticks" "int cpu" .Ft int .Fn callout_schedule_sbt "struct callout *c" "sbintime_t sbt" \ "sbintime_t pr" "int flags" .Ft int .Fn callout_schedule_sbt_curcpu "struct callout *c" "sbintime_t sbt" \ "sbintime_t pr" "int flags" .Ft int .Fn callout_schedule_sbt_on "struct callout *c" "sbintime_t sbt" \ "sbintime_t pr" "int cpu" "int flags" .Ft int .Fn callout_stop "struct callout *c" .Ft struct callout_handle .Fn timeout "timeout_t *func" "void *arg" "int ticks" .Ft void .Fn untimeout "timeout_t *func" "void *arg" "struct callout_handle handle" .Sh DESCRIPTION The .Nm callout API is used to schedule a call to an arbitrary function at a specific time in the future. Consumers of this API are required to allocate a callout structure .Pq struct callout for each pending function invocation. This structure stores state about the pending function invocation including the function to be called and the time at which the function should be invoked. Pending function calls can be cancelled or rescheduled to a different time. In addition, a callout structure may be reused to schedule a new function call after a scheduled call is completed. .Pp Callouts only provide a single-shot mode. If a consumer requires a periodic timer, it must explicitly reschedule each function call. This is normally done by rescheduling the subsequent call within the called function. .Pp Callout functions must not sleep. They may not acquire sleepable locks, wait on condition variables, perform blocking allocation requests, or invoke any other action that might sleep. .Pp Each callout structure must be initialized by .Fn callout_init , .Fn callout_init_mtx , .Fn callout_init_rm , or .Fn callout_init_rw before it is passed to any of the other callout functions. The .Fn callout_init function initializes a callout structure in .Fa c that is not associated with a specific lock. If the .Fa mpsafe argument is zero, the callout structure is not considered to be .Dq multi-processor safe ; and the Giant lock will be acquired before calling the callout function and released when the callout function returns. .Pp The .Fn callout_init_mtx , .Fn callout_init_rm , and .Fn callout_init_rw functions initialize a callout structure in .Fa c that is associated with a specific lock. The lock is specified by the .Fa mtx , .Fa rm , or .Fa rw parameter. The associated lock must be held while stopping or rescheduling the callout. The callout subsystem acquires the associated lock before calling the callout function and releases it after the function returns. If the callout was cancelled while the callout subsystem waited for the associated lock, the callout function is not called, and the associated lock is released. This ensures that stopping or rescheduling the callout will abort any previously scheduled invocation. .Pp Only regular mutexes may be used with .Fn callout_init_mtx ; spin mutexes are not supported. A sleepable read-mostly lock .Po one initialized with the .Dv RM_SLEEPABLE flag .Pc may not be used with .Fn callout_init_rm . Similarly, other sleepable lock types such as .Xr sx 9 and .Xr lockmgr 9 cannot be used with callouts because sleeping is not permitted in the callout subsystem. .Pp These .Fa flags may be specified for .Fn callout_init_mtx , .Fn callout_init_rm , or .Fn callout_init_rw : .Bl -tag -width ".Dv CALLOUT_RETURNUNLOCKED" .It Dv CALLOUT_RETURNUNLOCKED The callout function will release the associated lock itself, so the callout subsystem should not attempt to unlock it after the callout function returns. .It Dv CALLOUT_SHAREDLOCK The lock is only acquired in read mode when running the callout handler. This flag is ignored by .Fn callout_init_mtx . .El .Pp The function .Fn callout_stop cancels a callout .Fa c if it is currently pending. If the callout is pending and successfuly stopped, then .Fn callout_stop returns a value of one. If the callout is not set, or has already been serviced, then negative one is returned. If the callout is currently being serviced and cannot be stopped, then zero will be returned. If the callout has an associated lock, then that lock must be held when this function is called. .Pp The function .Fn callout_async_drain is identical to .Fn callout_stop with one difference. When .Fn callout_async_drain returns zero it will arrange for the function .Fa drain to be called using the same argument given to the .Fn callout_reset function. .Fn callout_async_drain If the callout has an associated lock, then that lock must be held when this function is called. Note that when stopping multiple callouts that use the same lock it is possible to get multiple return's of zero and multiple calls to the .Fa drain function, depending upon which CPU's the callouts are running. The .Fa drain function itself is called from the context of the completing callout i.e. softclock or hardclock, just like a callout itself. p .Pp The function .Fn callout_drain is identical to .Fn callout_stop except that it will wait for the callout .Fa c to complete if it is already in progress. This function MUST NOT be called while holding any locks on which the callout might block, or deadlock will result. Note that if the callout subsystem has already begun processing this callout, then the callout function may be invoked before .Fn callout_drain returns. However, the callout subsystem does guarantee that the callout will be fully stopped before .Fn callout_drain returns. .Pp The .Fn callout_reset and .Fn callout_schedule function families schedule a future function invocation for callout .Fa c . If .Fa c already has a pending callout, it is cancelled before the new invocation is scheduled. -These functions return a non-zero value if a pending callout was cancelled +These functions return a value of one if a pending callout was cancelled and zero if there was no pending callout. If the callout has an associated lock, then that lock must be held when any of these functions are called. .Pp The time at which the callout function will be invoked is determined by either the .Fa ticks argument or the .Fa sbt , .Fa pr , and .Fa flags arguments. When .Fa ticks is used, the callout is scheduled to execute after .Fa ticks Ns No /hz seconds. Non-positive values of .Fa ticks are silently converted to the value .Sq 1 . .Pp The .Fa sbt , .Fa pr , and .Fa flags arguments provide more control over the scheduled time including support for higher resolution times, specifying the precision of the scheduled time, and setting an absolute deadline instead of a relative timeout. The callout is scheduled to execute in a time window which begins at the time specified in .Fa sbt and extends for the amount of time specified in .Fa pr . If .Fa sbt specifies a time in the past, the window is adjusted to start at the current time. A non-zero value for .Fa pr allows the callout subsystem to coalesce callouts scheduled close to each other into fewer timer interrupts, reducing processing overhead and power consumption. These .Fa flags may be specified to adjust the interpretation of .Fa sbt and .Fa pr : .Bl -tag -width ".Dv C_DIRECT_EXEC" .It Dv C_ABSOLUTE Handle the .Fa sbt argument as an absolute time since boot. By default, .Fa sbt is treated as a relative amount of time, similar to .Fa ticks . .It Dv C_DIRECT_EXEC Run the handler directly from hardware interrupt context instead of from the softclock thread. This reduces latency and overhead, but puts more constraints on the callout function. Callout functions run in this context may use only spin mutexes for locking and should be as small as possible because they run with absolute priority. .It Fn C_PREL Specifies relative event time precision as binary logarithm of time interval divided by acceptable time deviation: 1 -- 1/2, 2 -- 1/4, etc. Note that the larger of .Fa pr or this value is used as the length of the time window. Smaller values .Pq which result in larger time intervals allow the callout subsystem to aggregate more events in one timer interrupt. .It Dv C_HARDCLOCK Align the timeouts to .Fn hardclock calls if possible. .El .Pp The .Fn callout_reset functions accept a .Fa func argument which identifies the function to be called when the time expires. It must be a pointer to a function that takes a single .Fa void * argument. Upon invocation, .Fa func will receive .Fa arg as its only argument. The .Fn callout_schedule functions reuse the .Fa func and .Fa arg arguments from the previous callout. Note that one of the .Fn callout_reset functions must always be called to initialize .Fa func and .Fa arg before one of the .Fn callout_schedule functions can be used. .Pp The callout subsystem provides a softclock thread for each CPU in the system. Callouts are assigned to a single CPU and are executed by the softclock thread for that CPU. Initially, callouts are assigned to CPU 0. The .Fn callout_reset_on , .Fn callout_reset_sbt_on , .Fn callout_schedule_on and .Fn callout_schedule_sbt_on functions assign the callout to CPU .Fa cpu . The .Fn callout_reset_curcpu , .Fn callout_reset_sbt_curpu , .Fn callout_schedule_curcpu and .Fn callout_schedule_sbt_curcpu functions assign the callout to the current CPU. The .Fn callout_reset , .Fn callout_reset_sbt , .Fn callout_schedule and .Fn callout_schedule_sbt functions schedule the callout to execute in the softclock thread of the CPU to which it is currently assigned. .Pp Softclock threads are not pinned to their respective CPUs by default. The softclock thread for CPU 0 can be pinned to CPU 0 by setting the .Va kern.pin_default_swi loader tunable to a non-zero value. Softclock threads for CPUs other than zero can be pinned to their respective CPUs by setting the .Va kern.pin_pcpu_swi loader tunable to a non-zero value. .Pp The macros .Fn callout_pending , .Fn callout_active and .Fn callout_deactivate provide access to the current state of the callout. The .Fn callout_pending macro checks whether a callout is .Em pending ; a callout is considered .Em pending when a timeout has been set but the time has not yet arrived. Note that once the timeout time arrives and the callout subsystem starts to process this callout, .Fn callout_pending will return .Dv FALSE even though the callout function may not have finished .Pq or even begun executing. The .Fn callout_active macro checks whether a callout is marked as .Em active , and the .Fn callout_deactivate macro clears the callout's .Em active flag. The callout subsystem marks a callout as .Em active when a timeout is set and it clears the .Em active flag in .Fn callout_stop and .Fn callout_drain , but it .Em does not clear it when a callout expires normally via the execution of the callout function. .Ss "Avoiding Race Conditions" The callout subsystem invokes callout functions from its own thread context. Without some kind of synchronization, it is possible that a callout function will be invoked concurrently with an attempt to stop or reset the callout by another thread. In particular, since callout functions typically acquire a lock as their first action, the callout function may have already been invoked, but is blocked waiting for that lock at the time that another thread tries to reset or stop the callout. .Pp There are three main techniques for addressing these synchronization concerns. The first approach is preferred as it is the simplest: .Bl -enum -offset indent .It Callouts can be associated with a specific lock when they are initialized by .Fn callout_init_mtx , .Fn callout_init_rm , or .Fn callout_init_rw . When a callout is associated with a lock, the callout subsystem acquires the lock before the callout function is invoked. This allows the callout subsystem to transparently handle races between callout cancellation, scheduling, and execution. Note that the associated lock must be acquired before calling .Fn callout_stop or one of the .Fn callout_reset or .Fn callout_schedule functions to provide this safety. .Pp A callout initialized via .Fn callout_init with .Fa mpsafe set to zero is implicitly associated with the .Va Giant mutex. If .Va Giant is held when cancelling or rescheduling the callout, then its use will prevent races with the callout function. .It The return value from .Fn callout_stop .Po or the .Fn callout_reset and .Fn callout_schedule function families .Pc indicates whether or not the callout was removed. If it is known that the callout was set and the callout function has not yet executed, then a return value of .Dv FALSE indicates that the callout function is about to be called. For example: .Bd -literal -offset indent if (sc->sc_flags & SCFLG_CALLOUT_RUNNING) { if (callout_stop(&sc->sc_callout)) { sc->sc_flags &= ~SCFLG_CALLOUT_RUNNING; /* successfully stopped */ } else { /* * callout has expired and callout * function is about to be executed */ } } .Ed .It The .Fn callout_pending , .Fn callout_active and .Fn callout_deactivate macros can be used together to work around the race conditions. When a callout's timeout is set, the callout subsystem marks the callout as both .Em active and .Em pending . When the timeout time arrives, the callout subsystem begins processing the callout by first clearing the .Em pending flag. It then invokes the callout function without changing the .Em active flag, and does not clear the .Em active flag even after the callout function returns. The mechanism described here requires the callout function itself to clear the .Em active flag using the .Fn callout_deactivate macro. The .Fn callout_stop and .Fn callout_drain functions always clear both the .Em active and .Em pending flags before returning. .Pp The callout function should first check the .Em pending flag and return without action if .Fn callout_pending returns .Dv TRUE . This indicates that the callout was rescheduled using .Fn callout_reset just before the callout function was invoked. If .Fn callout_active returns .Dv FALSE then the callout function should also return without action. This indicates that the callout has been stopped. Finally, the callout function should call .Fn callout_deactivate to clear the .Em active flag. For example: .Bd -literal -offset indent mtx_lock(&sc->sc_mtx); if (callout_pending(&sc->sc_callout)) { /* callout was reset */ mtx_unlock(&sc->sc_mtx); return; } if (!callout_active(&sc->sc_callout)) { /* callout was stopped */ mtx_unlock(&sc->sc_mtx); return; } callout_deactivate(&sc->sc_callout); /* rest of callout function */ .Ed .Pp Together with appropriate synchronization, such as the mutex used above, this approach permits the .Fn callout_stop and .Fn callout_reset functions to be used at any time without races. For example: .Bd -literal -offset indent mtx_lock(&sc->sc_mtx); callout_stop(&sc->sc_callout); /* The callout is effectively stopped now. */ .Ed .Pp If the callout is still pending then these functions operate normally, but if processing of the callout has already begun then the tests in the callout function cause it to return without further action. Synchronization between the callout function and other code ensures that stopping or resetting the callout will never be attempted while the callout function is past the .Fn callout_deactivate call. .Pp The above technique additionally ensures that the .Em active flag always reflects whether the callout is effectively enabled or disabled. If .Fn callout_active returns false, then the callout is effectively disabled, since even if the callout subsystem is actually just about to invoke the callout function, the callout function will return without action. .El .Pp There is one final race condition that must be considered when a callout is being stopped for the last time. In this case it may not be safe to let the callout function itself detect that the callout was stopped, since it may need to access data objects that have already been destroyed or recycled. To ensure that the callout is completely finished, a call to .Fn callout_drain should be used. In particular, a callout should always be drained prior to destroying its associated lock or releasing the storage for the callout structure. .Sh LEGACY API .Bf Sy The functions below are a legacy API that will be removed in a future release. New code should not use these routines. .Ef .Pp The function .Fn timeout schedules a call to the function given by the argument .Fa func to take place after .Fa ticks Ns No /hz seconds. Non-positive values of .Fa ticks are silently converted to the value .Sq 1 . .Fa func should be a pointer to a function that takes a .Fa void * argument. Upon invocation, .Fa func will receive .Fa arg as its only argument. The return value from .Fn timeout is a .Ft struct callout_handle which can be used in conjunction with the .Fn untimeout function to request that a scheduled timeout be canceled. .Pp The function .Fn callout_handle_init can be used to initialize a handle to a state which will cause any calls to .Fn untimeout with that handle to return with no side effects. .Pp Assigning a callout handle the value of .Fn CALLOUT_HANDLE_INITIALIZER performs the same function as .Fn callout_handle_init and is provided for use on statically declared or global callout handles. .Pp The function .Fn untimeout cancels the timeout associated with .Fa handle using the .Fa func and .Fa arg arguments to validate the handle. If the handle does not correspond to a timeout with the function .Fa func taking the argument .Fa arg no action is taken. .Fa handle must be initialized by a previous call to .Fn timeout , .Fn callout_handle_init , or assigned the value of .Fn CALLOUT_HANDLE_INITIALIZER "&handle" before being passed to .Fn untimeout . The behavior of calling .Fn untimeout with an uninitialized handle is undefined. .Pp As handles are recycled by the system, it is possible (although unlikely) that a handle from one invocation of .Fn timeout may match the handle of another invocation of .Fn timeout if both calls used the same function pointer and argument, and the first timeout is expired or canceled before the second call. The timeout facility offers O(1) running time for .Fn timeout and .Fn untimeout . Timeouts are executed from .Fn softclock with the .Va Giant lock held. Thus they are protected from re-entrancy. .Sh RETURN VALUES The .Fn callout_active macro returns the state of a callout's .Em active flag. .Pp The .Fn callout_pending macro returns the state of a callout's .Em pending flag. .Pp The .Fn callout_reset and .Fn callout_schedule -function families return non-zero if the callout was pending before the new +function families return a value of one if the callout was pending before the new function invocation was scheduled. .Pp The .Fn callout_stop and .Fn callout_drain -functions return non-zero if the callout was still pending when it was -called or zero otherwise. -The +functions return a value of one if the callout was still pending when it was +called, a zero if the callout could not be stopped and a negative one is it +was either not running or haas already completed. The .Fn timeout function returns a .Ft struct callout_handle that can be passed to .Fn untimeout . .Sh HISTORY The current timeout and untimeout routines are based on the work of .An Adam M. Costello and .An George Varghese , published in a technical report entitled .%T "Redesigning the BSD Callout and Timer Facilities" and modified slightly for inclusion in .Fx by .An Justin T. Gibbs . The original work on the data structures used in this implementation was published by .An G. Varghese and .An A. Lauck in the paper .%T "Hashed and Hierarchical Timing Wheels: Data Structures for the Efficient Implementation of a Timer Facility" in the .%B "Proceedings of the 11th ACM Annual Symposium on Operating Systems Principles" . The current implementation replaces the long standing .Bx linked list callout mechanism which offered O(n) insertion and removal running time but did not generate or require handles for untimeout operations. Index: head/sys/kern/subr_taskqueue.c =================================================================== --- head/sys/kern/subr_taskqueue.c (revision 290804) +++ head/sys/kern/subr_taskqueue.c (revision 290805) @@ -1,781 +1,781 @@ /*- * Copyright (c) 2000 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues"); static void *taskqueue_giant_ih; static void *taskqueue_ih; static void taskqueue_fast_enqueue(void *); static void taskqueue_swi_enqueue(void *); static void taskqueue_swi_giant_enqueue(void *); struct taskqueue_busy { struct task *tb_running; TAILQ_ENTRY(taskqueue_busy) tb_link; }; struct task * const TB_DRAIN_WAITER = (struct task *)0x1; struct taskqueue { STAILQ_HEAD(, task) tq_queue; taskqueue_enqueue_fn tq_enqueue; void *tq_context; TAILQ_HEAD(, taskqueue_busy) tq_active; struct mtx tq_mutex; struct thread **tq_threads; int tq_tcount; int tq_spin; int tq_flags; int tq_callouts; taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS]; void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS]; }; #define TQ_FLAGS_ACTIVE (1 << 0) #define TQ_FLAGS_BLOCKED (1 << 1) #define TQ_FLAGS_UNLOCKED_ENQUEUE (1 << 2) #define DT_CALLOUT_ARMED (1 << 0) #define TQ_LOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_lock_spin(&(tq)->tq_mutex); \ else \ mtx_lock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED) #define TQ_UNLOCK(tq) \ do { \ if ((tq)->tq_spin) \ mtx_unlock_spin(&(tq)->tq_mutex); \ else \ mtx_unlock(&(tq)->tq_mutex); \ } while (0) #define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED) void _timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task, int priority, task_fn_t func, void *context) { TASK_INIT(&timeout_task->t, priority, func, context); callout_init_mtx(&timeout_task->c, &queue->tq_mutex, CALLOUT_RETURNUNLOCKED); timeout_task->q = queue; timeout_task->f = 0; } static __inline int TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm, int t) { if (tq->tq_spin) return (msleep_spin(p, m, wm, t)); return (msleep(p, m, pri, wm, t)); } static struct taskqueue * _taskqueue_create(const char *name __unused, int mflags, taskqueue_enqueue_fn enqueue, void *context, int mtxflags, const char *mtxname) { struct taskqueue *queue; queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO); if (!queue) return NULL; STAILQ_INIT(&queue->tq_queue); TAILQ_INIT(&queue->tq_active); queue->tq_enqueue = enqueue; queue->tq_context = context; queue->tq_spin = (mtxflags & MTX_SPIN) != 0; queue->tq_flags |= TQ_FLAGS_ACTIVE; if (enqueue == taskqueue_fast_enqueue || enqueue == taskqueue_swi_enqueue || enqueue == taskqueue_swi_giant_enqueue || enqueue == taskqueue_thread_enqueue) queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE; mtx_init(&queue->tq_mutex, mtxname, NULL, mtxflags); return queue; } struct taskqueue * taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _taskqueue_create(name, mflags, enqueue, context, MTX_DEF, "taskqueue"); } void taskqueue_set_callback(struct taskqueue *queue, enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback, void *context) { KASSERT(((cb_type >= TASKQUEUE_CALLBACK_TYPE_MIN) && (cb_type <= TASKQUEUE_CALLBACK_TYPE_MAX)), ("Callback type %d not valid, must be %d-%d", cb_type, TASKQUEUE_CALLBACK_TYPE_MIN, TASKQUEUE_CALLBACK_TYPE_MAX)); KASSERT((queue->tq_callbacks[cb_type] == NULL), ("Re-initialization of taskqueue callback?")); queue->tq_callbacks[cb_type] = callback; queue->tq_cb_contexts[cb_type] = context; } /* * Signal a taskqueue thread to terminate. */ static void taskqueue_terminate(struct thread **pp, struct taskqueue *tq) { while (tq->tq_tcount > 0 || tq->tq_callouts > 0) { wakeup(tq); TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0); } } void taskqueue_free(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_ACTIVE; taskqueue_terminate(queue->tq_threads, queue); KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?")); KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks")); mtx_destroy(&queue->tq_mutex); free(queue->tq_threads, M_TASKQUEUE); free(queue, M_TASKQUEUE); } static int taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task) { struct task *ins; struct task *prev; /* * Count multiple enqueues. */ if (task->ta_pending) { if (task->ta_pending < USHRT_MAX) task->ta_pending++; TQ_UNLOCK(queue); return (0); } /* * Optimise the case when all tasks have the same priority. */ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link); if (!prev || prev->ta_priority >= task->ta_priority) { STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link); } else { prev = NULL; for (ins = STAILQ_FIRST(&queue->tq_queue); ins; prev = ins, ins = STAILQ_NEXT(ins, ta_link)) if (ins->ta_priority < task->ta_priority) break; if (prev) STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link); else STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link); } task->ta_pending = 1; if ((queue->tq_flags & TQ_FLAGS_UNLOCKED_ENQUEUE) != 0) TQ_UNLOCK(queue); if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0) queue->tq_enqueue(queue->tq_context); if ((queue->tq_flags & TQ_FLAGS_UNLOCKED_ENQUEUE) == 0) TQ_UNLOCK(queue); /* Return with lock released. */ return (0); } int taskqueue_enqueue(struct taskqueue *queue, struct task *task) { int res; TQ_LOCK(queue); res = taskqueue_enqueue_locked(queue, task); /* The lock is released inside. */ return (res); } static void taskqueue_timeout_func(void *arg) { struct taskqueue *queue; struct timeout_task *timeout_task; timeout_task = arg; queue = timeout_task->q; KASSERT((timeout_task->f & DT_CALLOUT_ARMED) != 0, ("Stray timeout")); timeout_task->f &= ~DT_CALLOUT_ARMED; queue->tq_callouts--; taskqueue_enqueue_locked(timeout_task->q, &timeout_task->t); /* The lock is released inside. */ } int taskqueue_enqueue_timeout(struct taskqueue *queue, struct timeout_task *timeout_task, int ticks) { int res; TQ_LOCK(queue); KASSERT(timeout_task->q == NULL || timeout_task->q == queue, ("Migrated queue")); KASSERT(!queue->tq_spin, ("Timeout for spin-queue")); timeout_task->q = queue; res = timeout_task->t.ta_pending; if (ticks == 0) { taskqueue_enqueue_locked(queue, &timeout_task->t); /* The lock is released inside. */ } else { if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) { res++; } else { queue->tq_callouts++; timeout_task->f |= DT_CALLOUT_ARMED; if (ticks < 0) ticks = -ticks; /* Ignore overflow. */ } if (ticks > 0) { callout_reset(&timeout_task->c, ticks, taskqueue_timeout_func, timeout_task); } TQ_UNLOCK(queue); } return (res); } static void taskqueue_task_nop_fn(void *context, int pending) { } /* * Block until all currently queued tasks in this taskqueue * have begun execution. Tasks queued during execution of * this function are ignored. */ static void taskqueue_drain_tq_queue(struct taskqueue *queue) { struct task t_barrier; if (STAILQ_EMPTY(&queue->tq_queue)) return; /* * Enqueue our barrier after all current tasks, but with * the highest priority so that newly queued tasks cannot * pass it. Because of the high priority, we can not use * taskqueue_enqueue_locked directly (which drops the lock * anyway) so just insert it at tail while we have the * queue lock. */ TASK_INIT(&t_barrier, USHRT_MAX, taskqueue_task_nop_fn, &t_barrier); STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link); t_barrier.ta_pending = 1; /* * Once the barrier has executed, all previously queued tasks * have completed or are currently executing. */ while (t_barrier.ta_pending != 0) TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0); } /* * Block until all currently executing tasks for this taskqueue * complete. Tasks that begin execution during the execution * of this function are ignored. */ static void taskqueue_drain_tq_active(struct taskqueue *queue) { struct taskqueue_busy tb_marker, *tb_first; if (TAILQ_EMPTY(&queue->tq_active)) return; /* Block taskq_terminate().*/ queue->tq_callouts++; /* * Wait for all currently executing taskqueue threads * to go idle. */ tb_marker.tb_running = TB_DRAIN_WAITER; TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link); while (TAILQ_FIRST(&queue->tq_active) != &tb_marker) TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0); TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link); /* * Wakeup any other drain waiter that happened to queue up * without any intervening active thread. */ tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); /* Release taskqueue_terminate(). */ queue->tq_callouts--; if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0) wakeup_one(queue->tq_threads); } void taskqueue_block(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags |= TQ_FLAGS_BLOCKED; TQ_UNLOCK(queue); } void taskqueue_unblock(struct taskqueue *queue) { TQ_LOCK(queue); queue->tq_flags &= ~TQ_FLAGS_BLOCKED; if (!STAILQ_EMPTY(&queue->tq_queue)) queue->tq_enqueue(queue->tq_context); TQ_UNLOCK(queue); } static void taskqueue_run_locked(struct taskqueue *queue) { struct taskqueue_busy tb; struct taskqueue_busy *tb_first; struct task *task; int pending; TQ_ASSERT_LOCKED(queue); tb.tb_running = NULL; while (STAILQ_FIRST(&queue->tq_queue)) { TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link); /* * Carefully remove the first task from the queue and * zero its pending count. */ task = STAILQ_FIRST(&queue->tq_queue); STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link); pending = task->ta_pending; task->ta_pending = 0; tb.tb_running = task; TQ_UNLOCK(queue); task->ta_func(task->ta_context, pending); TQ_LOCK(queue); tb.tb_running = NULL; wakeup(task); TAILQ_REMOVE(&queue->tq_active, &tb, tb_link); tb_first = TAILQ_FIRST(&queue->tq_active); if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER) wakeup(tb_first); } } void taskqueue_run(struct taskqueue *queue) { TQ_LOCK(queue); taskqueue_run_locked(queue); TQ_UNLOCK(queue); } static int task_is_running(struct taskqueue *queue, struct task *task) { struct taskqueue_busy *tb; TQ_ASSERT_LOCKED(queue); TAILQ_FOREACH(tb, &queue->tq_active, tb_link) { if (tb->tb_running == task) return (1); } return (0); } static int taskqueue_cancel_locked(struct taskqueue *queue, struct task *task, u_int *pendp) { if (task->ta_pending > 0) STAILQ_REMOVE(&queue->tq_queue, task, task, ta_link); if (pendp != NULL) *pendp = task->ta_pending; task->ta_pending = 0; return (task_is_running(queue, task) ? EBUSY : 0); } int taskqueue_cancel(struct taskqueue *queue, struct task *task, u_int *pendp) { int error; TQ_LOCK(queue); error = taskqueue_cancel_locked(queue, task, pendp); TQ_UNLOCK(queue); return (error); } int taskqueue_cancel_timeout(struct taskqueue *queue, struct timeout_task *timeout_task, u_int *pendp) { u_int pending, pending1; int error; TQ_LOCK(queue); - pending = !!callout_stop(&timeout_task->c); + pending = !!(callout_stop(&timeout_task->c) > 0); error = taskqueue_cancel_locked(queue, &timeout_task->t, &pending1); if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) { timeout_task->f &= ~DT_CALLOUT_ARMED; queue->tq_callouts--; } TQ_UNLOCK(queue); if (pendp != NULL) *pendp = pending + pending1; return (error); } void taskqueue_drain(struct taskqueue *queue, struct task *task) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); while (task->ta_pending != 0 || task_is_running(queue, task)) TQ_SLEEP(queue, task, &queue->tq_mutex, PWAIT, "-", 0); TQ_UNLOCK(queue); } void taskqueue_drain_all(struct taskqueue *queue) { if (!queue->tq_spin) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__); TQ_LOCK(queue); taskqueue_drain_tq_queue(queue); taskqueue_drain_tq_active(queue); TQ_UNLOCK(queue); } void taskqueue_drain_timeout(struct taskqueue *queue, struct timeout_task *timeout_task) { callout_drain(&timeout_task->c); taskqueue_drain(queue, &timeout_task->t); } static void taskqueue_swi_enqueue(void *context) { swi_sched(taskqueue_ih, 0); } static void taskqueue_swi_run(void *dummy) { taskqueue_run(taskqueue_swi); } static void taskqueue_swi_giant_enqueue(void *context) { swi_sched(taskqueue_giant_ih, 0); } static void taskqueue_swi_giant_run(void *dummy) { taskqueue_run(taskqueue_swi_giant); } static int _taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, va_list ap) { char ktname[MAXCOMLEN + 1]; struct thread *td; struct taskqueue *tq; int i, error; if (count <= 0) return (EINVAL); vsnprintf(ktname, sizeof(ktname), name, ap); tq = *tqp; tq->tq_threads = malloc(sizeof(struct thread *) * count, M_TASKQUEUE, M_NOWAIT | M_ZERO); if (tq->tq_threads == NULL) { printf("%s: no memory for %s threads\n", __func__, ktname); return (ENOMEM); } for (i = 0; i < count; i++) { if (count == 1) error = kthread_add(taskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname); else error = kthread_add(taskqueue_thread_loop, tqp, NULL, &tq->tq_threads[i], RFSTOPPED, 0, "%s_%d", ktname, i); if (error) { /* should be ok to continue, taskqueue_free will dtrt */ printf("%s: kthread_add(%s): error %d", __func__, ktname, error); tq->tq_threads[i] = NULL; /* paranoid */ } else tq->tq_tcount++; } for (i = 0; i < count; i++) { if (tq->tq_threads[i] == NULL) continue; td = tq->tq_threads[i]; if (mask) { error = cpuset_setthread(td->td_tid, mask); /* * Failing to pin is rarely an actual fatal error; * it'll just affect performance. */ if (error) printf("%s: curthread=%llu: can't pin; " "error=%d\n", __func__, (unsigned long long) td->td_tid, error); } thread_lock(td); sched_prio(td, pri); sched_add(td, SRQ_BORING); thread_unlock(td); } return (0); } int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _taskqueue_start_threads(tqp, count, pri, NULL, name, ap); va_end(ap); return (error); } int taskqueue_start_threads_cpuset(struct taskqueue **tqp, int count, int pri, cpuset_t *mask, const char *name, ...) { va_list ap; int error; va_start(ap, name); error = _taskqueue_start_threads(tqp, count, pri, mask, name, ap); va_end(ap); return (error); } static inline void taskqueue_run_callback(struct taskqueue *tq, enum taskqueue_callback_type cb_type) { taskqueue_callback_fn tq_callback; TQ_ASSERT_UNLOCKED(tq); tq_callback = tq->tq_callbacks[cb_type]; if (tq_callback != NULL) tq_callback(tq->tq_cb_contexts[cb_type]); } void taskqueue_thread_loop(void *arg) { struct taskqueue **tqp, *tq; tqp = arg; tq = *tqp; taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT); TQ_LOCK(tq); while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) { taskqueue_run_locked(tq); /* * Because taskqueue_run() can drop tq_mutex, we need to * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the * meantime, which means we missed a wakeup. */ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0) break; TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0); } taskqueue_run_locked(tq); /* * This thread is on its way out, so just drop the lock temporarily * in order to call the shutdown callback. This allows the callback * to look at the taskqueue, even just before it dies. */ TQ_UNLOCK(tq); taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN); TQ_LOCK(tq); /* rendezvous with thread that asked us to terminate */ tq->tq_tcount--; wakeup_one(tq->tq_threads); TQ_UNLOCK(tq); kthread_exit(); } void taskqueue_thread_enqueue(void *context) { struct taskqueue **tqp, *tq; tqp = context; tq = *tqp; wakeup_one(tq); } TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL, swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ, INTR_MPSAFE, &taskqueue_ih)); TASKQUEUE_DEFINE(swi_giant, taskqueue_swi_giant_enqueue, NULL, swi_add(NULL, "Giant taskq", taskqueue_swi_giant_run, NULL, SWI_TQ_GIANT, 0, &taskqueue_giant_ih)); TASKQUEUE_DEFINE_THREAD(thread); struct taskqueue * taskqueue_create_fast(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { return _taskqueue_create(name, mflags, enqueue, context, MTX_SPIN, "fast_taskqueue"); } /* NB: for backwards compatibility */ int taskqueue_enqueue_fast(struct taskqueue *queue, struct task *task) { return taskqueue_enqueue(queue, task); } static void *taskqueue_fast_ih; static void taskqueue_fast_enqueue(void *context) { swi_sched(taskqueue_fast_ih, 0); } static void taskqueue_fast_run(void *dummy) { taskqueue_run(taskqueue_fast); } TASKQUEUE_FAST_DEFINE(fast, taskqueue_fast_enqueue, NULL, swi_add(NULL, "fast taskq", taskqueue_fast_run, NULL, SWI_TQ_FAST, INTR_MPSAFE, &taskqueue_fast_ih)); int taskqueue_member(struct taskqueue *queue, struct thread *td) { int i, j, ret = 0; for (i = 0, j = 0; ; i++) { if (queue->tq_threads[i] == NULL) continue; if (queue->tq_threads[i] == td) { ret = 1; break; } if (++j >= queue->tq_tcount) break; } return (ret); } Index: head/sys/net/if_llatbl.c =================================================================== --- head/sys/net/if_llatbl.c (revision 290804) +++ head/sys/net/if_llatbl.c (revision 290805) @@ -1,839 +1,839 @@ /* * Copyright (c) 2004 Luigi Rizzo, Alessandro Cerri. All rights reserved. * Copyright (c) 2004-2008 Qing Li. All rights reserved. * Copyright (c) 2008 Kip Macy. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_LLTABLE, "lltable", "link level address tables"); static VNET_DEFINE(SLIST_HEAD(, lltable), lltables); #define V_lltables VNET(lltables) static void vnet_lltable_init(void); struct rwlock lltable_rwlock; RW_SYSINIT(lltable_rwlock, &lltable_rwlock, "lltable_rwlock"); static void lltable_unlink(struct lltable *llt); static void llentries_unlink(struct lltable *llt, struct llentries *head); static void htable_unlink_entry(struct llentry *lle); static void htable_link_entry(struct lltable *llt, struct llentry *lle); static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg); /* * Dump lle state for a specific address family. */ static int lltable_dump_af(struct lltable *llt, struct sysctl_req *wr) { int error; LLTABLE_LOCK_ASSERT(); if (llt->llt_ifp->if_flags & IFF_LOOPBACK) return (0); error = 0; IF_AFDATA_RLOCK(llt->llt_ifp); error = lltable_foreach_lle(llt, (llt_foreach_cb_t *)llt->llt_dump_entry, wr); IF_AFDATA_RUNLOCK(llt->llt_ifp); return (error); } /* * Dump arp state for a specific address family. */ int lltable_sysctl_dumparp(int af, struct sysctl_req *wr) { struct lltable *llt; int error = 0; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af == af) { error = lltable_dump_af(llt, wr); if (error != 0) goto done; } } done: LLTABLE_RUNLOCK(); return (error); } /* * Common function helpers for chained hash table. */ /* * Runs specified callback for each entry in @llt. * Caller does the locking. * */ static int htable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { struct llentry *lle, *next; int i, error; error = 0; for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { error = f(llt, lle, farg); if (error != 0) break; } } return (error); } static void htable_link_entry(struct lltable *llt, struct llentry *lle) { struct llentries *lleh; uint32_t hashidx; if ((lle->la_flags & LLE_LINKED) != 0) return; IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); hashidx = llt->llt_hash(lle, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); } static void htable_unlink_entry(struct llentry *lle) { if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(lle->lle_tbl->llt_ifp); LIST_REMOVE(lle, lle_next); lle->la_flags &= ~(LLE_VALID | LLE_LINKED); #if 0 lle->lle_tbl = NULL; lle->lle_head = NULL; #endif } } struct prefix_match_data { const struct sockaddr *addr; const struct sockaddr *mask; struct llentries dchain; u_int flags; }; static int htable_prefix_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct prefix_match_data *pmd; pmd = (struct prefix_match_data *)farg; if (llt->llt_match_prefix(pmd->addr, pmd->mask, pmd->flags, lle)) { LLE_WLOCK(lle); LIST_INSERT_HEAD(&pmd->dchain, lle, lle_chain); } return (0); } static void htable_prefix_free(struct lltable *llt, const struct sockaddr *addr, const struct sockaddr *mask, u_int flags) { struct llentry *lle, *next; struct prefix_match_data pmd; bzero(&pmd, sizeof(pmd)); pmd.addr = addr; pmd.mask = mask; pmd.flags = flags; LIST_INIT(&pmd.dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push matching lles to chain */ lltable_foreach_lle(llt, htable_prefix_free_cb, &pmd); llentries_unlink(llt, &pmd.dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &pmd.dchain, lle_chain, next) lltable_free_entry(llt, lle); } static void htable_free_tbl(struct lltable *llt) { free(llt->lle_head, M_LLTABLE); free(llt, M_LLTABLE); } static void llentries_unlink(struct lltable *llt, struct llentries *head) { struct llentry *lle, *next; LIST_FOREACH_SAFE(lle, head, lle_chain, next) llt->llt_unlink_entry(lle); } /* * Helper function used to drop all mbufs in hold queue. * * Returns the number of held packets, if any, that were dropped. */ size_t lltable_drop_entry_queue(struct llentry *lle) { size_t pkts_dropped; struct mbuf *next; LLE_WLOCK_ASSERT(lle); pkts_dropped = 0; while ((lle->la_numheld > 0) && (lle->la_hold != NULL)) { next = lle->la_hold->m_nextpkt; m_freem(lle->la_hold); lle->la_hold = next; lle->la_numheld--; pkts_dropped++; } KASSERT(lle->la_numheld == 0, ("%s: la_numheld %d > 0, pkts_droped %zd", __func__, lle->la_numheld, pkts_dropped)); return (pkts_dropped); } void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, const char *lladdr) { bcopy(lladdr, &lle->ll_addr, ifp->if_addrlen); lle->la_flags |= LLE_VALID; } /* * * Performes generic cleanup routines and frees lle. * * Called for non-linked entries, with callouts and * other AF-specific cleanups performed. * * @lle must be passed WLOCK'ed * * Returns the number of held packets, if any, that were dropped. */ size_t llentry_free(struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT((lle->la_flags & LLE_LINKED) == 0, ("freeing linked lle")); pkts_dropped = lltable_drop_entry_queue(lle); LLE_FREE_LOCKED(lle); return (pkts_dropped); } /* * (al)locate an llentry for address dst (equivalent to rtalloc for new-arp). * * If found the llentry * is returned referenced and unlocked. */ struct llentry * llentry_alloc(struct ifnet *ifp, struct lltable *lt, struct sockaddr_storage *dst) { struct llentry *la, *la_tmp; IF_AFDATA_RLOCK(ifp); la = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); IF_AFDATA_RUNLOCK(ifp); if (la != NULL) { LLE_ADDREF(la); LLE_WUNLOCK(la); return (la); } if ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { la = lltable_alloc_entry(lt, 0, (struct sockaddr *)dst); if (la == NULL) return (NULL); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); /* Prefer any existing LLE over newly-created one */ la_tmp = lla_lookup(lt, LLE_EXCLUSIVE, (struct sockaddr *)dst); if (la_tmp == NULL) lltable_link_entry(lt, la); IF_AFDATA_WUNLOCK(ifp); if (la_tmp != NULL) { lltable_free_entry(lt, la); la = la_tmp; } LLE_ADDREF(la); LLE_WUNLOCK(la); } return (la); } /* * Free all entries from given table and free itself. */ static int lltable_free_cb(struct lltable *llt, struct llentry *lle, void *farg) { struct llentries *dchain; dchain = (struct llentries *)farg; LLE_WLOCK(lle); LIST_INSERT_HEAD(dchain, lle, lle_chain); return (0); } /* * Free all entries from given table and free itself. */ void lltable_free(struct lltable *llt) { struct llentry *lle, *next; struct llentries dchain; KASSERT(llt != NULL, ("%s: llt is NULL", __func__)); lltable_unlink(llt); LIST_INIT(&dchain); IF_AFDATA_WLOCK(llt->llt_ifp); /* Push all lles to @dchain */ lltable_foreach_lle(llt, lltable_free_cb, &dchain); llentries_unlink(llt, &dchain); IF_AFDATA_WUNLOCK(llt->llt_ifp); LIST_FOREACH_SAFE(lle, &dchain, lle_chain, next) { - if (callout_stop(&lle->lle_timer)) + if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } llt->llt_free_tbl(llt); } #if 0 void lltable_drain(int af) { struct lltable *llt; struct llentry *lle; register int i; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af != af) continue; for (i=0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { LLE_WLOCK(lle); if (lle->la_hold) { m_freem(lle->la_hold); lle->la_hold = NULL; } LLE_WUNLOCK(lle); } } } LLTABLE_RUNLOCK(); } #endif /* * Deletes an address from given lltable. * Used for userland interaction to remove * individual entries. Skips entries added by OS. */ int lltable_delete_addr(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { struct llentry *lle; struct ifnet *ifp; ifp = llt->llt_ifp; IF_AFDATA_WLOCK(ifp); lle = lla_lookup(llt, LLE_EXCLUSIVE, l3addr); if (lle == NULL) { IF_AFDATA_WUNLOCK(ifp); return (ENOENT); } if ((lle->la_flags & LLE_IFADDR) != 0 && (flags & LLE_IFADDR) == 0) { IF_AFDATA_WUNLOCK(ifp); LLE_WUNLOCK(lle); return (EPERM); } lltable_unlink_entry(llt, lle); IF_AFDATA_WUNLOCK(ifp); llt->llt_delete_entry(llt, lle); return (0); } void lltable_prefix_free(int af, struct sockaddr *addr, struct sockaddr *mask, u_int flags) { struct lltable *llt; LLTABLE_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af != af) continue; llt->llt_prefix_free(llt, addr, mask, flags); } LLTABLE_RUNLOCK(); } struct lltable * lltable_allocate_htbl(uint32_t hsize) { struct lltable *llt; int i; llt = malloc(sizeof(struct lltable), M_LLTABLE, M_WAITOK | M_ZERO); llt->llt_hsize = hsize; llt->lle_head = malloc(sizeof(struct llentries) * hsize, M_LLTABLE, M_WAITOK | M_ZERO); for (i = 0; i < llt->llt_hsize; i++) LIST_INIT(&llt->lle_head[i]); /* Set some default callbacks */ llt->llt_link_entry = htable_link_entry; llt->llt_unlink_entry = htable_unlink_entry; llt->llt_prefix_free = htable_prefix_free; llt->llt_foreach_entry = htable_foreach_lle; llt->llt_free_tbl = htable_free_tbl; return (llt); } /* * Links lltable to global llt list. */ void lltable_link(struct lltable *llt) { LLTABLE_WLOCK(); SLIST_INSERT_HEAD(&V_lltables, llt, llt_link); LLTABLE_WUNLOCK(); } static void lltable_unlink(struct lltable *llt) { LLTABLE_WLOCK(); SLIST_REMOVE(&V_lltables, llt, lltable, llt_link); LLTABLE_WUNLOCK(); } /* * External methods used by lltable consumers */ int lltable_foreach_lle(struct lltable *llt, llt_foreach_cb_t *f, void *farg) { return (llt->llt_foreach_entry(llt, f, farg)); } struct llentry * lltable_alloc_entry(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { return (llt->llt_alloc_entry(llt, flags, l3addr)); } void lltable_free_entry(struct lltable *llt, struct llentry *lle) { llt->llt_free_entry(llt, lle); } void lltable_link_entry(struct lltable *llt, struct llentry *lle) { llt->llt_link_entry(llt, lle); } void lltable_unlink_entry(struct lltable *llt, struct llentry *lle) { llt->llt_unlink_entry(lle); } void lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct lltable *llt; llt = lle->lle_tbl; llt->llt_fill_sa_entry(lle, sa); } struct ifnet * lltable_get_ifp(const struct lltable *llt) { return (llt->llt_ifp); } int lltable_get_af(const struct lltable *llt) { return (llt->llt_af); } /* * Called in route_output when rtm_flags contains RTF_LLDATA. */ int lla_rt_output(struct rt_msghdr *rtm, struct rt_addrinfo *info) { struct sockaddr_dl *dl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY]; struct sockaddr *dst = (struct sockaddr *)info->rti_info[RTAX_DST]; struct ifnet *ifp; struct lltable *llt; struct llentry *lle, *lle_tmp; u_int laflags = 0; int error; KASSERT(dl != NULL && dl->sdl_family == AF_LINK, ("%s: invalid dl\n", __func__)); ifp = ifnet_byindex(dl->sdl_index); if (ifp == NULL) { log(LOG_INFO, "%s: invalid ifp (sdl_index %d)\n", __func__, dl->sdl_index); return EINVAL; } /* XXX linked list may be too expensive */ LLTABLE_RLOCK(); SLIST_FOREACH(llt, &V_lltables, llt_link) { if (llt->llt_af == dst->sa_family && llt->llt_ifp == ifp) break; } LLTABLE_RUNLOCK(); KASSERT(llt != NULL, ("Yep, ugly hacks are bad\n")); error = 0; switch (rtm->rtm_type) { case RTM_ADD: /* Add static LLE */ laflags = 0; if (rtm->rtm_rmx.rmx_expire == 0) laflags = LLE_STATIC; lle = lltable_alloc_entry(llt, laflags, dst); if (lle == NULL) return (ENOMEM); bcopy(LLADDR(dl), &lle->ll_addr, ifp->if_addrlen); if ((rtm->rtm_flags & RTF_ANNOUNCE)) lle->la_flags |= LLE_PUB; lle->la_flags |= LLE_VALID; lle->la_expire = rtm->rtm_rmx.rmx_expire; laflags = lle->la_flags; /* Try to link new entry */ lle_tmp = NULL; IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, dst); if (lle_tmp != NULL) { /* Check if we are trying to replace immutable entry */ if ((lle_tmp->la_flags & LLE_IFADDR) != 0) { IF_AFDATA_WUNLOCK(ifp); LLE_WUNLOCK(lle_tmp); lltable_free_entry(llt, lle); return (EPERM); } /* Unlink existing entry from table */ lltable_unlink_entry(llt, lle_tmp); } lltable_link_entry(llt, lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { EVENTHANDLER_INVOKE(lle_event, lle_tmp,LLENTRY_EXPIRED); lltable_free_entry(llt, lle_tmp); } /* * By invoking LLE handler here we might get * two events on static LLE entry insertion * in routing socket. However, since we might have * other subscribers we need to generate this event. */ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); #ifdef INET /* gratuitous ARP */ if ((laflags & LLE_PUB) && dst->sa_family == AF_INET) arprequest(ifp, &((struct sockaddr_in *)dst)->sin_addr, &((struct sockaddr_in *)dst)->sin_addr, (u_char *)LLADDR(dl)); #endif break; case RTM_DELETE: return (lltable_delete_addr(llt, 0, dst)); default: error = EINVAL; } return (error); } static void vnet_lltable_init() { SLIST_INIT(&V_lltables); } VNET_SYSINIT(vnet_lltable_init, SI_SUB_PSEUDO, SI_ORDER_FIRST, vnet_lltable_init, NULL); #ifdef DDB struct llentry_sa { struct llentry base; struct sockaddr l3_addr; }; static void llatbl_lle_show(struct llentry_sa *la) { struct llentry *lle; uint8_t octet[6]; lle = &la->base; db_printf("lle=%p\n", lle); db_printf(" lle_next=%p\n", lle->lle_next.le_next); db_printf(" lle_lock=%p\n", &lle->lle_lock); db_printf(" lle_tbl=%p\n", lle->lle_tbl); db_printf(" lle_head=%p\n", lle->lle_head); db_printf(" la_hold=%p\n", lle->la_hold); db_printf(" la_numheld=%d\n", lle->la_numheld); db_printf(" la_expire=%ju\n", (uintmax_t)lle->la_expire); db_printf(" la_flags=0x%04x\n", lle->la_flags); db_printf(" la_asked=%u\n", lle->la_asked); db_printf(" la_preempt=%u\n", lle->la_preempt); db_printf(" ln_state=%d\n", lle->ln_state); db_printf(" ln_router=%u\n", lle->ln_router); db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); bcopy(&lle->ll_addr.mac16, octet, sizeof(octet)); db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); db_printf(" lle_timer=%p\n", &lle->lle_timer); switch (la->l3_addr.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; char l3s[INET_ADDRSTRLEN]; sin = (struct sockaddr_in *)&la->l3_addr; inet_ntoa_r(sin->sin_addr, l3s); db_printf(" l3_addr=%s\n", l3s); break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; char l3s[INET6_ADDRSTRLEN]; sin6 = (struct sockaddr_in6 *)&la->l3_addr; ip6_sprintf(l3s, &sin6->sin6_addr); db_printf(" l3_addr=%s\n", l3s); break; } #endif default: db_printf(" l3_addr=N/A (af=%d)\n", la->l3_addr.sa_family); break; } } DB_SHOW_COMMAND(llentry, db_show_llentry) { if (!have_addr) { db_printf("usage: show llentry \n"); return; } llatbl_lle_show((struct llentry_sa *)addr); } static void llatbl_llt_show(struct lltable *llt) { int i; struct llentry *lle; db_printf("llt=%p llt_af=%d llt_ifp=%p\n", llt, llt->llt_af, llt->llt_ifp); for (i = 0; i < llt->llt_hsize; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { llatbl_lle_show((struct llentry_sa *)lle); if (db_pager_quit) return; } } } DB_SHOW_COMMAND(lltable, db_show_lltable) { if (!have_addr) { db_printf("usage: show lltable \n"); return; } llatbl_llt_show((struct lltable *)addr); } DB_SHOW_ALL_COMMAND(lltables, db_show_all_lltables) { VNET_ITERATOR_DECL(vnet_iter); struct lltable *llt; VNET_FOREACH(vnet_iter) { CURVNET_SET_QUIET(vnet_iter); #ifdef VIMAGE db_printf("vnet=%p\n", curvnet); #endif SLIST_FOREACH(llt, &V_lltables, llt_link) { db_printf("llt=%p llt_af=%d llt_ifp=%p(%s)\n", llt, llt->llt_af, llt->llt_ifp, (llt->llt_ifp != NULL) ? llt->llt_ifp->if_xname : "?"); if (have_addr && addr != 0) /* verbose */ llatbl_llt_show(llt); if (db_pager_quit) { CURVNET_RESTORE(); return; } } CURVNET_RESTORE(); } } #endif Index: head/sys/netinet/in.c =================================================================== --- head/sys/netinet/in.c (revision 290804) +++ head/sys/netinet/in.c (revision 290805) @@ -1,1402 +1,1402 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { struct rm_priotracker in_ifa_tracker; register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { struct rm_priotracker in_ifa_tracker; in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (it); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { register u_long i = ntohl(in.s_addr); register u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { register char *cplim = (char *) &ap->sin_addr; register char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); IN_MULTI_LOCK(); if (ii->ii_allhosts) { (void)in_leavegroup_locked(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } IN_MULTI_UNLOCK(); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; /* * XXXME: add fib-aware in_localip. * We definitely don't want to switch between * prefixes in different fibs. */ eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; u_long t; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match * with a broadcast address. */ #define ia ((struct in_ifaddr *)ifa) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && t == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff) return (1); return (0); #undef ia } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in_multi) purgeinms; struct in_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN_MULTI_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, inm_link); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { LIST_REMOVE(inm, inm_link); inm_release_locked(inm); } igmp_ifdetach(ifp); IN_MULTI_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } /* cancel timer */ - if (callout_stop(&lle->lle_timer)) + if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rtentry *rt; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* XXX rtalloc1_fib should take a const param */ rt = rtalloc1_fib(__DECONST(struct sockaddr *, l3addr), 0, 0, ifp->if_fib); if (rt == NULL) return (EINVAL); /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt->rt_flags & RTF_GATEWAY) { if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp || rt->rt_ifp->if_type != IFT_ETHER || (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { RTFREE_LOCKED(rt); return (EINVAL); } } /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) { const char *sa, *mask, *addr, *lim; int len; mask = (const char *)rt_mask(rt); /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if (mask == NULL) { RTFREE_LOCKED(rt); return (EINVAL); } sa = (const char *)rt_key(rt); addr = (const char *)l3addr; len = ((const struct sockaddr_in *)l3addr)->sin_len; lim = addr + len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); #endif RTFREE_LOCKED(rt); return (EINVAL); } } } RTFREE_LOCKED(rt); return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { lltable_set_entry_addr(ifp, lle, IF_LLADDR(ifp)); lle->la_flags |= LLE_STATIC; } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; lltable_link(llt); return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: head/sys/netinet/tcp_timer.c =================================================================== --- head/sys/netinet/tcp_timer.c (revision 290804) +++ head/sys/netinet/tcp_timer.c (revision 290805) @@ -1,994 +1,994 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); static int always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); #define V_tcp_pmtud_blackhole_activated \ VNET(tcp_pmtud_blackhole_activated) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_activated), 0, "Path MTU Discovery Black Hole Detection, Activation Count"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); #define V_tcp_pmtud_blackhole_activated_min_mss \ VNET(tcp_pmtud_blackhole_activated_min_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_failed), 0, "Path MTU Discovery Black Hole Detection, Failure Count"); #ifdef INET static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif #ifdef RSS static int per_cpu_timers = 1; #else static int per_cpu_timers = 0; #endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); #if 0 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) #endif /* * Map the given inp to a CPU id. * * This queries RSS if it's compiled in, else it defaults to the current * CPU ID. */ static inline int inp_to_cpuid(struct inpcb *inp) { u_int cpuid; #ifdef RSS if (per_cpu_timers) { cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (curcpu); /* XXX */ else return (cpuid); } #else /* Legacy, pre-RSS behaviour */ if (per_cpu_timers) { /* * We don't have a flowid -> cpuid mapping, so cheat and * just map unknown cpuids to curcpu. Not the best, but * apparently better than defaulting to swi 0. */ cpuid = inp->inp_flowid % (mp_maxid + 1); if (! CPU_ABSENT(cpuid)) return (cpuid); return (curcpu); } #endif /* Default for RSS and non-RSS - cpuid 0 */ else { return (0); } } /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. */ void tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, ("%s: tp %p delack callout should be running", __func__, tp)); tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); (void) tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, ("%s: tp %p 2msl callout should be running", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. * * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); tp = tcp_close(tp); } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { if (!callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { tp->t_timers->tt_flags &= ~TT_2MSL_RST; } } else tp = tcp_close(tp); } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, ("%s: tp %p keep callout should be running", __func__, tp)); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. */ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, ("%s: tp %p persist callout should be running", __func__, tp)); /* * Persistance timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, ("%s: tp %p rexmt callout should be running", __func__, tp)); tcp_free_sackholes(tp); /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); headlocked = 1; goto out; } INP_INFO_RUNLOCK(&V_tcbinfo); headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if (tp->t_state == TCPS_SYN_SENT) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { int optlen; #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. */ /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ optlen = tp->t_maxopd - tp->t_maxseg; tp->t_pmtud_saved_maxopd = tp->t_maxopd; /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else if (isipv6) { /* Use the default MSS. */ tp->t_maxopd = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxopd = V_tcp_pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else { /* Use the default MSS. */ tp->t_maxopd = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif tp->t_maxseg = tp->t_maxopd - optlen; /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. * The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift > 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; optlen = tp->t_maxopd - tp->t_maxseg; tp->t_maxopd = tp->t_pmtud_saved_maxopd; tp->t_maxseg = tp->t_maxopd - optlen; V_tcp_pmtud_blackhole_failed++; /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); #endif tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tcp_output(tp); out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); if (headlocked) INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); uint32_t f_reset; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; f_reset = TT_2MSL_RST; break; default: panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { if ((tp->t_timers->tt_flags & timer_type) && - callout_stop(t_callout) && + (callout_stop(t_callout) > 0) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } } else { if ((tp->t_timers->tt_flags & timer_type) == 0) { tp->t_timers->tt_flags |= (timer_type | f_reset); callout_reset_on(t_callout, delta, f_callout, tp, cpu); } else { /* Reset already running callout on the same CPU. */ if (!callout_reset(t_callout, delta, f_callout, tp)) { /* * Callout not cancelled, consider it as not * properly restarted. */ tp->t_timers->tt_flags &= ~f_reset; } } } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; timeout_t *f_callout; uint32_t f_reset; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack_discard; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt_discard; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist_discard; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep_discard; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl_discard; f_reset = TT_2MSL_RST; break; default: panic("tp %p bad timer_type %#x", tp, timer_type); } if (tp->t_timers->tt_flags & timer_type) { - if (callout_stop(t_callout) && + if ((callout_stop(t_callout) > 0) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } else { /* * Can't stop the callout, defer tcpcb actual deletion * to the last tcp timer discard callout. * The TT_STOPPED flag will ensure that no tcp timer * callouts can be restarted on our behalf, and * past this point currently running callouts waiting * on inp lock will return right away after the * classical check for callout reset/stop events: * callout_pending() || !callout_active() */ callout_reset(t_callout, 1, f_callout, tp); } } } #define ticks_to_msecs(t) (1000*(t) / hz) void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) { sbintime_t now; bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; now = getsbinuptime(); if (callout_active(&timer->tt_delack)) xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } Index: head/sys/netinet6/in6.c =================================================================== --- head/sys/netinet6/in6.c (revision 290804) +++ head/sys/netinet6/in6.c (revision 290805) @@ -1,2504 +1,2504 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; /* Announce arrival of local address to all FIBs. */ rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCSIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; case SIOCSIFALIFETIME_IN6: { struct in6_addrlifetime *lt; if (td != NULL) { error = priv_check(td, PRIV_NETINET_ALIFETIME6); if (error) goto out; } if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* sanity for overflow - beware unsigned */ lt = &ifr->ifr_ifru.ifru_lifetime; if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME && lt->ia6t_vltime + time_uptime < time_uptime) { error = EINVAL; goto out; } if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME && lt->ia6t_pltime + time_uptime < time_uptime) { error = EINVAL; goto out; } break; } } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? */ ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCSIFALIFETIME_IN6: ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime; /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) ifa_free(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); goto out; } if (pr == NULL) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); log(LOG_ERR, "nd6_prelist_add succeeded but " "no prefix\n"); error = EINVAL; goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_refcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: if (error != 0 || ia == NULL) break; /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_refcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr && pr->ndpr_refcnt == 0) prelist_remove(pr); EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ /* * DAD should be performed for an new address or addresses on * an interface with ND6_IFF_IFDISABLED. */ if (in6if_do_dad(ifp) && (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED))) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* Perform DAD, if the address is TENTATIVE. */ if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } in6_newaddrmsg(ia, RTM_ADD); ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6) ? RTF_HOST : 0); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_newaddrmsg(ia, RTM_DELETE); in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { char ip6buf[INET6_ADDRSTRLEN]; int remove_lle; IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ remove_lle = 0; if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_refcnt--; /* Do not delete lles within prefix if refcont != 0 */ if (ia->ia6_ndpr->ndpr_refcnt == 0) remove_lle = 1; ia->ia6_ndpr = NULL; } nd6_rem_ifa_lle(ia, remove_lle); /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other subsystems about address change/arrival: * 1) Notifies device handler on the first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) return (error); } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) return (error); ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the internet address corresponding to a given address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } if_addr_runlock(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(&in6_ifa_tracker); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return 1; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Return 1 if an internet address is configured on an interface. */ int in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr) { struct in6_addr in6; struct ifaddr *ifa; struct in6_ifaddr *ia6; in6 = *addr; if (in6_clearscope(&in6)) return (0); in6_setscope(&in6, ifp, NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia6 = (struct in6_ifaddr *)ifa; if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6)) { IF_ADDR_RUNLOCK(ifp); return (1); } } IF_ADDR_RUNLOCK(ifp); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(&in6_ifa_tracker); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = 0; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) || (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD)) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionaly, users may try to assign addresses * before the interface becomes up (or running). * This function returns EAGAIN in that case. * The caller should mark "tentative" on the address instead of * performing DAD immediately. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (EAGAIN); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ case IFT_L2VLAN: /* ditto */ case IFT_IEEE80211: /* ditto */ case IFT_INFINIBAND: return (64); case IFT_FDDI: /* RFC2467 */ return (64); case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_ARCNET: /* RFC2497 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; }; #define IN6_LLTBL_DEFAULT_HSIZE 32 #define IN6_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in6_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in6_lltable_new(const struct in6_addr *addr6, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->base.r_l3addr.addr6 = *addr6; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in6_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { const struct in6_addr *addr, *mask, *lle_addr; addr = &((const struct sockaddr_in6 *)saddr)->sin6_addr; mask = &((const struct sockaddr_in6 *)smask)->sin6_addr; lle_addr = &lle->r_l3addr.addr6; if (IN6_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. */ if (IN6_ARE_ADDR_EQUAL(addr, lle_addr) && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in6_lltable_free_entry(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table */ if ((lle->la_flags & LLE_LINKED) != 0) { ifp = llt->llt_ifp; IF_AFDATA_WLOCK_ASSERT(ifp); lltable_unlink_entry(llt, lle); } - if (callout_stop(&lle->lle_timer)) + if (callout_stop(&lle->lle_timer) > 0) LLE_REMREF(lle); llentry_free(lle); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* Our local addresses are always only installed on the default FIB. */ /* XXX rtalloc1 should take a const param */ rt = in6_rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0, RT_DEFAULT_FIB); if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ ifa = ifaof_ifpforaddr(l3addr, ifp); if (ifa != NULL) { ifa_free(ifa); if (rt != NULL) RTFREE_LOCKED(rt); return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &((const struct sockaddr_in6 *)l3addr)->sin6_addr)); if (rt != NULL) RTFREE_LOCKED(rt); return EINVAL; } RTFREE_LOCKED(rt); return 0; } static inline uint32_t in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize) { return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize)); } static uint32_t in6_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize)); } static void in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_addr = lle->r_l3addr.addr6; } static inline struct llentry * in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst)) break; } return (lle); } static void in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in6_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in6_lltable_new(&sin6->sin6_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { lltable_set_entry_addr(ifp, lle, IF_LLADDR(ifp)); lle->la_flags |= LLE_STATIC; } if ((lle->la_flags & LLE_STATIC) != 0) lle->ln_state = ND6_LLINFO_REACHABLE; return (lle); } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); lle = in6_lltable_find_dst(llt, &sin6->sin6_addr); if (lle == NULL) return (NULL); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); return (lle); } static int in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; struct sockaddr_dl *sdl; int error; bzero(&ndpc, sizeof(ndpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_alen = ifp->if_addrlen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); ndpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) ndpc.rtm.rtm_flags |= RTF_PINNED; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); return (error); } static struct lltable * in6_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET6; llt->llt_ifp = ifp; llt->llt_lookup = in6_lltable_lookup; llt->llt_alloc_entry = in6_lltable_alloc; llt->llt_delete_entry = in6_lltable_delete_entry; llt->llt_dump_entry = in6_lltable_dump_entry; llt->llt_hash = in6_lltable_hash; llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry; llt->llt_free_entry = in6_lltable_free_entry; llt->llt_match_prefix = in6_lltable_match_prefix; lltable_link(llt); return (llt); } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = in6_lltattach(ifp); ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: head/sys/netinet6/nd6.c =================================================================== --- head/sys/netinet6/nd6.c (revision 290804) +++ head/sys/netinet6/nd6.c (revision 290805) @@ -1,2431 +1,2431 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh; /* for debugging? */ #if 0 static int nd6_inuse, nd6_allocated; #endif VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry *, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *); static int nd6_need_cache(struct ifnet *); static VNET_DEFINE(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE(struct callout, nd6_timer_ch); static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(&lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, RT_DEFAULT_FIB); } void nd6_init(void) { LIST_INIT(&V_nd_prefix); /* initialization of the default router list */ TAILQ_INIT(&V_nd_defrouter); /* start timer */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = (struct nd_ifinfo *)malloc(sizeof(*nd), M_IP6NDP, M_WAITOK|M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct nd_ifinfo *nd) { free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; switch (ifp->if_type) { case IFT_ARCNET: ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */ break; case IFT_FDDI: ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */ break; case IFT_ISO88025: ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu); break; default: ndi->maxmtu = ifp->if_mtu; break; } /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accomodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } - if (canceled) + if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Switch @lle state to new state optionally arming timers. * * Set noinline to be dtrace-friendly */ __noinline void nd6_llinfo_setstate(struct llentry *lle, int newstate) { struct ifnet *ifp; long delay; delay = 0; switch (newstate) { case ND6_LLINFO_INCOMPLETE: ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(lle)) { ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->reachable * hz; } break; case ND6_LLINFO_STALE: delay = (long)V_nd6_gctimer * hz; break; case ND6_LLINFO_DELAY: lle->la_asked = 0; delay = (long)V_nd6_delay * hz; break; } if (delay > 0) nd6_llinfo_settimer_locked(lle, delay); lle->ln_state = newstate; } /* * Timer-dependent part of nd state machine. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_llinfo_timer(void *arg) { struct llentry *ln; struct in6_addr *dst, *pdst, *psrc, src; struct ifnet *ifp; struct nd_ifinfo *ndi = NULL; int send_ns; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; LLE_WLOCK(ln); if (callout_pending(&ln->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by nd6_llinfo_settimer_locked above since canceled * would have been 1. */ LLE_WUNLOCK(ln); return; } ifp = ln->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); ndi = ND_IFINFO(ifp); send_ns = 0; dst = &ln->r_l3addr.addr6; pdst = dst; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { nd6_free(ln, 0); ln = NULL; goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; send_ns = 1; /* Send NS to multicast address */ pdst = NULL; } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * assuming every packet in la_hold has the * same IP header. Send error after unlock. */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_TIMEDOUT); nd6_free(ln, 0); ln = NULL; if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); break; case ND6_LLINFO_STALE: /* Garbage Collection(RFC 2461 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) { EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); nd6_free(ln, 1); ln = NULL; } break; case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); nd6_free(ln, 0); ln = NULL; } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct in6_ifaddr *ia6, *nia6; callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); /* expire default router list */ TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->expire && dr->expire < time_uptime) defrtrlist_del(dr); } /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ if ((ia6->ia_ifp->if_flags & IFF_UP) == 0 || (ia6->ia_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ia6->ia_ifp)->flags & ND6_IFF_IFDISABLED) != 0) { ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } /* expire prefix list */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * check prefix lifetime. * since pltime is just for autoconf, pltime processing for * prefix is not necessary. */ if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME && time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) { /* * address expiration and prefix expiration are * separate. NEVER perform in6_purgeaddr here. */ prelist_remove(pr); } } CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); IF_ADDR_RUNLOCK(ifp); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp) { struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that is installed * in the routing table, in order to keep additional side effects as * small as possible. */ TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (!dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } /* Nuke prefix list entries toward ifp */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) { /* * Because if_detach() does *not* release prefixes * while purging addresses the reference count will * still be above zero. We therefore reset it to * make sure that the prefix really gets purged. */ pr->ndpr_refcnt = 0; /* * Previously, pr->ndpr_addr is removed as well, * but I strongly believe we don't have to do it. * nd6_purge() is only called from in6_ifdetach(), * which removes all the associated interface addresses * by itself. * (jinmei@kame.net 20010129) */ prelist_remove(pr); } } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select(); } /* XXXXX * We do not nuke the neighbor cache entries here any more * because the neighbor cache is kept in if_afdata[AF_INET6]. * nd6_purge() is invoked by in6_ifdetach() which is called * from if_detach() where everything gets purged. So let * in6_domifdetach() do the actual L2 table purging work. */ } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *dstaddr; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { struct rtentry *rt; /* Always use the default FIB here. */ rt = in6_rtalloc1((struct sockaddr *)&pr->ndpr_prefix, 0, 0, RT_DEFAULT_FIB); if (rt == NULL) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr)) { RTFREE_LOCKED(rt); continue; } RTFREE_LOCKED(rt); } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) return (1); } /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ dstaddr = ifa_ifwithdstaddr((const struct sockaddr *)addr, RT_ALL_FIBS); if (dstaddr != NULL) { if (dstaddr->ifa_ifp == ifp) { ifa_free(dstaddr); return (1); } ifa_free(dstaddr); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && TAILQ_EMPTY(&V_nd_defrouter) && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ IF_AFDATA_RLOCK(ifp); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } IF_AFDATA_RUNLOCK(ifp); return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry *ln, int gc) { struct nd_defrouter *dr; struct ifnet *ifp; LLE_WLOCK_ASSERT(ln); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); ifp = ln->lle_tbl->llt_ifp; if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { dr = defrouter_lookup(&ln->r_l3addr.addr6, ifp); if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select(); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct rtentry *rt; struct radix_node_head *rnh; struct sockaddr_in6 sin6; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = rt_tables_get_rnh(fibnum, AF_INET6); if (rnh == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); rt = in6_rtalloc1((struct sockaddr *)&sin6, 0, RTF_RNH_LOCKED, fibnum); if (rt) { if (rt->rt_flags == (RTF_UP | RTF_HOST | RTF_DYNAMIC)) rt_expunge(rnh, rt); RTFREE_LOCKED(rt); } RADIX_NODE_HEAD_UNLOCK(rnh); } } /* * Rejuvenate this function for routing operations related * processing. */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; gateway = (struct sockaddr_in6 *)rt->rt_gateway; ifp = rt->rt_ifp; switch (req) { case RTM_ADD: break; case RTM_DELETE: if (!ifp) return; /* * Only indirect routes are interesting. */ if ((rt->rt_flags & RTF_GATEWAY) == 0) return; /* * check for default route */ if (IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr)) { dr = defrouter_lookup(&gateway->sin6_addr, ifp); if (dr != NULL) dr->installed = 0; } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intented for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } IF_ADDR_RUNLOCK(ifp); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } IF_ADDR_RUNLOCK(ifp); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } } ND_IFINFO(ifp)->flags = ND.flags; break; #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct nd_prefix *pr, *next; LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { struct in6_ifaddr *ia, *ia_next; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ /* do we really have to remove addresses as well? */ /* XXXRW: in6_ifaddrhead locking. */ TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } prelist_remove(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_defrouter *dr, *next; defrouter_reset(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, next) { defrtrlist_del(dr); } defrouter_select(); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&nb_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Calculates new isRouter value based on provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(from, flags, ifp); IF_AFDATA_RUNLOCK(ifp); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it before insertion. */ if (lladdr != NULL) lltable_set_entry_addr(ifp, ln, lladdr); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); if (lladdr != NULL) /* (7) */ EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if static ndp is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, &ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ lltable_set_entry_addr(ifp, ln, lladdr); nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculates new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select(); } } static void nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and a sets * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds to ensure do neighbor unreachability * detection on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, NULL); return (error); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * If destination LLE does not exists or lle state modification * is required, call "slow" version. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags) { struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_L2VLAN: case IFT_IEEE80211: case IFT_BRIDGE: case IFT_ISO88025: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&dst6->sin6_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); /* * Perform fast path for the following cases: * 1) lle state is REACHABLE * 2) lle state is DELAY (NS message sent) * * Every other case involves lle modification, so we handle * them separately. */ if (ln == NULL || (ln->ln_state != ND6_LLINFO_REACHABLE && ln->ln_state != ND6_LLINFO_DELAY)) { /* Fall back to slow processing path */ if (ln != NULL) LLE_RUNLOCK(ln); return (nd6_resolve_slow(ifp, m, dst6, desten, pflags)); } bcopy(&ln->ll_addr, desten, ifp->if_addrlen); if (pflags != NULL) *pflags = ln->la_flags; LLE_RUNLOCK(ln); return (0); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Heavy version. * Function assume that destination LLE does not exist, * is invalid or stale, so LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns; /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { IF_AFDATA_RLOCK(ifp); lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); IF_AFDATA_RUNLOCK(ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { m_freem(m); return (ENOBUFS); } if (m != NULL) m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { bcopy(&lle->ll_addr, desten, ifp->if_addrlen); if (pflags != NULL) *pflags = lle->la_flags; LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf, unless the number of the packet * does not exceed nd6_maxqueuelen. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for newly-created lle la_asked will be 0, * so we will transition from ND6_LLINFO_NOSTATE to * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } int nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; struct ifnet *outifp; int error = 0; m_head = chain; if ((ifp->if_flags & IFF_LOOPBACK) != 0) outifp = origifp; else outifp = ifp; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = nd6_output_ifp(ifp, origifp, m, dst); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); } static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than ARCnet, Ethernet, FDDI and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_IEEE80211: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add pernament ND6 link-layer record for given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in different place * 2) It is called by IPv6 protocol stack in contrast to * arp_ifinit() which is typically called in SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any entry if exists */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; return; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_icmp6); #endif SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLFLAG_RD, nd6_sysctl_drlist, ""); SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLFLAG_RD, nd6_sysctl_prlist, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { struct in6_defrouter d; struct nd_defrouter *dr; int error; if (req->newptr) return (EPERM); bzero(&d, sizeof(d)); d.rtaddr.sin6_family = AF_INET6; d.rtaddr.sin6_len = sizeof(d.rtaddr); /* * XXX locking */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { d.rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d.rtaddr); if (error != 0) return (error); d.flags = dr->flags; d.rtlifetime = dr->rtlifetime; d.expire = dr->expire + (time_second - time_uptime); d.if_index = dr->ifp->if_index; error = SYSCTL_OUT(req, &d, sizeof(d)); if (error != 0) return (error); } return (0); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); /* * XXX locking */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_refcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) return (error); LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) return (error); } } return (0); } Index: head/sys/netpfil/pf/if_pfsync.c =================================================================== --- head/sys/netpfil/pf/if_pfsync.c (revision 290804) +++ head/sys/netpfil/pf/if_pfsync.c (revision 290805) @@ -1,2421 +1,2421 @@ /*- * Copyright (c) 2002 Michael Shalayeff * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2009 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $ * * Revisions picked from OpenBSD after revision 1.110 import: * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input() * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates * 1.120, 1.175 - use monotonic time_uptime * 1.122 - reduce number of updates for non-TCP sessions * 1.125, 1.127 - rewrite merge or stale processing * 1.128 - cleanups * 1.146 - bzero() mbuf before sparsely filling it with data * 1.170 - SIOCSIFMTU checks * 1.126, 1.142 - deferred packets processing * 1.173 - correct expire time processing */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PFSYNC_MINPKT ( \ sizeof(struct ip) + \ sizeof(struct pfsync_header) + \ sizeof(struct pfsync_subheader) ) struct pfsync_pkt { struct ip *ip; struct in_addr src; u_int8_t flags; }; static int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *, struct pfsync_state_peer *); static int pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int); static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = { pfsync_in_clr, /* PFSYNC_ACT_CLR */ pfsync_in_ins, /* PFSYNC_ACT_INS */ pfsync_in_iack, /* PFSYNC_ACT_INS_ACK */ pfsync_in_upd, /* PFSYNC_ACT_UPD */ pfsync_in_upd_c, /* PFSYNC_ACT_UPD_C */ pfsync_in_ureq, /* PFSYNC_ACT_UPD_REQ */ pfsync_in_del, /* PFSYNC_ACT_DEL */ pfsync_in_del_c, /* PFSYNC_ACT_DEL_C */ pfsync_in_error, /* PFSYNC_ACT_INS_F */ pfsync_in_error, /* PFSYNC_ACT_DEL_F */ pfsync_in_bus, /* PFSYNC_ACT_BUS */ pfsync_in_tdb, /* PFSYNC_ACT_TDB */ pfsync_in_eof /* PFSYNC_ACT_EOF */ }; struct pfsync_q { void (*write)(struct pf_state *, void *); size_t len; u_int8_t action; }; /* we have one of these for every PFSYNC_S_ */ static void pfsync_out_state(struct pf_state *, void *); static void pfsync_out_iack(struct pf_state *, void *); static void pfsync_out_upd_c(struct pf_state *, void *); static void pfsync_out_del(struct pf_state *, void *); static struct pfsync_q pfsync_qs[] = { { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD }, { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C } }; static void pfsync_q_ins(struct pf_state *, int); static void pfsync_q_del(struct pf_state *); static void pfsync_update_state(struct pf_state *); struct pfsync_upd_req_item { TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; struct pfsync_upd_req ur_msg; }; struct pfsync_deferral { struct pfsync_softc *pd_sc; TAILQ_ENTRY(pfsync_deferral) pd_entry; u_int pd_refs; struct callout pd_tmo; struct pf_state *pd_st; struct mbuf *pd_m; }; struct pfsync_softc { /* Configuration */ struct ifnet *sc_ifp; struct ifnet *sc_sync_if; struct ip_moptions sc_imo; struct in_addr sc_sync_peer; uint32_t sc_flags; #define PFSYNCF_OK 0x00000001 #define PFSYNCF_DEFER 0x00000002 #define PFSYNCF_PUSH 0x00000004 uint8_t sc_maxupdates; struct ip sc_template; struct callout sc_tmo; struct mtx sc_mtx; /* Queued data */ size_t sc_len; TAILQ_HEAD(, pf_state) sc_qs[PFSYNC_S_COUNT]; TAILQ_HEAD(, pfsync_upd_req_item) sc_upd_req_list; TAILQ_HEAD(, pfsync_deferral) sc_deferrals; u_int sc_deferred; void *sc_plus; size_t sc_pluslen; /* Bulk update info */ struct mtx sc_bulk_mtx; uint32_t sc_ureq_sent; int sc_bulk_tries; uint32_t sc_ureq_received; int sc_bulk_hashid; uint64_t sc_bulk_stateid; uint32_t sc_bulk_creatorid; struct callout sc_bulk_tmo; struct callout sc_bulkfail_tmo; }; #define PFSYNC_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define PFSYNC_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define PFSYNC_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define PFSYNC_BLOCK(sc) mtx_lock(&(sc)->sc_bulk_mtx) #define PFSYNC_BUNLOCK(sc) mtx_unlock(&(sc)->sc_bulk_mtx) #define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED) static const char pfsyncname[] = "pfsync"; static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data"); static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL; #define V_pfsyncif VNET(pfsyncif) static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL; #define V_pfsync_swi_cookie VNET(pfsync_swi_cookie) static VNET_DEFINE(struct pfsyncstats, pfsyncstats); #define V_pfsyncstats VNET(pfsyncstats) static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW; #define V_pfsync_carp_adj VNET(pfsync_carp_adj) static void pfsync_timeout(void *); static void pfsync_push(struct pfsync_softc *); static void pfsyncintr(void *); static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *, void *); static void pfsync_multicast_cleanup(struct pfsync_softc *); static void pfsync_pointers_init(void); static void pfsync_pointers_uninit(void); static int pfsync_init(void); static void pfsync_uninit(void); SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC"); SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(pfsyncstats), pfsyncstats, "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)"); SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW, &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment"); static int pfsync_clone_create(struct if_clone *, int, caddr_t); static void pfsync_clone_destroy(struct ifnet *); static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *, struct pf_state_peer *); static int pfsyncoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int pfsyncioctl(struct ifnet *, u_long, caddr_t); static int pfsync_defer(struct pf_state *, struct mbuf *); static void pfsync_undefer(struct pfsync_deferral *, int); static void pfsync_undefer_state(struct pf_state *, int); static void pfsync_defer_tmo(void *); static void pfsync_request_update(u_int32_t, u_int64_t); static void pfsync_update_state_req(struct pf_state *); static void pfsync_drop(struct pfsync_softc *); static void pfsync_sendout(int); static void pfsync_send_plus(void *, size_t); static void pfsync_bulk_start(void); static void pfsync_bulk_status(u_int8_t); static void pfsync_bulk_update(void *); static void pfsync_bulk_fail(void *); #ifdef IPSEC static void pfsync_update_net_tdb(struct pfsync_tdb *); #endif #define PFSYNC_MAX_BULKTRIES 12 VNET_DEFINE(struct if_clone *, pfsync_cloner); #define V_pfsync_cloner VNET(pfsync_cloner) static int pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) { struct pfsync_softc *sc; struct ifnet *ifp; int q; if (unit != 0) return (EINVAL); sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO); sc->sc_flags |= PFSYNCF_OK; for (q = 0; q < PFSYNC_S_COUNT; q++) TAILQ_INIT(&sc->sc_qs[q]); TAILQ_INIT(&sc->sc_upd_req_list); TAILQ_INIT(&sc->sc_deferrals); sc->sc_len = PFSYNC_MINPKT; sc->sc_maxupdates = 128; ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC); if (ifp == NULL) { free(sc, M_PFSYNC); return (ENOSPC); } if_initname(ifp, pfsyncname, unit); ifp->if_softc = sc; ifp->if_ioctl = pfsyncioctl; ifp->if_output = pfsyncoutput; ifp->if_type = IFT_PFSYNC; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_hdrlen = sizeof(struct pfsync_header); ifp->if_mtu = ETHERMTU; mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF); mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF); callout_init(&sc->sc_tmo, 1); callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0); callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0); if_attach(ifp); bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN); V_pfsyncif = sc; return (0); } static void pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; /* * At this stage, everything should have already been * cleared by pfsync_uninit(), and we have only to * drain callouts. */ while (sc->sc_deferred > 0) { struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; - if (callout_stop(&pd->pd_tmo)) { + if (callout_stop(&pd->pd_tmo) > 0) { pf_release_state(pd->pd_st); m_freem(pd->pd_m); free(pd, M_PFSYNC); } else { pd->pd_refs++; callout_drain(&pd->pd_tmo); free(pd, M_PFSYNC); } } callout_drain(&sc->sc_tmo); callout_drain(&sc->sc_bulkfail_tmo); callout_drain(&sc->sc_bulk_tmo); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy"); bpfdetach(ifp); if_detach(ifp); pfsync_drop(sc); if_free(ifp); if (sc->sc_imo.imo_membership) pfsync_multicast_cleanup(sc); mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_bulk_mtx); free(sc, M_PFSYNC); V_pfsyncif = NULL; } static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, struct pf_state_peer *d) { if (s->scrub.scrub_flag && d->scrub == NULL) { d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO); if (d->scrub == NULL) return (ENOMEM); } return (0); } static int pfsync_state_import(struct pfsync_state *sp, u_int8_t flags) { struct pfsync_softc *sc = V_pfsyncif; #ifndef __NO_STRICT_ALIGNMENT struct pfsync_state_key key[2]; #endif struct pfsync_state_key *kw, *ks; struct pf_state *st = NULL; struct pf_state_key *skw = NULL, *sks = NULL; struct pf_rule *r = NULL; struct pfi_kif *kif; int error; PF_RULES_RASSERT(); if (sp->creatorid == 0) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: invalid creator id: %08x\n", __func__, ntohl(sp->creatorid)); return (EINVAL); } if ((kif = pfi_kif_find(sp->ifname)) == NULL) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: unknown interface: %s\n", __func__, sp->ifname); if (flags & PFSYNC_SI_IOCTL) return (EINVAL); return (0); /* skip this state */ } /* * If the ruleset checksums match or the state is coming from the ioctl, * it's safe to associate the state with the rule of that number. */ if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) < pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) r = pf_main_ruleset.rules[ PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)]; else r = &V_pf_default_rule; if ((r->max_states && counter_u64_fetch(r->states_cur) >= r->max_states)) goto cleanup; /* * XXXGL: consider M_WAITOK in ioctl path after. */ if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL) goto cleanup; if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL) goto cleanup; #ifndef __NO_STRICT_ALIGNMENT bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2); kw = &key[PF_SK_WIRE]; ks = &key[PF_SK_STACK]; #else kw = &sp->key[PF_SK_WIRE]; ks = &sp->key[PF_SK_STACK]; #endif if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) || PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) || kw->port[0] != ks->port[0] || kw->port[1] != ks->port[1]) { sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sks == NULL) goto cleanup; } else sks = skw; /* allocate memory for scrub info */ if (pfsync_alloc_scrub_memory(&sp->src, &st->src) || pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) goto cleanup; /* Copy to state key(s). */ skw->addr[0] = kw->addr[0]; skw->addr[1] = kw->addr[1]; skw->port[0] = kw->port[0]; skw->port[1] = kw->port[1]; skw->proto = sp->proto; skw->af = sp->af; if (sks != skw) { sks->addr[0] = ks->addr[0]; sks->addr[1] = ks->addr[1]; sks->port[0] = ks->port[0]; sks->port[1] = ks->port[1]; sks->proto = sp->proto; sks->af = sp->af; } /* copy to state */ bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr)); st->creation = time_uptime - ntohl(sp->creation); st->expire = time_uptime; if (sp->expire) { uint32_t timeout; timeout = r->timeout[sp->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[sp->timeout]; /* sp->expire may have been adaptively scaled by export. */ st->expire -= timeout - ntohl(sp->expire); } st->direction = sp->direction; st->log = sp->log; st->timeout = sp->timeout; st->state_flags = sp->state_flags; st->id = sp->id; st->creatorid = sp->creatorid; pf_state_peer_ntoh(&sp->src, &st->src); pf_state_peer_ntoh(&sp->dst, &st->dst); st->rule.ptr = r; st->nat_rule.ptr = NULL; st->anchor.ptr = NULL; st->rt_kif = NULL; st->pfsync_time = time_uptime; st->sync_state = PFSYNC_S_NONE; if (!(flags & PFSYNC_SI_IOCTL)) st->state_flags |= PFSTATE_NOSYNC; if ((error = pf_state_insert(kif, skw, sks, st)) != 0) goto cleanup_state; /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */ counter_u64_add(r->states_cur, 1); counter_u64_add(r->states_tot, 1); if (!(flags & PFSYNC_SI_IOCTL)) { st->state_flags &= ~PFSTATE_NOSYNC; if (st->state_flags & PFSTATE_ACK) { pfsync_q_ins(st, PFSYNC_S_IACK); pfsync_push(sc); } } st->state_flags &= ~PFSTATE_ACK; PF_STATE_UNLOCK(st); return (0); cleanup: error = ENOMEM; if (skw == sks) sks = NULL; if (skw != NULL) uma_zfree(V_pf_state_key_z, skw); if (sks != NULL) uma_zfree(V_pf_state_key_z, sks); cleanup_state: /* pf_state_insert() frees the state keys. */ if (st) { if (st->dst.scrub) uma_zfree(V_pf_state_scrub_z, st->dst.scrub); if (st->src.scrub) uma_zfree(V_pf_state_scrub_z, st->src.scrub); uma_zfree(V_pf_state_z, st); } return (error); } static int pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_pkt pkt; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pfsync_header *ph; struct pfsync_subheader subh; int offset, len; int rv; uint16_t count; *mp = NULL; V_pfsyncstats.pfsyncs_ipackets++; /* Verify that we have a sync interface configured. */ if (!sc || !sc->sc_sync_if || !V_pf_status.running || (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) goto done; /* verify that the packet came in on the right interface */ if (sc->sc_sync_if != m->m_pkthdr.rcvif) { V_pfsyncstats.pfsyncs_badif++; goto done; } if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* verify that the IP TTL is 255. */ if (ip->ip_ttl != PFSYNC_DFLTTL) { V_pfsyncstats.pfsyncs_badttl++; goto done; } offset = ip->ip_hl << 2; if (m->m_pkthdr.len < offset + sizeof(*ph)) { V_pfsyncstats.pfsyncs_hdrops++; goto done; } if (offset + sizeof(*ph) > m->m_len) { if (m_pullup(m, offset + sizeof(*ph)) == NULL) { V_pfsyncstats.pfsyncs_hdrops++; return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ph = (struct pfsync_header *)((char *)ip + offset); /* verify the version */ if (ph->version != PFSYNC_VERSION) { V_pfsyncstats.pfsyncs_badver++; goto done; } len = ntohs(ph->len) + offset; if (m->m_pkthdr.len < len) { V_pfsyncstats.pfsyncs_badlen++; goto done; } /* Cheaper to grab this now than having to mess with mbufs later */ pkt.ip = ip; pkt.src = ip->ip_src; pkt.flags = 0; /* * Trusting pf_chksum during packet processing, as well as seeking * in interface name tree, require holding PF_RULES_RLOCK(). */ PF_RULES_RLOCK(); if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) pkt.flags |= PFSYNC_SI_CKSUM; offset += sizeof(*ph); while (offset <= len - sizeof(subh)) { m_copydata(m, offset, sizeof(subh), (caddr_t)&subh); offset += sizeof(subh); if (subh.action >= PFSYNC_ACT_MAX) { V_pfsyncstats.pfsyncs_badact++; PF_RULES_RUNLOCK(); goto done; } count = ntohs(subh.count); V_pfsyncstats.pfsyncs_iacts[subh.action] += count; rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count); if (rv == -1) { PF_RULES_RUNLOCK(); return (IPPROTO_DONE); } offset += rv; } PF_RULES_RUNLOCK(); done: m_freem(m); return (IPPROTO_DONE); } static int pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_clr *clr; struct mbuf *mp; int len = sizeof(*clr) * count; int i, offp; u_int32_t creatorid; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } clr = (struct pfsync_clr *)(mp->m_data + offp); for (i = 0; i < count; i++) { creatorid = clr[i].creatorid; if (clr[i].ifname[0] != '\0' && pfi_kif_find(clr[i].ifname) == NULL) continue; for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state *s; relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (s->creatorid == creatorid) { s->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } } PF_HASHROW_UNLOCK(ih); } } return (len); } static int pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_state *sa, *sp; int len = sizeof(*sp) * count; int i, offp; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; /* Check for invalid values. */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST || sp->direction > PF_OUT || (sp->af != AF_INET && sp->af != AF_INET6)) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: invalid value\n", __func__); V_pfsyncstats.pfsyncs_badval++; continue; } if (pfsync_state_import(sp, pkt->flags) == ENOMEM) /* Drop out, but process the rest of the actions. */ break; } return (len); } static int pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_ins_ack *ia, *iaa; struct pf_state *st; struct mbuf *mp; int len = count * sizeof(*ia); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } iaa = (struct pfsync_ins_ack *)(mp->m_data + offp); for (i = 0; i < count; i++) { ia = &iaa[i]; st = pf_find_state_byid(ia->id, ia->creatorid); if (st == NULL) continue; if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(V_pfsyncif); pfsync_undefer_state(st, 0); PFSYNC_UNLOCK(V_pfsyncif); } PF_STATE_UNLOCK(st); } /* * XXX this is not yet implemented, but we know the size of the * message so we can skip it. */ return (count * sizeof(struct pfsync_ins_ack)); } static int pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src, struct pfsync_state_peer *dst) { int sync = 0; PF_STATE_LOCK_ASSERT(st); /* * The state should never go backwards except * for syn-proxy states. Neither should the * sequence window slide backwards. */ if ((st->src.state > src->state && (st->src.state < PF_TCPS_PROXY_SRC || src->state >= PF_TCPS_PROXY_SRC)) || (st->src.state == src->state && SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) sync++; else pf_state_peer_ntoh(src, &st->src); if ((st->dst.state > dst->state) || (st->dst.state >= TCPS_SYN_SENT && SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) sync++; else pf_state_peer_ntoh(dst, &st->dst); return (sync); } static int pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_state *sa, *sp; struct pf_state *st; int sync; struct mbuf *mp; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_input: PFSYNC_ACT_UPD: " "invalid value\n"); } V_pfsyncstats.pfsyncs_badval++; continue; } st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { /* insert the update */ if (pfsync_state_import(sp, 0)) V_pfsyncstats.pfsyncs_badstate++; continue; } if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(sc); pfsync_undefer_state(st, 1); PFSYNC_UNLOCK(sc); } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, &sp->src, &sp->dst); else { sync = 0; /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > sp->src.state) sync++; else pf_state_peer_ntoh(&sp->src, &st->src); if (st->dst.state > sp->dst.state) sync++; else pf_state_peer_ntoh(&sp->dst, &st->dst); } if (sync < 2) { pfsync_alloc_scrub_memory(&sp->dst, &st->dst); pf_state_peer_ntoh(&sp->dst, &st->dst); st->expire = time_uptime; st->timeout = sp->timeout; } st->pfsync_time = time_uptime; if (sync) { V_pfsyncstats.pfsyncs_stale++; pfsync_update_state(st); PF_STATE_UNLOCK(st); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); continue; } PF_STATE_UNLOCK(st); } return (len); } static int pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_upd_c *ua, *up; struct pf_state *st; int len = count * sizeof(*up); int sync; struct mbuf *mp; int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } ua = (struct pfsync_upd_c *)(mp->m_data + offp); for (i = 0; i < count; i++) { up = &ua[i]; /* check for invalid values */ if (up->timeout >= PFTM_MAX || up->src.state > PF_TCPS_PROXY_DST || up->dst.state > PF_TCPS_PROXY_DST) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_input: " "PFSYNC_ACT_UPD_C: " "invalid value\n"); } V_pfsyncstats.pfsyncs_badval++; continue; } st = pf_find_state_byid(up->id, up->creatorid); if (st == NULL) { /* We don't have this state. Ask for it. */ PFSYNC_LOCK(sc); pfsync_request_update(up->creatorid, up->id); PFSYNC_UNLOCK(sc); continue; } if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(sc); pfsync_undefer_state(st, 1); PFSYNC_UNLOCK(sc); } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, &up->src, &up->dst); else { sync = 0; /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > up->src.state) sync++; else pf_state_peer_ntoh(&up->src, &st->src); if (st->dst.state > up->dst.state) sync++; else pf_state_peer_ntoh(&up->dst, &st->dst); } if (sync < 2) { pfsync_alloc_scrub_memory(&up->dst, &st->dst); pf_state_peer_ntoh(&up->dst, &st->dst); st->expire = time_uptime; st->timeout = up->timeout; } st->pfsync_time = time_uptime; if (sync) { V_pfsyncstats.pfsyncs_stale++; pfsync_update_state(st); PF_STATE_UNLOCK(st); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); continue; } PF_STATE_UNLOCK(st); } return (len); } static int pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_upd_req *ur, *ura; struct mbuf *mp; int len = count * sizeof(*ur); int i, offp; struct pf_state *st; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } ura = (struct pfsync_upd_req *)(mp->m_data + offp); for (i = 0; i < count; i++) { ur = &ura[i]; if (ur->id == 0 && ur->creatorid == 0) pfsync_bulk_start(); else { st = pf_find_state_byid(ur->id, ur->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } if (st->state_flags & PFSTATE_NOSYNC) { PF_STATE_UNLOCK(st); continue; } pfsync_update_state_req(st); PF_STATE_UNLOCK(st); } } return (len); } static int pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_state *sa, *sp; struct pf_state *st; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } st->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(st, PF_ENTER_LOCKED); } return (len); } static int pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_del_c *sa, *sp; struct pf_state *st; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_del_c *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } st->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(st, PF_ENTER_LOCKED); } return (len); } static int pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_bus *bus; struct mbuf *mp; int len = count * sizeof(*bus); int offp; PFSYNC_BLOCK(sc); /* If we're not waiting for a bulk update, who cares. */ if (sc->sc_ureq_sent == 0) { PFSYNC_BUNLOCK(sc); return (len); } mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { PFSYNC_BUNLOCK(sc); V_pfsyncstats.pfsyncs_badlen++; return (-1); } bus = (struct pfsync_bus *)(mp->m_data + offp); switch (bus->status) { case PFSYNC_BUS_START: callout_reset(&sc->sc_bulkfail_tmo, 4 * hz + V_pf_limits[PF_LIMIT_STATES].limit / ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) / sizeof(struct pfsync_state)), pfsync_bulk_fail, sc); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk update start\n"); break; case PFSYNC_BUS_END: if (time_uptime - ntohl(bus->endtime) >= sc->sc_ureq_sent) { /* that's it, we're happy */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; callout_stop(&sc->sc_bulkfail_tmo); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync bulk done"); sc->sc_flags |= PFSYNCF_OK; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received valid " "bulk update end\n"); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received invalid " "bulk update end: bad timestamp\n"); } break; } PFSYNC_BUNLOCK(sc); return (len); } static int pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { int len = count * sizeof(struct pfsync_tdb); #if defined(IPSEC) struct pfsync_tdb *tp; struct mbuf *mp; int offp; int i; int s; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } tp = (struct pfsync_tdb *)(mp->m_data + offp); for (i = 0; i < count; i++) pfsync_update_net_tdb(&tp[i]); #endif return (len); } #if defined(IPSEC) /* Update an in-kernel tdb. Silently fail if no tdb is found. */ static void pfsync_update_net_tdb(struct pfsync_tdb *pt) { struct tdb *tdb; int s; /* check for invalid values */ if (ntohl(pt->spi) <= SPI_RESERVED_MAX || (pt->dst.sa.sa_family != AF_INET && pt->dst.sa.sa_family != AF_INET6)) goto bad; tdb = gettdb(pt->spi, &pt->dst, pt->sproto); if (tdb) { pt->rpl = ntohl(pt->rpl); pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes); /* Neither replay nor byte counter should ever decrease. */ if (pt->rpl < tdb->tdb_rpl || pt->cur_bytes < tdb->tdb_cur_bytes) { goto bad; } tdb->tdb_rpl = pt->rpl; tdb->tdb_cur_bytes = pt->cur_bytes; } return; bad: if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: " "invalid value\n"); V_pfsyncstats.pfsyncs_badstate++; return; } #endif static int pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { /* check if we are at the right place in the packet */ if (offset != m->m_pkthdr.len) V_pfsyncstats.pfsyncs_badlen++; /* we're done. free and let the caller return */ m_freem(m); return (-1); } static int pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { V_pfsyncstats.pfsyncs_badact++; m_freem(m); return (-1); } static int pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *rt) { m_freem(m); return (0); } /* ARGSUSED */ static int pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct pfsync_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct pfsyncreq pfsyncr; int error; switch (cmd) { case SIOCSIFFLAGS: PFSYNC_LOCK(sc); if (ifp->if_flags & IFF_UP) { ifp->if_drv_flags |= IFF_DRV_RUNNING; PFSYNC_UNLOCK(sc); pfsync_pointers_init(); } else { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; PFSYNC_UNLOCK(sc); pfsync_pointers_uninit(); } break; case SIOCSIFMTU: if (!sc->sc_sync_if || ifr->ifr_mtu <= PFSYNC_MINPKT || ifr->ifr_mtu > sc->sc_sync_if->if_mtu) return (EINVAL); if (ifr->ifr_mtu < ifp->if_mtu) { PFSYNC_LOCK(sc); if (sc->sc_len > PFSYNC_MINPKT) pfsync_sendout(1); PFSYNC_UNLOCK(sc); } ifp->if_mtu = ifr->ifr_mtu; break; case SIOCGETPFSYNC: bzero(&pfsyncr, sizeof(pfsyncr)); PFSYNC_LOCK(sc); if (sc->sc_sync_if) { strlcpy(pfsyncr.pfsyncr_syncdev, sc->sc_sync_if->if_xname, IFNAMSIZ); } pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER == (sc->sc_flags & PFSYNCF_DEFER)); PFSYNC_UNLOCK(sc); return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); case SIOCSETPFSYNC: { struct ip_moptions *imo = &sc->sc_imo; struct ifnet *sifp; struct ip *ip; void *mship = NULL; if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) return (error); if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) return (error); if (pfsyncr.pfsyncr_maxupdates > 255) return (EINVAL); if (pfsyncr.pfsyncr_syncdev[0] == 0) sifp = NULL; else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL) return (EINVAL); if (sifp != NULL && ( pfsyncr.pfsyncr_syncpeer.s_addr == 0 || pfsyncr.pfsyncr_syncpeer.s_addr == htonl(INADDR_PFSYNC_GROUP))) mship = malloc((sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO); PFSYNC_LOCK(sc); if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); else sc->sc_sync_peer.s_addr = pfsyncr.pfsyncr_syncpeer.s_addr; sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; if (pfsyncr.pfsyncr_defer) { sc->sc_flags |= PFSYNCF_DEFER; pfsync_defer_ptr = pfsync_defer; } else { sc->sc_flags &= ~PFSYNCF_DEFER; pfsync_defer_ptr = NULL; } if (sifp == NULL) { if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = NULL; if (imo->imo_membership) pfsync_multicast_cleanup(sc); PFSYNC_UNLOCK(sc); break; } if (sc->sc_len > PFSYNC_MINPKT && (sifp->if_mtu < sc->sc_ifp->if_mtu || (sc->sc_sync_if != NULL && sifp->if_mtu < sc->sc_sync_if->if_mtu) || sifp->if_mtu < MCLBYTES - sizeof(struct ip))) pfsync_sendout(1); if (imo->imo_membership) pfsync_multicast_cleanup(sc); if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { error = pfsync_multicast_setup(sc, sifp, mship); if (error) { if_rele(sifp); free(mship, M_PFSYNC); return (error); } } if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = sifp; ip = &sc->sc_template; bzero(ip, sizeof(*ip)); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(sc->sc_template) >> 2; ip->ip_tos = IPTOS_LOWDELAY; /* len and id are set later. */ ip->ip_off = htons(IP_DF); ip->ip_ttl = PFSYNC_DFLTTL; ip->ip_p = IPPROTO_PFSYNC; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr; /* Request a full state table update. */ if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(V_pfsync_carp_adj, "pfsync bulk start"); sc->sc_flags &= ~PFSYNCF_OK; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: requesting bulk update\n"); pfsync_request_update(0, 0); PFSYNC_UNLOCK(sc); PFSYNC_BLOCK(sc); sc->sc_ureq_sent = time_uptime; callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, sc); PFSYNC_BUNLOCK(sc); break; } default: return (ENOTTY); } return (0); } static void pfsync_out_state(struct pf_state *st, void *buf) { struct pfsync_state *sp = buf; pfsync_state_export(sp, st); } static void pfsync_out_iack(struct pf_state *st, void *buf) { struct pfsync_ins_ack *iack = buf; iack->id = st->id; iack->creatorid = st->creatorid; } static void pfsync_out_upd_c(struct pf_state *st, void *buf) { struct pfsync_upd_c *up = buf; bzero(up, sizeof(*up)); up->id = st->id; pf_state_peer_hton(&st->src, &up->src); pf_state_peer_hton(&st->dst, &up->dst); up->creatorid = st->creatorid; up->timeout = st->timeout; } static void pfsync_out_del(struct pf_state *st, void *buf) { struct pfsync_del_c *dp = buf; dp->id = st->id; dp->creatorid = st->creatorid; st->state_flags |= PFSTATE_NOSYNC; } static void pfsync_drop(struct pfsync_softc *sc) { struct pf_state *st, *next; struct pfsync_upd_req_item *ur; int q; for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sc->sc_qs[q])) continue; TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) { KASSERT(st->sync_state == q, ("%s: st->sync_state == q", __func__)); st->sync_state = PFSYNC_S_NONE; pf_release_state(st); } TAILQ_INIT(&sc->sc_qs[q]); } while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); free(ur, M_PFSYNC); } sc->sc_plus = NULL; sc->sc_len = PFSYNC_MINPKT; } static void pfsync_sendout(int schedswi) { struct pfsync_softc *sc = V_pfsyncif; struct ifnet *ifp = sc->sc_ifp; struct mbuf *m; struct ip *ip; struct pfsync_header *ph; struct pfsync_subheader *subh; struct pf_state *st; struct pfsync_upd_req_item *ur; int offset; int q, count = 0; KASSERT(sc != NULL, ("%s: null sc", __func__)); KASSERT(sc->sc_len > PFSYNC_MINPKT, ("%s: sc_len %zu", __func__, sc->sc_len)); PFSYNC_LOCK_ASSERT(sc); if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) { pfsync_drop(sc); return; } m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); V_pfsyncstats.pfsyncs_onomem++; return; } m->m_data += max_linkhdr; m->m_len = m->m_pkthdr.len = sc->sc_len; /* build the ip header */ ip = (struct ip *)m->m_data; bcopy(&sc->sc_template, ip, sizeof(*ip)); offset = sizeof(*ip); ip->ip_len = htons(m->m_pkthdr.len); ip_fillid(ip); /* build the pfsync header */ ph = (struct pfsync_header *)(m->m_data + offset); bzero(ph, sizeof(*ph)); offset += sizeof(*ph); ph->version = PFSYNC_VERSION; ph->len = htons(sc->sc_len - sizeof(*ip)); bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); /* walk the queues */ for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sc->sc_qs[q])) continue; subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) { KASSERT(st->sync_state == q, ("%s: st->sync_state == q", __func__)); /* * XXXGL: some of write methods do unlocked reads * of state data :( */ pfsync_qs[q].write(st, m->m_data + offset); offset += pfsync_qs[q].len; st->sync_state = PFSYNC_S_NONE; pf_release_state(st); count++; } TAILQ_INIT(&sc->sc_qs[q]); bzero(subh, sizeof(*subh)); subh->action = pfsync_qs[q].action; subh->count = htons(count); V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count; } if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); bcopy(&ur->ur_msg, m->m_data + offset, sizeof(ur->ur_msg)); offset += sizeof(ur->ur_msg); free(ur, M_PFSYNC); count++; } bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_UPD_REQ; subh->count = htons(count); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count; } /* has someone built a custom region for us to add? */ if (sc->sc_plus != NULL) { bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen); offset += sc->sc_pluslen; sc->sc_plus = NULL; } subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_EOF; subh->count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++; /* we're done, let's put it on the wire */ if (ifp->if_bpf) { m->m_data += sizeof(*ip); m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip); BPF_MTAP(ifp, m); m->m_data -= sizeof(*ip); m->m_len = m->m_pkthdr.len = sc->sc_len; } if (sc->sc_sync_if == NULL) { sc->sc_len = PFSYNC_MINPKT; m_freem(m); return; } if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); sc->sc_len = PFSYNC_MINPKT; if (!_IF_QFULL(&sc->sc_ifp->if_snd)) _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); else { m_freem(m); if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1); } if (schedswi) swi_sched(V_pfsync_swi_cookie, 0); } static void pfsync_insert_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; if (st->state_flags & PFSTATE_NOSYNC) return; if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) || st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { st->state_flags |= PFSTATE_NOSYNC; return; } KASSERT(st->sync_state == PFSYNC_S_NONE, ("%s: st->sync_state %u", __func__, st->sync_state)); PFSYNC_LOCK(sc); if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); pfsync_q_ins(st, PFSYNC_S_INS); PFSYNC_UNLOCK(sc); st->sync_updates = 0; } static int pfsync_defer(struct pf_state *st, struct mbuf *m) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_deferral *pd; if (m->m_flags & (M_BCAST|M_MCAST)) return (0); PFSYNC_LOCK(sc); if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) || !(sc->sc_flags & PFSYNCF_DEFER)) { PFSYNC_UNLOCK(sc); return (0); } if (sc->sc_deferred >= 128) pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0); pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT); if (pd == NULL) return (0); sc->sc_deferred++; m->m_flags |= M_SKIP_FIREWALL; st->state_flags |= PFSTATE_ACK; pd->pd_sc = sc; pd->pd_refs = 0; pd->pd_st = st; pf_ref_state(st); pd->pd_m = m; TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd); pfsync_push(sc); return (1); } static void pfsync_undefer(struct pfsync_deferral *pd, int drop) { struct pfsync_softc *sc = pd->pd_sc; struct mbuf *m = pd->pd_m; struct pf_state *st = pd->pd_st; PFSYNC_LOCK_ASSERT(sc); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ free(pd, M_PFSYNC); pf_release_state(st); if (drop) m_freem(m); else { _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); pfsync_push(sc); } } static void pfsync_defer_tmo(void *arg) { struct pfsync_deferral *pd = arg; struct pfsync_softc *sc = pd->pd_sc; struct mbuf *m = pd->pd_m; struct pf_state *st = pd->pd_st; PFSYNC_LOCK_ASSERT(sc); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ if (pd->pd_refs == 0) free(pd, M_PFSYNC); PFSYNC_UNLOCK(sc); ip_output(m, NULL, NULL, 0, NULL, NULL); pf_release_state(st); CURVNET_RESTORE(); } static void pfsync_undefer_state(struct pf_state *st, int drop) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_deferral *pd; PFSYNC_LOCK_ASSERT(sc); TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { if (pd->pd_st == st) { - if (callout_stop(&pd->pd_tmo)) + if (callout_stop(&pd->pd_tmo) > 0) pfsync_undefer(pd, drop); return; } } panic("%s: unable to find deferred state", __func__); } static void pfsync_update_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; int sync = 0; PF_STATE_LOCK_ASSERT(st); PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_ACK) pfsync_undefer_state(st, 0); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_INS: /* we're already handling it */ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { st->sync_updates++; if (st->sync_updates >= sc->sc_maxupdates) sync = 1; } break; case PFSYNC_S_IACK: pfsync_q_del(st); case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_UPD_C); st->sync_updates = 0; break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } if (sync || (time_uptime - st->pfsync_time) < 2) pfsync_push(sc); PFSYNC_UNLOCK(sc); } static void pfsync_request_update(u_int32_t creatorid, u_int64_t id) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_upd_req_item *item; size_t nlen = sizeof(struct pfsync_upd_req); PFSYNC_LOCK_ASSERT(sc); /* * This code does a bit to prevent multiple update requests for the * same state being generated. It searches current subheader queue, * but it doesn't lookup into queue of already packed datagrams. */ TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry) if (item->ur_msg.id == id && item->ur_msg.creatorid == creatorid) return; item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT); if (item == NULL) return; /* XXX stats */ item->ur_msg.id = id; item->ur_msg.creatorid = creatorid; if (TAILQ_EMPTY(&sc->sc_upd_req_list)) nlen += sizeof(struct pfsync_subheader); if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { pfsync_sendout(1); nlen = sizeof(struct pfsync_subheader) + sizeof(struct pfsync_upd_req); } TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); sc->sc_len += nlen; } static void pfsync_update_state_req(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; PF_STATE_LOCK_ASSERT(st); PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_IACK: pfsync_q_del(st); case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_UPD); pfsync_push(sc); break; case PFSYNC_S_INS: case PFSYNC_S_UPD: case PFSYNC_S_DEL: /* we're already handling it */ break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } PFSYNC_UNLOCK(sc); } static void pfsync_delete_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_ACK) pfsync_undefer_state(st, 1); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); switch (st->sync_state) { case PFSYNC_S_INS: /* We never got to tell the world so just forget about it. */ pfsync_q_del(st); break; case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_IACK: pfsync_q_del(st); /* FALLTHROUGH to putting it on the del list */ case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_DEL); break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } PFSYNC_UNLOCK(sc); } static void pfsync_clear_states(u_int32_t creatorid, const char *ifname) { struct pfsync_softc *sc = V_pfsyncif; struct { struct pfsync_subheader subh; struct pfsync_clr clr; } __packed r; bzero(&r, sizeof(r)); r.subh.action = PFSYNC_ACT_CLR; r.subh.count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++; strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname)); r.clr.creatorid = creatorid; PFSYNC_LOCK(sc); pfsync_send_plus(&r, sizeof(r)); PFSYNC_UNLOCK(sc); } static void pfsync_q_ins(struct pf_state *st, int q) { struct pfsync_softc *sc = V_pfsyncif; size_t nlen = pfsync_qs[q].len; PFSYNC_LOCK_ASSERT(sc); KASSERT(st->sync_state == PFSYNC_S_NONE, ("%s: st->sync_state %u", __func__, st->sync_state)); KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu", sc->sc_len)); if (TAILQ_EMPTY(&sc->sc_qs[q])) nlen += sizeof(struct pfsync_subheader); if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { pfsync_sendout(1); nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; } sc->sc_len += nlen; TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); st->sync_state = q; pf_ref_state(st); } static void pfsync_q_del(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; int q = st->sync_state; PFSYNC_LOCK_ASSERT(sc); KASSERT(st->sync_state != PFSYNC_S_NONE, ("%s: st->sync_state != PFSYNC_S_NONE", __func__)); sc->sc_len -= pfsync_qs[q].len; TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); st->sync_state = PFSYNC_S_NONE; pf_release_state(st); if (TAILQ_EMPTY(&sc->sc_qs[q])) sc->sc_len -= sizeof(struct pfsync_subheader); } static void pfsync_bulk_start(void) { struct pfsync_softc *sc = V_pfsyncif; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk update request\n"); PFSYNC_BLOCK(sc); sc->sc_ureq_received = time_uptime; sc->sc_bulk_hashid = 0; sc->sc_bulk_stateid = 0; pfsync_bulk_status(PFSYNC_BUS_START); callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); PFSYNC_BUNLOCK(sc); } static void pfsync_bulk_update(void *arg) { struct pfsync_softc *sc = arg; struct pf_state *s; int i, sent = 0; PFSYNC_BLOCK_ASSERT(sc); CURVNET_SET(sc->sc_ifp->if_vnet); /* * Start with last state from previous invocation. * It may had gone, in this case start from the * hash slot. */ s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid); if (s != NULL) i = PF_IDHASH(s); else i = sc->sc_bulk_hashid; for (; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; if (s != NULL) PF_HASHROW_ASSERT(ih); else { PF_HASHROW_LOCK(ih); s = LIST_FIRST(&ih->states); } for (; s; s = LIST_NEXT(s, entry)) { if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) < sizeof(struct pfsync_state)) { /* We've filled a packet. */ sc->sc_bulk_hashid = i; sc->sc_bulk_stateid = s->id; sc->sc_bulk_creatorid = s->creatorid; PF_HASHROW_UNLOCK(ih); callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); goto full; } if (s->sync_state == PFSYNC_S_NONE && s->timeout < PFTM_MAX && s->pfsync_time <= sc->sc_ureq_received) { pfsync_update_state_req(s); sent++; } } PF_HASHROW_UNLOCK(ih); } /* We're done. */ pfsync_bulk_status(PFSYNC_BUS_END); full: CURVNET_RESTORE(); } static void pfsync_bulk_status(u_int8_t status) { struct { struct pfsync_subheader subh; struct pfsync_bus bus; } __packed r; struct pfsync_softc *sc = V_pfsyncif; bzero(&r, sizeof(r)); r.subh.action = PFSYNC_ACT_BUS; r.subh.count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++; r.bus.creatorid = V_pf_status.hostid; r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received); r.bus.status = status; PFSYNC_LOCK(sc); pfsync_send_plus(&r, sizeof(r)); PFSYNC_UNLOCK(sc); } static void pfsync_bulk_fail(void *arg) { struct pfsync_softc *sc = arg; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_BLOCK_ASSERT(sc); if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { /* Try again */ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, V_pfsyncif); PFSYNC_LOCK(sc); pfsync_request_update(0, 0); PFSYNC_UNLOCK(sc); } else { /* Pretend like the transfer was ok. */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; PFSYNC_LOCK(sc); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync bulk fail"); sc->sc_flags |= PFSYNCF_OK; PFSYNC_UNLOCK(sc); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: failed to receive bulk update\n"); } CURVNET_RESTORE(); } static void pfsync_send_plus(void *plus, size_t pluslen) { struct pfsync_softc *sc = V_pfsyncif; PFSYNC_LOCK_ASSERT(sc); if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu) pfsync_sendout(1); sc->sc_plus = plus; sc->sc_len += (sc->sc_pluslen = pluslen); pfsync_sendout(1); } static void pfsync_timeout(void *arg) { struct pfsync_softc *sc = arg; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); CURVNET_RESTORE(); } static void pfsync_push(struct pfsync_softc *sc) { PFSYNC_LOCK_ASSERT(sc); sc->sc_flags |= PFSYNCF_PUSH; swi_sched(V_pfsync_swi_cookie, 0); } static void pfsyncintr(void *arg) { struct pfsync_softc *sc = arg; struct mbuf *m, *n; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_LOCK(sc); if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) { pfsync_sendout(0); sc->sc_flags &= ~PFSYNCF_PUSH; } _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m); PFSYNC_UNLOCK(sc); for (; m != NULL; m = n) { n = m->m_nextpkt; m->m_nextpkt = NULL; /* * We distinguish between a deferral packet and our * own pfsync packet based on M_SKIP_FIREWALL * flag. This is XXX. */ if (m->m_flags & M_SKIP_FIREWALL) ip_output(m, NULL, NULL, 0, NULL, NULL); else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL) == 0) V_pfsyncstats.pfsyncs_opackets++; else V_pfsyncstats.pfsyncs_oerrors++; } CURVNET_RESTORE(); } static int pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) { struct ip_moptions *imo = &sc->sc_imo; int error; if (!(ifp->if_flags & IFF_MULTICAST)) return (EADDRNOTAVAIL); imo->imo_membership = (struct in_multi **)mship; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_multicast_vif = -1; if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL, &imo->imo_membership[0])) != 0) { imo->imo_membership = NULL; return (error); } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = PFSYNC_DFLTTL; imo->imo_multicast_loop = 0; return (0); } static void pfsync_multicast_cleanup(struct pfsync_softc *sc) { struct ip_moptions *imo = &sc->sc_imo; in_leavegroup(imo->imo_membership[0], NULL); free(imo->imo_membership, M_PFSYNC); imo->imo_membership = NULL; imo->imo_multicast_ifp = NULL; } #ifdef INET extern struct domain inetdomain; static struct protosw in_pfsync_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = pfsync_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif static void pfsync_pointers_init() { PF_RULES_WLOCK(); pfsync_state_import_ptr = pfsync_state_import; pfsync_insert_state_ptr = pfsync_insert_state; pfsync_update_state_ptr = pfsync_update_state; pfsync_delete_state_ptr = pfsync_delete_state; pfsync_clear_states_ptr = pfsync_clear_states; pfsync_defer_ptr = pfsync_defer; PF_RULES_WUNLOCK(); } static void pfsync_pointers_uninit() { PF_RULES_WLOCK(); pfsync_state_import_ptr = NULL; pfsync_insert_state_ptr = NULL; pfsync_update_state_ptr = NULL; pfsync_delete_state_ptr = NULL; pfsync_clear_states_ptr = NULL; pfsync_defer_ptr = NULL; PF_RULES_WUNLOCK(); } static int pfsync_init() { VNET_ITERATOR_DECL(vnet_iter); int error = 0; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_pfsync_cloner = if_clone_simple(pfsyncname, pfsync_clone_create, pfsync_clone_destroy, 1); error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif, SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie); CURVNET_RESTORE(); if (error) goto fail_locked; } VNET_LIST_RUNLOCK(); #ifdef INET error = pf_proto_register(PF_INET, &in_pfsync_protosw); if (error) goto fail; error = ipproto_register(IPPROTO_PFSYNC); if (error) { pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); goto fail; } #endif pfsync_pointers_init(); return (0); fail: VNET_LIST_RLOCK(); fail_locked: VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (V_pfsync_swi_cookie) { swi_remove(V_pfsync_swi_cookie); if_clone_detach(V_pfsync_cloner); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (error); } static void pfsync_uninit() { VNET_ITERATOR_DECL(vnet_iter); pfsync_pointers_uninit(); ipproto_unregister(IPPROTO_PFSYNC); pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if_clone_detach(V_pfsync_cloner); swi_remove(V_pfsync_swi_cookie); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } static int pfsync_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: error = pfsync_init(); break; case MOD_QUIESCE: /* * Module should not be unloaded due to race conditions. */ error = EBUSY; break; case MOD_UNLOAD: pfsync_uninit(); break; default: error = EINVAL; break; } return (error); } static moduledata_t pfsync_mod = { pfsyncname, pfsync_modevent, 0 }; #define PFSYNC_MODVER 1 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_VERSION(pfsync, PFSYNC_MODVER); MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);