Index: head/sys/net/if_var.h =================================================================== --- head/sys/net/if_var.h (revision 360474) +++ head/sys/net/if_var.h (revision 360475) @@ -1,805 +1,807 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)if.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_VAR_H_ #define _NET_IF_VAR_H_ /* * Structures defining a network interface, providing a packet * transport mechanism (ala level 0 of the PUP protocols). * * Each interface accepts output datagrams of a specified maximum * length, and provides higher level routines with input datagrams * received from its medium. * * Output occurs when the routine if_output is called, with three parameters: * (*ifp->if_output)(ifp, m, dst, ro) * Here m is the mbuf chain to be sent and dst is the destination address. * The output routine encapsulates the supplied datagram if necessary, * and then transmits it on its medium. * * On input, each interface unwraps the data received by it, and either * places it on the input queue of an internetwork datagram routine * and posts the associated software interrupt, or passes the datagram to a raw * packet input routine. * * Routines exist for locating interfaces by their addresses * or for locating an interface on a certain network, as well as more general * routing and gateway routines maintaining information used to locate * interfaces. These routines live in the files if.c and route.c */ struct rtentry; /* ifa_rtrequest */ +struct nhop_object; /* ifa_rtrequest */ struct rt_addrinfo; /* ifa_rtrequest */ struct socket; struct carp_if; struct carp_softc; struct ifvlantrunk; struct route; /* if_output */ struct vnet; struct ifmedia; struct netmap_adapter; struct debugnet_methods; #ifdef _KERNEL #include #include /* ifqueue only? */ #include #include #endif /* _KERNEL */ #include #include #include #include /* XXX */ #include /* struct ifqueue */ #include /* XXX */ #include /* XXX */ #include /* if_link_task */ #define IF_DUNIT_NONE -1 #include CK_STAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */ CK_STAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */ CK_STAILQ_HEAD(ifmultihead, ifmultiaddr); CK_STAILQ_HEAD(ifgrouphead, ifg_group); #ifdef _KERNEL VNET_DECLARE(struct pfil_head *, link_pfil_head); #define V_link_pfil_head VNET(link_pfil_head) #define PFIL_ETHER_NAME "ethernet" #define HHOOK_IPSEC_INET 0 #define HHOOK_IPSEC_INET6 1 #define HHOOK_IPSEC_COUNT 2 VNET_DECLARE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DECLARE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); #define V_ipsec_hhh_in VNET(ipsec_hhh_in) #define V_ipsec_hhh_out VNET(ipsec_hhh_out) #endif /* _KERNEL */ typedef enum { IFCOUNTER_IPACKETS = 0, IFCOUNTER_IERRORS, IFCOUNTER_OPACKETS, IFCOUNTER_OERRORS, IFCOUNTER_COLLISIONS, IFCOUNTER_IBYTES, IFCOUNTER_OBYTES, IFCOUNTER_IMCASTS, IFCOUNTER_OMCASTS, IFCOUNTER_IQDROPS, IFCOUNTER_OQDROPS, IFCOUNTER_NOPROTO, IFCOUNTERS /* Array size. */ } ift_counter; typedef struct ifnet * if_t; typedef void (*if_start_fn_t)(if_t); typedef int (*if_ioctl_fn_t)(if_t, u_long, caddr_t); typedef void (*if_init_fn_t)(void *); typedef void (*if_qflush_fn_t)(if_t); typedef int (*if_transmit_fn_t)(if_t, struct mbuf *); typedef uint64_t (*if_get_counter_t)(if_t, ift_counter); struct ifnet_hw_tsomax { u_int tsomaxbytes; /* TSO total burst length limit in bytes */ u_int tsomaxsegcount; /* TSO maximum segment count */ u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; /* Interface encap request types */ typedef enum { IFENCAP_LL = 1 /* pre-calculate link-layer header */ } ife_type; /* * The structure below allows to request various pre-calculated L2/L3 headers * for different media. Requests varies by type (rtype field). * * IFENCAP_LL type: pre-calculates link header based on address family * and destination lladdr. * * Input data fields: * buf: pointer to destination buffer * bufsize: buffer size * flags: IFENCAP_FLAG_BROADCAST if destination is broadcast * family: address family defined by AF_ constant. * lladdr: pointer to link-layer address * lladdr_len: length of link-layer address * hdata: pointer to L3 header (optional, used for ARP requests). * Output data fields: * buf: encap data is stored here * bufsize: resulting encap length is stored here * lladdr_off: offset of link-layer address from encap hdr start * hdata: L3 header may be altered if necessary */ struct if_encap_req { u_char *buf; /* Destination buffer (w) */ size_t bufsize; /* size of provided buffer (r) */ ife_type rtype; /* request type (r) */ uint32_t flags; /* Request flags (r) */ int family; /* Address family AF_* (r) */ int lladdr_off; /* offset from header start (w) */ int lladdr_len; /* lladdr length (r) */ char *lladdr; /* link-level address pointer (r) */ char *hdata; /* Upper layer header data (rw) */ }; #define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ /* * Network interface send tag support. The storage of "struct * m_snd_tag" comes from the network driver and it is free to allocate * as much additional space as it wants for its own use. */ struct ktls_session; struct m_snd_tag; #define IF_SND_TAG_TYPE_RATE_LIMIT 0 #define IF_SND_TAG_TYPE_UNLIMITED 1 #define IF_SND_TAG_TYPE_TLS 2 #define IF_SND_TAG_TYPE_MAX 3 struct if_snd_tag_alloc_header { uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ uint32_t flowid; /* mbuf hash value */ uint32_t flowtype; /* mbuf hash type */ uint8_t numa_domain; /* numa domain of associated inp */ }; struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ uint32_t flags; /* M_NOWAIT or M_WAITOK */ uint32_t reserved; /* alignment */ }; struct if_snd_tag_alloc_tls { struct if_snd_tag_alloc_header hdr; struct inpcb *inp; const struct ktls_session *tls; }; struct if_snd_tag_rate_limit_params { uint64_t max_rate; /* in bytes/s */ uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 uint32_t flags; /* M_NOWAIT or M_WAITOK */ }; union if_snd_tag_alloc_params { struct if_snd_tag_alloc_header hdr; struct if_snd_tag_alloc_rate_limit rate_limit; struct if_snd_tag_alloc_rate_limit unlimited; struct if_snd_tag_alloc_tls tls; }; union if_snd_tag_modify_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params rate_limit; struct if_snd_tag_rate_limit_params unlimited; }; /* Query return flags */ #define RT_NOSUPPORT 0x00000000 /* Not supported */ #define RT_IS_INDIRECT 0x00000001 /* * Interface like a lagg, select * the actual interface for * capabilities. */ #define RT_IS_SELECTABLE 0x00000002 /* * No rate table, you select * rates and the first * number_of_rates are created. */ #define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */ #define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */ #define RT_IS_SETUP_REQ 0x00000010 /* The interface setup must be called before use */ struct if_ratelimit_query_results { const uint64_t *rate_table; /* Pointer to table if present */ uint32_t flags; /* Flags indicating results */ uint32_t max_flows; /* Max flows using, 0=unlimited */ uint32_t number_of_rates; /* How many unique rates can be created */ uint32_t min_segment_burst; /* The amount the adapter bursts at each send */ }; typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); typedef void (if_ratelimit_query_t)(struct ifnet *, struct if_ratelimit_query_results *); typedef int (if_ratelimit_setup_t)(struct ifnet *, uint64_t, uint32_t); /* * Structure defining a network interface. */ struct ifnet { /* General book keeping of interface lists. */ CK_STAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained (CK_) */ LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */ CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */ /* protected by if_addr_lock */ u_char if_alloctype; /* if_type at time of allocation */ uint8_t if_numa_domain; /* NUMA domain of device */ /* Driver and protocol specific information that remains stable. */ void *if_softc; /* pointer to driver state */ void *if_llsoftc; /* link layer softc */ void *if_l2com; /* pointer to protocol bits */ const char *if_dname; /* driver name */ int if_dunit; /* unit or IF_DUNIT_NONE */ u_short if_index; /* numeric abbreviation for this if */ short if_index_reserved; /* spare space to grow if_index */ char if_xname[IFNAMSIZ]; /* external name (name + unit) */ char *if_description; /* interface description */ /* Variable fields that are touched by the stack and drivers. */ int if_flags; /* up/down, broadcast, etc. */ int if_drv_flags; /* driver-managed status flags */ int if_capabilities; /* interface features & capabilities */ int if_capenable; /* enabled features & capabilities */ void *if_linkmib; /* link-type-specific MIB data */ size_t if_linkmiblen; /* length of above data */ u_int if_refcount; /* reference count */ /* These fields are shared with struct if_data. */ uint8_t if_type; /* ethernet, tokenring, etc */ uint8_t if_addrlen; /* media address length */ uint8_t if_hdrlen; /* media header length */ uint8_t if_link_state; /* current link state */ uint32_t if_mtu; /* maximum transmission unit */ uint32_t if_metric; /* routing metric (external only) */ uint64_t if_baudrate; /* linespeed */ uint64_t if_hwassist; /* HW offload capabilities, see IFCAP */ time_t if_epoch; /* uptime at attach or stat reset */ struct timeval if_lastchange; /* time of last administrative change */ struct ifaltq if_snd; /* output queue (includes altq) */ struct task if_linktask; /* task for link change events */ struct task if_addmultitask; /* task for SIOCADDMULTI */ /* Addresses of different protocol families assigned to this if. */ struct mtx if_addr_lock; /* lock to protect address lists */ /* * if_addrhead is the list of all addresses associated to * an interface. * Some code in the kernel assumes that first element * of the list has type AF_LINK, and contains sockaddr_dl * addresses which store the link-level address and the name * of the interface. * However, access to the AF_LINK address through this * field is deprecated. Use if_addr or ifaddr_byindex() instead. */ struct ifaddrhead if_addrhead; /* linked list of addresses per if */ struct ifmultihead if_multiaddrs; /* multicast addresses configured */ int if_amcount; /* number of all-multicast requests */ struct ifaddr *if_addr; /* pointer to link-level address */ void *if_hw_addr; /* hardware link-level address */ const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */ struct mtx if_afdata_lock; void *if_afdata[AF_MAX]; int if_afdata_initialized; /* Additional features hung off the interface. */ u_int if_fib; /* interface FIB */ struct vnet *if_vnet; /* pointer to network stack instance */ struct vnet *if_home_vnet; /* where this ifnet originates from */ struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */ struct bpf_if *if_bpf; /* packet filter structure */ int if_pcount; /* number of promiscuous listeners */ void *if_bridge; /* bridge glue */ void *if_lagg; /* lagg glue */ void *if_pf_kif; /* pf glue */ struct carp_if *if_carp; /* carp interface structure */ struct label *if_label; /* interface MAC label */ struct netmap_adapter *if_netmap; /* netmap(4) softc */ /* Various procedures of the layer2 encapsulation and drivers. */ int (*if_output) /* output routine (enqueue) */ (struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); void (*if_input) /* input routine (from h/w driver) */ (struct ifnet *, struct mbuf *); struct mbuf *(*if_bridge_input)(struct ifnet *, struct mbuf *); int (*if_bridge_output)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*if_bridge_linkstate)(struct ifnet *ifp); if_start_fn_t if_start; /* initiate output routine */ if_ioctl_fn_t if_ioctl; /* ioctl routine */ if_init_fn_t if_init; /* Init routine */ int (*if_resolvemulti) /* validate/resolve multicast */ (struct ifnet *, struct sockaddr **, struct sockaddr *); if_qflush_fn_t if_qflush; /* flush any queue */ if_transmit_fn_t if_transmit; /* initiate output routine */ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ int (*if_requestencap) /* make link header from request */ (struct ifnet *, struct if_encap_req *); /* Statistics. */ counter_u64_t if_counters[IFCOUNTERS]; /* Stuff that's only temporary and doesn't belong here. */ /* * Network adapter TSO limits: * =========================== * * If the "if_hw_tsomax" field is zero the maximum segment * length limit does not apply. If the "if_hw_tsomaxsegcount" * or the "if_hw_tsomaxsegsize" field is zero the TSO segment * count limit does not apply. If all three fields are zero, * there is no TSO limit. * * NOTE: The TSO limits should reflect the values used in the * BUSDMA tag a network adapter is using to load a mbuf chain * for transmission. The TCP/IP network stack will subtract * space for all linklevel and protocol level headers and * ensure that the full mbuf chain passed to the network * adapter fits within the given limits. */ u_int if_hw_tsomax; /* TSO maximum size in bytes */ u_int if_hw_tsomaxsegcount; /* TSO maximum segment count */ u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ /* * Network adapter send tag support: */ if_snd_tag_alloc_t *if_snd_tag_alloc; if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; if_ratelimit_query_t *if_ratelimit_query; if_ratelimit_setup_t *if_ratelimit_setup; /* Ethernet PCP */ uint8_t if_pcp; /* * Debugnet (Netdump) hooks to be called while in db/panic. */ struct debugnet_methods *if_debugnet_methods; struct epoch_context if_epoch_ctx; /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ int if_ispare[4]; /* general use */ }; /* for compatibility with other BSDs */ #define if_name(ifp) ((ifp)->if_xname) #define IF_NODOM 255 /* * Locks for address lists on the network interface. */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) #define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) #ifdef _KERNEL /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t); /* interface address change event */ typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t); typedef void (*ifaddr_event_ext_handler_t)(void *, struct ifnet *, struct ifaddr *, int); EVENTHANDLER_DECLARE(ifaddr_event_ext, ifaddr_event_ext_handler_t); #define IFADDR_EVENT_ADD 0 #define IFADDR_EVENT_DEL 1 /* new interface arrival event */ typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t); /* interface departure event */ typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *); EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t); /* Interface link state change event */ typedef void (*ifnet_link_event_handler_t)(void *, struct ifnet *, int); EVENTHANDLER_DECLARE(ifnet_link_event, ifnet_link_event_handler_t); /* Interface up/down event */ #define IFNET_EVENT_UP 0 #define IFNET_EVENT_DOWN 1 #define IFNET_EVENT_PCP 2 /* priority code point, PCP */ typedef void (*ifnet_event_fn)(void *, struct ifnet *ifp, int event); EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn); /* * interface groups */ struct ifg_group { char ifg_group[IFNAMSIZ]; u_int ifg_refcnt; void *ifg_pf_kif; CK_STAILQ_HEAD(, ifg_member) ifg_members; /* (CK_) */ CK_STAILQ_ENTRY(ifg_group) ifg_next; /* (CK_) */ }; struct ifg_member { CK_STAILQ_ENTRY(ifg_member) ifgm_next; /* (CK_) */ struct ifnet *ifgm_ifp; }; struct ifg_list { struct ifg_group *ifgl_group; CK_STAILQ_ENTRY(ifg_list) ifgl_next; /* (CK_) */ }; #ifdef _SYS_EVENTHANDLER_H_ /* group attach event */ typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t); /* group detach event */ typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *); EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t); /* group change event */ typedef void (*group_change_event_handler_t)(void *, const char *); EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t); #endif /* _SYS_EVENTHANDLER_H_ */ #define IF_AFDATA_LOCK_INIT(ifp) \ mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF) #define IF_AFDATA_WLOCK(ifp) mtx_lock(&(ifp)->if_afdata_lock) #define IF_AFDATA_WUNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_lock) #define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ifp)->if_afdata_lock)) #define IF_AFDATA_WLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED) /* * 72 was chosen below because it is the size of a TCP/IP * header (40) + the minimum mss (32). */ #define IF_MINMTU 72 #define IF_MAXMTU 65535 #define TOEDEV(ifp) ((ifp)->if_llsoftc) /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, * are allocated and attached when an address is set, and are linked * together so all addresses for an interface can be located. * * NOTE: a 'struct ifaddr' is always at the beginning of a larger * chunk of malloc'ed memory, where we store the three addresses * (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here. */ struct ifaddr { struct sockaddr *ifa_addr; /* address of interface */ struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ struct sockaddr *ifa_netmask; /* used to determine subnet */ struct ifnet *ifa_ifp; /* back-pointer to interface */ struct carp_softc *ifa_carp; /* pointer to CARP data */ CK_STAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ - (int, struct rtentry *, struct rt_addrinfo *); + (int, struct rtentry *, struct nhop_object *, + struct rt_addrinfo *); u_short ifa_flags; /* mostly rt_flags for cloning */ #define IFA_ROUTE RTF_UP /* route installed */ #define IFA_RTSELF RTF_HOST /* loopback route to self installed */ u_int ifa_refcnt; /* references to this structure */ counter_u64_t ifa_ipackets; counter_u64_t ifa_opackets; counter_u64_t ifa_ibytes; counter_u64_t ifa_obytes; struct epoch_context ifa_epoch_ctx; }; struct ifaddr * ifa_alloc(size_t size, int flags); void ifa_free(struct ifaddr *ifa); void ifa_ref(struct ifaddr *ifa); /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. */ #define IFMA_F_ENQUEUED 0x1 struct ifmultiaddr { CK_STAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ struct sockaddr *ifma_addr; /* address this membership is for */ struct sockaddr *ifma_lladdr; /* link-layer translation, if any */ struct ifnet *ifma_ifp; /* back-pointer to interface */ u_int ifma_refcount; /* reference count */ int ifma_flags; void *ifma_protospec; /* protocol-specific state, if any */ struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */ struct epoch_context ifma_epoch_ctx; }; extern struct rwlock ifnet_rwlock; extern struct sx ifnet_sxlock; #define IFNET_WLOCK() do { \ sx_xlock(&ifnet_sxlock); \ rw_wlock(&ifnet_rwlock); \ } while (0) #define IFNET_WUNLOCK() do { \ rw_wunlock(&ifnet_rwlock); \ sx_xunlock(&ifnet_sxlock); \ } while (0) /* * To assert the ifnet lock, you must know not only whether it's for read or * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) /* * Look up an ifnet given its index; the _ref variant also acquires a * reference that must be freed using if_rele(). It is almost always a bug * to call ifnet_byindex() instead of ifnet_byindex_ref(). */ struct ifnet *ifnet_byindex(u_short idx); struct ifnet *ifnet_byindex_ref(u_short idx); /* * Given the index, ifaddr_byindex() returns the one and only * link-level ifaddr for the interface. You are not supposed to use * it to traverse the list of addresses associated to the interface. */ struct ifaddr *ifaddr_byindex(u_short idx); VNET_DECLARE(struct ifnethead, ifnet); VNET_DECLARE(struct ifgrouphead, ifg_head); VNET_DECLARE(int, if_index); VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */ #define V_ifnet VNET(ifnet) #define V_ifg_head VNET(ifg_head) #define V_if_index VNET(if_index) #define V_loif VNET(loif) #ifdef MCAST_VERBOSE #define MCDPRINTF printf #else #define MCDPRINTF(...) #endif int if_addgroup(struct ifnet *, const char *); int if_delgroup(struct ifnet *, const char *); int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **); int if_allmulti(struct ifnet *, int); struct ifnet* if_alloc(u_char); struct ifnet* if_alloc_dev(u_char, device_t dev); struct ifnet* if_alloc_domain(u_char, int numa_domain); void if_attach(struct ifnet *); void if_dead(struct ifnet *); int if_delmulti(struct ifnet *, struct sockaddr *); void if_delmulti_ifma(struct ifmultiaddr *); void if_delmulti_ifma_flags(struct ifmultiaddr *, int flags); void if_detach(struct ifnet *); void if_purgeaddrs(struct ifnet *); void if_delallmulti(struct ifnet *); void if_down(struct ifnet *); struct ifmultiaddr * if_findmulti(struct ifnet *, const struct sockaddr *); void if_freemulti(struct ifmultiaddr *ifma); void if_free(struct ifnet *); void if_initname(struct ifnet *, const char *, int); void if_link_state_change(struct ifnet *, int); int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3); void if_ref(struct ifnet *); void if_rele(struct ifnet *); int if_setlladdr(struct ifnet *, const u_char *, int); int if_tunnel_check_nesting(struct ifnet *, struct mbuf *, uint32_t, int); void if_up(struct ifnet *); int ifioctl(struct socket *, u_long, caddr_t, struct thread *); int ifpromisc(struct ifnet *, int); struct ifnet *ifunit(const char *); struct ifnet *ifunit_ref(const char *); int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *); int ifa_switch_loopback_route(struct ifaddr *, struct sockaddr *); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); int ifa_ifwithaddr_check(const struct sockaddr *); struct ifaddr *ifa_ifwithbroadaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *, int); struct ifaddr *ifa_ifwithnet(const struct sockaddr *, int, int); struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, struct sockaddr *, u_int); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); int ifa_preferred(struct ifaddr *, struct ifaddr *); int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen); typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp); typedef void if_com_free_t(void *com, u_char type); void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f); void if_deregister_com_alloc(u_char type); void if_data_copy(struct ifnet *, struct if_data *); uint64_t if_get_counter_default(struct ifnet *, ift_counter); void if_inc_counter(struct ifnet *, ift_counter, int64_t); #define IF_LLADDR(ifp) \ LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr)) uint64_t if_setbaudrate(if_t ifp, uint64_t baudrate); uint64_t if_getbaudrate(if_t ifp); int if_setcapabilities(if_t ifp, int capabilities); int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit); int if_getcapabilities(if_t ifp); int if_togglecapenable(if_t ifp, int togglecap); int if_setcapenable(if_t ifp, int capenable); int if_setcapenablebit(if_t ifp, int setcap, int clearcap); int if_getcapenable(if_t ifp); const char *if_getdname(if_t ifp); int if_setdev(if_t ifp, void *dev); int if_setdrvflagbits(if_t ifp, int if_setflags, int clear_flags); int if_getdrvflags(if_t ifp); int if_setdrvflags(if_t ifp, int flags); int if_clearhwassist(if_t ifp); int if_sethwassistbits(if_t ifp, int toset, int toclear); int if_sethwassist(if_t ifp, int hwassist_bit); int if_gethwassist(if_t ifp); int if_setsoftc(if_t ifp, void *softc); void *if_getsoftc(if_t ifp); int if_setflags(if_t ifp, int flags); int if_gethwaddr(if_t ifp, struct ifreq *); int if_setmtu(if_t ifp, int mtu); int if_getmtu(if_t ifp); int if_getmtu_family(if_t ifp, int family); int if_setflagbits(if_t ifp, int set, int clear); int if_getflags(if_t ifp); int if_sendq_empty(if_t ifp); int if_setsendqready(if_t ifp); int if_setsendqlen(if_t ifp, int tx_desc_count); int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax); int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount); int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize); u_int if_gethwtsomax(if_t ifp); u_int if_gethwtsomaxsegcount(if_t ifp); u_int if_gethwtsomaxsegsize(if_t ifp); int if_input(if_t ifp, struct mbuf* sendmp); int if_sendq_prepend(if_t ifp, struct mbuf *m); struct mbuf *if_dequeue(if_t ifp); int if_setifheaderlen(if_t ifp, int len); void if_setrcvif(struct mbuf *m, if_t ifp); void if_setvtag(struct mbuf *m, u_int16_t tag); u_int16_t if_getvtag(struct mbuf *m); int if_vlantrunkinuse(if_t ifp); caddr_t if_getlladdr(if_t ifp); void *if_gethandle(u_char); void if_bpfmtap(if_t ifp, struct mbuf *m); void if_etherbpfmtap(if_t ifp, struct mbuf *m); void if_vlancap(if_t ifp); /* * Traversing through interface address lists. */ struct sockaddr_dl; typedef u_int iflladdr_cb_t(void *, struct sockaddr_dl *, u_int); u_int if_foreach_lladdr(if_t, iflladdr_cb_t, void *); u_int if_foreach_llmaddr(if_t, iflladdr_cb_t, void *); u_int if_lladdr_count(if_t); u_int if_llmaddr_count(if_t); int if_getamcount(if_t ifp); struct ifaddr * if_getifaddr(if_t ifp); /* Functions */ void if_setinitfn(if_t ifp, void (*)(void *)); void if_setioctlfn(if_t ifp, int (*)(if_t, u_long, caddr_t)); void if_setstartfn(if_t ifp, void (*)(if_t)); void if_settransmitfn(if_t ifp, if_transmit_fn_t); void if_setqflushfn(if_t ifp, if_qflush_fn_t); void if_setgetcounterfn(if_t ifp, if_get_counter_t); /* Revisit the below. These are inline functions originally */ int drbr_inuse_drv(if_t ifp, struct buf_ring *br); struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br); int drbr_needs_enqueue_drv(if_t ifp, struct buf_ring *br); int drbr_enqueue_drv(if_t ifp, struct buf_ring *br, struct mbuf *m); /* TSO */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *); int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *); /* accessors for struct ifreq */ void *ifr_data_get_ptr(void *ifrp); void *ifr_buffer_get_buffer(void *data); size_t ifr_buffer_get_length(void *data); int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); #ifdef DEVICE_POLLING enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS }; typedef int poll_handler_t(if_t ifp, enum poll_cmd cmd, int count); int ether_poll_register(poll_handler_t *h, if_t ifp); int ether_poll_deregister(if_t ifp); #endif /* DEVICE_POLLING */ #endif /* _KERNEL */ #include /* XXXAO: temporary unconditional include */ #endif /* !_NET_IF_VAR_H_ */ Index: head/sys/net/route.c =================================================================== --- head/sys/net/route.c (revision 360474) +++ head/sys/net/route.c (revision 360475) @@ -1,2368 +1,2369 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); VNET_PCPUSTAT_SYSINIT(rtstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rtstat); #endif VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) VNET_DEFINE_STATIC(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) EVENTHANDLER_LIST_DEFINE(rt_addrmsg); static int rt_getifa_fib(struct rt_addrinfo *, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt); static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt); static int change_route(struct rib_head *, struct rt_addrinfo *, struct rtentry **); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds (0 <= %d < %d)", __func__, table, rt_numfibs)); KASSERT(fam >= 0 && fam < (AF_MAX + 1), ("%s: fam out of bounds (0 <= %d < %d)", __func__, fam, AF_MAX+1)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } u_int rt_tables_get_gen(int table, int fam) { struct rib_head *rnh; rnh = *rt_tables_get_rnh_ptr(table, fam); KASSERT(rnh != NULL, ("%s: NULL rib_head pointer table %d fam %d", __func__, table, fam)); return (rnh->rnh_gen); } /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; nhops_init(); } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0, table); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset, int family, u_int fibnum) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Save metadata associated with this routing table. */ rh->rib_family = family; rh->rib_fibnum = fibnum; #ifdef VIMAGE rh->rib_vnet = curvnet; #endif tmproutes_init(rh); /* Init locks */ RIB_LOCK_INIT(rh); nhops_init_rib(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); nhops_destroy_rib(rh); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RLOCK(rh); #ifdef INVARIANTS else RIB_LOCK_ASSERT(rh); #endif rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); return (newrt); } else if ((ignflags & RTF_RNH_LOCKED) == 0) RIB_RUNLOCK(rh); /* * Either we hit the root or could not find any match, * which basically means: "cannot get there from here". */ miss: RTSTAT_INC(rts_unreach); if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ R_Free(rt_key(rt)); /* Unreference nexthop */ nhop_free(rt->rt_nhop); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Temporary RTFREE() function wrapper. * Intended to use in control plane code to * avoid exposing internal layout of 'struct rtentry'. */ void rtfree_func(struct rtentry *rt) { RTFREE(rt); } /* * Adds a temporal redirect entry to the routing table. * @fibnum: fib number * @dst: destination to install redirect to * @gateway: gateway to go via * @author: sockaddr of originating router, can be NULL * @ifp: interface to use for the redirected route * @flags: set of flags to add. Allowed: RTF_GATEWAY * @lifetime_sec: time in seconds to expire this redirect. * * Retuns 0 on success, errno otherwise. */ int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec) { struct rtentry *rt; int error; struct rt_addrinfo info; struct rt_metrics rti_rmx; struct ifaddr *ifa; NET_EPOCH_ASSERT(); if (rt_tables_get_rnh(fibnum, dst->sa_family) == NULL) return (EAFNOSUPPORT); /* Verify the allowed flag mask. */ KASSERT(((flags & ~(RTF_GATEWAY)) == 0), ("invalid redirect flags: %x", flags)); /* Get the best ifa for the given interface and gateway. */ if ((ifa = ifaof_ifpforaddr(gateway, ifp)) == NULL) return (ENETUNREACH); ifa_ref(ifa); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_ifa = ifa; info.rti_ifp = ifp; info.rti_flags = flags | RTF_HOST | RTF_DYNAMIC; /* Setup route metrics to define expire time. */ bzero(&rti_rmx, sizeof(rti_rmx)); /* Set expire time as absolute. */ rti_rmx.rmx_expire = lifetime_sec + time_second; info.rti_mflags |= RTV_EXPIRE; info.rti_rmx = &rti_rmx; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); ifa_free(ifa); if (error != 0) { /* TODO: add per-fib redirect stats. */ return (error); } RT_LOCK(rt); flags = rt->rt_flags; RTFREE_LOCKED(rt); RTSTAT_INC(rts_dynamic); /* Send notification of a route addition to userland. */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_AUTHOR] = author; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); return (0); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; NET_EPOCH_ASSERT(); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt; rt = rtalloc1_fib(gateway, 0, flags, fibnum); if (rt == NULL) goto out; /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) goto out; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } out: return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp and rt_ifa. * * Returns 0 on success. */ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; struct nhop_object *nh; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = &rt->rt_nhop->gw_sa; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa; info->rti_addrs |= RTA_GATEWAY; } } nh = rt->rt_nhop; rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = nh->nh_mtu; } info->rti_flags = rt->rt_flags | nhop_get_rtflags(nh); info->rti_ifp = nh->nh_ifp; info->rti_ifa = nh->nh_ifa; if (flags & NHR_REF) { if_ref(info->rti_ifp); ifa_ref(info->rti_ifa); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * If @flags contains NHR_REF, refcouting is performed on rt_ifp and rt_ifa. * All references can be released later by calling rib_free_info(). * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rt->rt_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { ifa_free(info->rti_ifa); if_rele(info->rti_ifp); } /* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; int error; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; error = 0; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa; rt = rt_unlinkrte(di->rnh, info, &error); if (rt == NULL) { /* Either not allowed or not matched. Skip entry */ return (0); } /* Entry was unlinked. Add to the list and return */ rt->rt_chain = di->head; di->head = rt; return (0); } /* * Iterates over a routing table specified by @fibnum and @family and * deletes elements marked by @filter_f. * @fibnum: rtable id * @family: AF_ address family * @filter_f: function returning non-zero value for items to delete * @arg: data to pass to the @filter_f function * @report: true if rtsock notification is needed. */ void rib_walk_del(u_int fibnum, int family, rt_filter_f_t *filter_f, void *arg, bool report) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) return; /* We might have something to reclaim. */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); if (report) rt_routemsg(RTM_DELETE, rt, rt->rt_ifp, 0, fibnum); RTFREE_LOCKED(rt); } } /* * Iterates over all existing fibs in system and deletes each element * for which @filter_f function returns non-zero value. * If @family is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int family, rt_filter_f_t *filter_f, void *arg) { u_int fibnum; int i, start, end; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (family != AF_UNSPEC) { start = family; end = family; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { if (rt_tables_get_rnh(fibnum, i) == NULL) continue; rib_walk_del(fibnum, i, filter_f, arg, 0); } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * nh pointer to nhop * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { struct ifnet *ifp = arg; if (nh->nh_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes_af(struct ifnet *ifp, int af) { KASSERT((af >= 1 && af <= AF_MAX), ("%s: af %d not >= 1 and <= %d", __func__, af, AF_MAX)); rt_foreach_fib_walk_del(af, rt_ifdelroute, ifp); } void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */ static struct rtentry * rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) { struct sockaddr *dst, *netmask; struct rtentry *rt; struct radix_node *rn; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt == NULL) { *perror = ESRCH; return (NULL); } if ((info->rti_flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ if (rt->rt_flags & RTF_PINNED) { *perror = EADDRINUSE; return (NULL); } } if (info->rti_filter != NULL) { if (info->rti_filter(rt, rt->rt_nhop, info->rti_filterdata)==0){ /* Not matched */ *perror = ENOENT; return (NULL); } /* * Filter function requested rte deletion. * Ease the caller work by filling in remaining info * from that particular entry. */ info->rti_info[RTAX_GATEWAY] = &rt->rt_nhop->gw_sa; } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ *perror = ESRCH; #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) rn = rt_mpath_unlink(rnh, info, rt, perror); else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); if (rn == NULL) return (NULL); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; *perror = 0; return (rt); } static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) - ifa->ifa_rtrequest(RTM_DELETE, rt, info); + ifa->ifa_rtrequest(RTM_DELETE, rt, rt->rt_nhop, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. * * Assume basic consistency checks are executed by callers: * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct epoch_tracker et; int needref, error; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ error = 0; needref = (info->rti_ifa == NULL); NET_EPOCH_ENTER(et); /* If we have interface specified by the ifindex in the address, use it */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)ifpaddr; if (sdl->sdl_index != 0) info->rti_ifp = ifnet_byindex(sdl->sdl_index); } /* * If we have source address specified, try to find it * TODO: avoid enumerating all ifas on all interfaces. */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; /* * Most common use case for the userland-supplied routes. * * Choose sockaddr to select ifa. * -- if ifp is set -- * Order of preference: * 1) IFA address * 2) gateway address * Note: for interface routes link-level gateway address * is specified to indicate the interface index without * specifying RTF_GATEWAY. In this case, ignore gateway * Note: gateway AF may be different from dst AF. In this case, * ignore gateway * 3) final destination. * 4) if all of these fails, try to get at least link-level ifa. * -- else -- * try to lookup gateway or dst in the routing table to get ifa */ if (info->rti_info[RTAX_IFA] != NULL) sa = info->rti_info[RTAX_IFA]; else if ((info->rti_flags & RTF_GATEWAY) != 0 && gateway->sa_family == dst->sa_family) sa = gateway; else sa = dst; if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); /* Case 4 */ if (info->rti_ifa == NULL && gateway != NULL) info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if (needref && info->rti_ifa != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = info->rti_ifa->ifa_ifp; ifa_ref(info->rti_ifa); } else error = ENETUNREACH; NET_EPOCH_EXIT(et); return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); nhops_update_ifmtu(rnh, ifp, ifmtu.mtu); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, &rt->rt_nhop->gw_sa); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_nhop->gw_sa.sa_len != gw->sa_len || memcmp(&rt->rt_nhop->gw_sa, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct epoch_tracker et; const struct sockaddr *dst; struct rib_head *rnh; int error; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((info->rti_flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); dst = info->rti_info[RTAX_DST]; switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (info->rti_flags & RTF_HOST) info->rti_info[RTAX_NETMASK] = NULL; error = 0; switch (req) { case RTM_DELETE: error = del_route(rnh, info, ret_nrt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: error = add_route(rnh, info, ret_nrt); break; case RTM_CHANGE: NET_EPOCH_ENTER(et); RIB_WLOCK(rnh); error = change_route(rnh, info, ret_nrt); RIB_WUNLOCK(rnh); NET_EPOCH_EXIT(et); break; default: error = EOPNOTSUPP; } return (error); } static int add_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt) { struct sockaddr *dst, *ndst, *gateway, *netmask; struct rtentry *rt, *rt_old; struct nhop_object *nh; struct radix_node *rn; struct ifaddr *ifa; int error, flags; struct epoch_tracker et; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; netmask = info->rti_info[RTAX_NETMASK]; flags = info->rti_flags; if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, rnh->rib_fibnum); if (error) return (error); } else { ifa_ref(info->rti_ifa); } NET_EPOCH_ENTER(et); error = nhop_create_from_info(rnh, info, &nh); NET_EPOCH_EXIT(et); if (error != 0) { ifa_free(info->rti_ifa); return (error); } rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(info->rti_ifa); nhop_free(nh); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = rnh->rib_fibnum; rt->rt_nhop = nh; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(info->rti_ifa); nhop_free(nh); uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ ifa = info->rti_ifa; rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); nhop_free(nh); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); if (rn != NULL && rt->rt_expire > 0) tmproutes_update(rnh, rt); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); nhop_free(nh); uma_zfree(V_rtzone, rt); return (EEXIST); } if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_ADD, rt, info); + ifa->ifa_rtrequest(RTM_ADD, rt, rt->rt_nhop, info); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); return (0); } static int del_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt) { struct sockaddr *dst, *netmask; struct sockaddr_storage mdst; struct rtentry *rt; int error; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; if (netmask) { if (dst->sa_len > sizeof(mdst)) return (EINVAL); rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); return (0); } static int change_route(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct nhop_object *nh; struct if_mtuinfo ifmtu; RIB_WLOCK_ASSERT(rnh); rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif nh = NULL; RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { /* * XXX: Temporarily set RTF_RNH_LOCKED flag in the rti_flags * to avoid rlock in the ifa_ifwithroute(). */ info->rti_flags |= RTF_RNH_LOCKED; error = rt_getifa_fib(info, rnh->rib_fibnum); info->rti_flags &= ~RTF_RNH_LOCKED; if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } error = nhop_create_from_nhop(rnh, rt->rt_nhop, info, &nh); if (error != 0) goto bad; /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL) { if (rt->rt_ifa->ifa_rtrequest != NULL) - rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); + rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, rt->rt_nhop, + info); ifa_free(rt->rt_ifa); rt->rt_ifa = NULL; } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, nh, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } /* Update nexthop */ nhop_free(rt->rt_nhop); rt->rt_nhop = nh; nh = NULL; /* * This route change may have modified the route's gateway. In that * case, any inpcbs that have cached this route need to invalidate their * llentry cache. */ rnh->rnh_gen++; if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (nh != NULL) nhop_free(nh); if (free_ifa != 0) { ifa_free(info->rti_ifa); info->rti_ifa = NULL; } return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. */ rt->rt_flags |= RTF_FIXEDMTU; } else { /* * User explicitly sets MTU to 0. * Assume rollback to default. */ rt->rt_flags &= ~RTF_FIXEDMTU; } rt->rt_mtu = info->rti_rmx->rmx_mtu; } if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { RIB_RLOCK_TRACKER; struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; struct sockaddr_dl_short *sdl = NULL; struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } else if (cmd == RTM_ADD) { sdl = (struct sockaddr_dl_short *)tempbuf; bzero(sdl, sizeof(struct sockaddr_dl_short)); sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(struct sockaddr_dl_short); sdl->sdl_type = ifa->ifa_ifp->if_type; sdl->sdl_index = ifa->ifa_ifp->if_index; } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RIB_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_dl, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); EVENTHANDLER_DIRECT_INVOKE(rt_addrmsg, ifa, cmd); return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @rt: valid rtentry * @ifp: target route interface * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg(int cmd, struct rtentry *rt, struct ifnet *ifp, int rti_addrs, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, rt, ifp, 0, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @info: addrinfo structure with valid data. * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(info->rti_info[RTAX_DST] != NULL, (":%s: RTAX_DST must be supplied", __func__)); return (rtsock_routemsg_info(cmd, info, fibnum)); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, rt, ifa->ifa_ifp, 0, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: head/sys/netinet6/nd6.c =================================================================== --- head/sys/netinet6/nd6.c (revision 360474) +++ head/sys/netinet6/nd6.c (revision 360475) @@ -1,2656 +1,2656 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ VNET_DEFINE_STATIC(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ VNET_DEFINE_STATIC(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh, iflladdr_event_eh, ifnet_link_event_eh; VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(struct rwlock, nd6_lock); VNET_DEFINE(uint64_t, nd6_list_genid); VNET_DEFINE(struct mtx, nd6_onlink_mtx); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry **, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); -static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); +static void nd6_rtrequest(int, struct rtentry *, struct nhop_object *, + struct rt_addrinfo *); static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **); static int nd6_need_cache(struct ifnet *); VNET_DEFINE_STATIC(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE_STATIC(struct callout, nd6_timer_ch); #define V_nd6_timer_ch VNET(nd6_timer_ch) SYSCTL_DECL(_net_inet6_icmp6); static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; int fibnum; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ifp->if_fib; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, fibnum); } /* * A handler for interface link layer address change event. */ static void nd6_iflladdr(void *arg __unused, struct ifnet *ifp) { lltable_update_ifaddr(LLTABLE6(ifp)); } void nd6_init(void) { mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF); rw_init(&V_nd6_lock, "nd6 list"); LIST_INIT(&V_nd_prefix); nd6_defrouter_init(); /* Start timers. */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) { lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); ifnet_link_event_eh = EVENTHANDLER_REGISTER(ifnet_link_event, nd6_ifnet_link_event, NULL, EVENTHANDLER_PRI_ANY); } } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) { EVENTHANDLER_DEREGISTER(ifnet_link_event, ifnet_link_event_eh); EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); } rw_destroy(&V_nd6_lock); mtx_destroy(&V_nd6_onlink_mtx); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) { struct epoch_tracker et; struct ifaddr *ifa, *next; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; /* stop DAD processing */ nd6_dad_stop(ifa); } NET_EPOCH_EXIT(et); free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; ndi->maxmtu = ifp->if_mtu; /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Checks if we need to switch from STALE state. * * RFC 4861 requires switching from STALE to DELAY state * on first packet matching entry, waiting V_nd6_delay and * transition to PROBE state (if upper layer confirmation was * not received). * * This code performs a bit differently: * On packet hit we don't change state (but desired state * can be guessed by control plane). However, after V_nd6_delay * seconds code will transition to PROBE state (so DELAY state * is kinda skipped in most situations). * * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so * we perform the following upon entering STALE state: * * 1) Arm timer to run each V_nd6_delay seconds to make sure that * if packet was transmitted at the start of given interval, we * would be able to switch to PROBE state in V_nd6_delay seconds * as user expects. * * 2) Reschedule timer until original V_nd6_gctimer expires keeping * lle in STALE state (remaining timer value stored in lle_remtime). * * 3) Reschedule timer if packet was transmitted less that V_nd6_delay * seconds ago. * * Returns non-zero value if the entry is still STALE (storing * the next timer interval in @pdelay). * * Returns zero value if original timer expired or we need to switch to * PROBE (store that in @do_switch variable). */ static int nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch) { int nd_delay, nd_gctimer, r_skip_req; time_t lle_hittime; long delay; *do_switch = 0; nd_gctimer = V_nd6_gctimer; nd_delay = V_nd6_delay; LLE_REQ_LOCK(lle); r_skip_req = lle->r_skip_req; lle_hittime = lle->lle_hittime; LLE_REQ_UNLOCK(lle); if (r_skip_req > 0) { /* * Nonzero r_skip_req value was set upon entering * STALE state. Since value was not changed, no * packets were passed using this lle. Ask for * timer reschedule and keep STALE state. */ delay = (long)(MIN(nd_gctimer, nd_delay)); delay *= hz; if (lle->lle_remtime > delay) lle->lle_remtime -= delay; else { delay = lle->lle_remtime; lle->lle_remtime = 0; } if (delay == 0) { /* * The original ng6_gctime timeout ended, * no more rescheduling. */ return (0); } *pdelay = delay; return (1); } /* * Packet received. Verify timestamp */ delay = (long)(time_uptime - lle_hittime); if (delay < nd_delay) { /* * V_nd6_delay still not passed since the first * hit in STALE state. * Reshedule timer and return. */ *pdelay = (long)(nd_delay - delay) * hz; return (1); } /* Request switching to probe */ *do_switch = 1; return (0); } /* * Switch @lle state to new state optionally arming timers. * * Set noinline to be dtrace-friendly */ __noinline void nd6_llinfo_setstate(struct llentry *lle, int newstate) { struct ifnet *ifp; int nd_gctimer, nd_delay; long delay, remtime; delay = 0; remtime = 0; switch (newstate) { case ND6_LLINFO_INCOMPLETE: ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000; break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(lle)) { ifp = lle->lle_tbl->llt_ifp; delay = (long)ND_IFINFO(ifp)->reachable * hz; } break; case ND6_LLINFO_STALE: /* * Notify fast path that we want to know if any packet * is transmitted by setting r_skip_req. */ LLE_REQ_LOCK(lle); lle->r_skip_req = 1; LLE_REQ_UNLOCK(lle); nd_delay = V_nd6_delay; nd_gctimer = V_nd6_gctimer; delay = (long)(MIN(nd_gctimer, nd_delay)) * hz; remtime = (long)nd_gctimer * hz - delay; break; case ND6_LLINFO_DELAY: lle->la_asked = 0; delay = (long)V_nd6_delay * hz; break; } if (delay > 0) nd6_llinfo_settimer_locked(lle, delay); lle->lle_remtime = remtime; lle->ln_state = newstate; } /* * Timer-dependent part of nd state machine. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_llinfo_timer(void *arg) { struct epoch_tracker et; struct llentry *ln; struct in6_addr *dst, *pdst, *psrc, src; struct ifnet *ifp; struct nd_ifinfo *ndi; int do_switch, send_ns; long delay; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; ifp = lltable_get_ifp(ln->lle_tbl); CURVNET_SET(ifp->if_vnet); ND6_RLOCK(); LLE_WLOCK(ln); if (callout_pending(&ln->lle_timer)) { /* * Here we are a bit odd here in the treatment of * active/pending. If the pending bit is set, it got * rescheduled before I ran. The active * bit we ignore, since if it was stopped * in ll_tablefree() and was currently running * it would have return 0 so the code would * not have deleted it since the callout could * not be stopped so we want to go through * with the delete here now. If the callout * was restarted, the pending bit will be back on and * we just want to bail since the callout_reset would * return 1 and our reference would have been removed * by nd6_llinfo_settimer_locked above since canceled * would have been 1. */ LLE_WUNLOCK(ln); ND6_RUNLOCK(); CURVNET_RESTORE(); return; } NET_EPOCH_ENTER(et); ndi = ND_IFINFO(ifp); send_ns = 0; dst = &ln->r_l3addr.addr6; pdst = dst; if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { nd6_free(&ln, 0); goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; send_ns = 1; /* Send NS to multicast address */ pdst = NULL; } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * assuming every packet in la_hold has the * same IP header. Send error after unlock. */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } nd6_free(&ln, 0); if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); break; case ND6_LLINFO_STALE: if (nd6_is_stale(ln, &delay, &do_switch) != 0) { /* * No packet has used this entry and GC timeout * has not been passed. Reshedule timer and * return. */ nd6_llinfo_settimer_locked(ln, delay); break; } if (do_switch == 0) { /* * GC timer has ended and entry hasn't been used. * Run Garbage collector (RFC 4861, 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) nd6_free(&ln, 1); break; } /* Entry has been used AND delay timer has ended. */ /* FALLTHROUGH */ case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { nd6_free(&ln, 0); } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (ln != NULL) ND6_RUNLOCK(); if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct epoch_tracker et; struct nd_prhead prl; struct nd_prefix *pr, *npr; struct ifnet *ifp; struct in6_ifaddr *ia6, *nia6; uint64_t genid; LIST_INIT(&prl); NET_EPOCH_ENTER(et); nd6_defrouter_timer(); /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: CK_STAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ ifp = ia6->ia_ifp; if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0 && ((ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)){ ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } NET_EPOCH_EXIT(et); ND6_WLOCK(); restart: LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * Expire prefixes. Since the pltime is only used for * autoconfigured addresses, pltime processing for prefixes is * not necessary. * * Only unlink after all derived addresses have expired. This * may not occur until two hours after the prefix has expired * per RFC 4862. If the prefix expires before its derived * addresses, mark it off-link. This will be done automatically * after unlinking if no address references remain. */ if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME || time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime) continue; if (pr->ndpr_addrcnt == 0) { nd6_prefix_unlink(pr, &prl); continue; } if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { genid = V_nd6_list_genid; nd6_prefix_ref(pr); ND6_WUNLOCK(); ND6_ONLINK_LOCK(); (void)nd6_prefix_offlink(pr); ND6_ONLINK_UNLOCK(); ND6_WLOCK(); nd6_prefix_rele(pr); if (genid != V_nd6_list_genid) goto restart; } } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; NET_EPOCH_ASSERT(); ifp = ia6->ia_ifa.ifa_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Remove prefix and default router list entries corresponding to ifp. Neighbor * cache entries are freed in in6_domifdetach(). */ void nd6_purge(struct ifnet *ifp) { struct nd_prhead prl; struct nd_prefix *pr, *npr; LIST_INIT(&prl); /* Purge default router list entries toward ifp. */ nd6_defrouter_purge(ifp); ND6_WLOCK(); /* * Remove prefixes on ifp. We should have already removed addresses on * this interface, so no addresses should be referencing these prefixes. */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); /* Delete the unlinked prefix objects. */ while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select_fib(ifp->if_fib); } } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } static struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *ifa; struct rt_addrinfo info; struct sockaddr_in6 rt_key; const struct sockaddr *dst6; uint64_t genid; int error, fibnum; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } bzero(&rt_key, sizeof(rt_key)); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ ND6_RLOCK(); restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { dst6 = (const struct sockaddr *)&pr->ndpr_prefix; /* * We only need to check all FIBs if add_addr_allfibs * is unset. If set, checking any FIB will suffice. */ fibnum = V_rt_add_addr_allfibs ? rt_numfibs - 1 : 0; for (; fibnum < rt_numfibs; fibnum++) { genid = V_nd6_list_genid; ND6_RUNLOCK(); /* * Restore length field before * retrying lookup */ rt_key.sin6_len = sizeof(rt_key); error = rib_lookup_info(fibnum, dst6, 0, 0, &info); ND6_RLOCK(); if (genid != V_nd6_list_genid) goto restart; if (error == 0) break; } if (error != 0) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &rt_key.sin6_addr)) continue; } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) { ND6_RUNLOCK(); return (1); } } ND6_RUNLOCK(); /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ if (ifp->if_flags & IFF_POINTOPOINT) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sin6_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { NET_EPOCH_EXIT(et); return 1; } } NET_EPOCH_EXIT(et); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && nd6_defrouter_list_empty() && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; NET_EPOCH_ASSERT(); IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry **lnp, int gc) { struct ifnet *ifp; struct llentry *ln; struct nd_defrouter *dr; ln = *lnp; *lnp = NULL; LLE_WLOCK_ASSERT(ln); ND6_RLOCK_ASSERT(); ifp = lltable_get_ifp(ln->lle_tbl); if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp); else dr = NULL; ND6_RUNLOCK(); if ((ln->la_flags & LLE_DELETED) == 0) EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); defrouter_rele(dr); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select_fib() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select_fib() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select_fib(dr->ifp->if_fib); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); if (dr != NULL) defrouter_rele(dr); } static int nd6_isdynrte(const struct rtentry *rt, const struct nhop_object *nh, void *xap) { if (nh->nh_flags & NHF_REDIRECT) return (1); return (0); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rtrequest1_fib(RTM_DELETE, &info, NULL, fibnum); } /* * Rejuvenate this function for routing operations related * processing. */ void -nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) +nd6_rtrequest(int req, struct rtentry *rt, struct nhop_object *nh, + struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; - struct nhop_object *nh; - nh = rt->rt_nhop; gateway = &nh->gw6_sa; switch (req) { case RTM_ADD: break; case RTM_DELETE: /* * Only indirect routes are interesting. */ if ((nh->nh_flags & NHF_GATEWAY) == 0) return; /* * check for default route */ if (nh->nh_flags & NHF_DEFAULT) { dr = defrouter_lookup(&gateway->sin6_addr, nh->nh_ifp); if (dr != NULL) { dr->installed = 0; defrouter_rele(dr); } } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; struct epoch_tracker et; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intended for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } NET_EPOCH_EXIT(et); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } NET_EPOCH_EXIT(et); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } NET_EPOCH_EXIT(et); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } ND_IFINFO(ifp)->flags = ND.flags; break; } #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select_fib(RT_ALL_FIBS); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct in6_ifaddr *ia, *ia_next; struct nd_prefix *pr, *next; struct nd_prhead prl; LIST_INIT(&prl); ND6_WLOCK(); LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); /* XXXRW: in6_ifaddrhead locking. */ CK_STAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } nd6_prefix_del(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ defrouter_reset(); nd6_defrouter_flush_all(); defrouter_select_fib(RT_ALL_FIBS); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); NET_EPOCH_ENTER(et); ln = nd6_lookup(&nb_addr, 0, ifp); NET_EPOCH_EXIT(et); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + ln->lle_remtime / hz + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Calculates new isRouter value based on provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; NET_EPOCH_ASSERT(); IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; ln = nd6_lookup(from, flags, ifp); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it before insertion. */ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; if (lladdr != NULL) { /* (7) */ nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if static ndp is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { /* Entry was deleted */ return; } nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculates new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select_fib() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select_fib() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select_fib(ifp->if_fib); } } static void nd6_slowtimo(void *arg) { struct epoch_tracker et; CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and a sets * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds to ensure do neighbor unreachability * detection on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route *ro) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); return (error); } /* * Lookup link headerfor @sa_dst address. Stores found * data in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * If destination LLE does not exists or lle state modification * is required, call "slow" version. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; NET_EPOCH_ASSERT(); if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(ln->r_linkdata, desten, ln->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR); /* Check if we have feedback request from nd6 timer */ if (ln->r_skip_req != 0) { LLE_REQ_LOCK(ln); ln->r_skip_req = 0; /* Notify that entry was used */ ln->lle_hittime = time_uptime; LLE_REQ_UNLOCK(ln); } if (plle) { LLE_ADDREF(ln); *plle = ln; LLE_WUNLOCK(ln); } return (0); } else if (plle && ln) LLE_WUNLOCK(ln); return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle)); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Heavy version. * Function assume that destination LLE does not exist, * is invalid or stale, so LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns, ll_len; char *lladdr; NET_EPOCH_ASSERT(); /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { if (flags & LLE_ADDRONLY) { lladdr = lle->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = lle->r_linkdata; ll_len = lle->r_hdrlen; } bcopy(lladdr, desten, ll_len); if (pflags != NULL) *pflags = lle->la_flags; if (plle) { LLE_ADDREF(lle); *plle = lle; } LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for newly-created lle la_asked will be 0, * so we will transition from ND6_LLINFO_NOSTATE to * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } /* * Do L2 address resolution for @sa_dst address. Stores found * address in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags) { int error; flags |= LLE_ADDRONLY; error = nd6_resolve_slow(ifp, flags, NULL, (const struct sockaddr_in6 *)dst, desten, pflags, NULL); return (error); } int nd6_flush_holdchain(struct ifnet *ifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; int error = 0; m_head = chain; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = nd6_output_ifp(ifp, ifp, m, dst, NULL); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); } static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than Ethernet and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add pernament ND6 link-layer record for given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in different place * 2) It is called by IPv6 protocol stack in contrast to * arp_ifinit() which is typically called in SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any entry if exists */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); ND6_RLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) break; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) goto out; } } out: ND6_RUNLOCK(); return (error); } SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", "NDP prefix list"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");