Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: stable/8/sys/amd64/include/xen
===================================================================
--- stable/8/sys/amd64/include/xen (revision 209276)
+++ stable/8/sys/amd64/include/xen (revision 209277)
Property changes on: stable/8/sys/amd64/include/xen
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/amd64/include/xen:r208553
Index: stable/8/sys/cddl/contrib/opensolaris
===================================================================
--- stable/8/sys/cddl/contrib/opensolaris (revision 209276)
+++ stable/8/sys/cddl/contrib/opensolaris (revision 209277)
Property changes on: stable/8/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/cddl/contrib/opensolaris:r208553
Index: stable/8/sys/contrib/dev/acpica
===================================================================
--- stable/8/sys/contrib/dev/acpica (revision 209276)
+++ stable/8/sys/contrib/dev/acpica (revision 209277)
Property changes on: stable/8/sys/contrib/dev/acpica
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/dev/acpica:r208553
Index: stable/8/sys/contrib/pf
===================================================================
--- stable/8/sys/contrib/pf (revision 209276)
+++ stable/8/sys/contrib/pf (revision 209277)
Property changes on: stable/8/sys/contrib/pf
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/pf:r208553
Index: stable/8/sys/dev/xen/xenpci
===================================================================
--- stable/8/sys/dev/xen/xenpci (revision 209276)
+++ stable/8/sys/dev/xen/xenpci (revision 209277)
Property changes on: stable/8/sys/dev/xen/xenpci
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/dev/xen/xenpci:r208553
Index: stable/8/sys/geom/sched
===================================================================
--- stable/8/sys/geom/sched (revision 209276)
+++ stable/8/sys/geom/sched (revision 209277)
Property changes on: stable/8/sys/geom/sched
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/geom/sched:r208553
Index: stable/8/sys/net/if.c
===================================================================
--- stable/8/sys/net/if.c (revision 209276)
+++ stable/8/sys/net/if.c (revision 209277)
@@ -1,3484 +1,3485 @@
/*-
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_carp.h"
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/bus.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/refcount.h>
#include <sys/module.h>
#include <sys/rwlock.h>
#include <sys/sockio.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/domain.h>
#include <sys/jail.h>
#include <machine/stdarg.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/radix.h>
#include <net/route.h>
#include <net/vnet.h>
#if defined(INET) || defined(INET6)
/*XXX*/
#include <netinet/in.h>
#include <netinet/in_var.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#endif
#endif
#ifdef INET
#include <netinet/if_ether.h>
#endif
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#endif
#include <security/mac/mac_framework.h>
struct ifindex_entry {
struct ifnet *ife_ifnet;
};
static int slowtimo_started;
SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
/* Log link state change events */
static int log_link_state_change = 1;
SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
&log_link_state_change, 0,
"log interface link state change events");
/* Interface description */
static unsigned int ifdescr_maxlen = 1024;
SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
&ifdescr_maxlen, 0,
"administrative maximum length for interface description");
MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
/* global sx for non-critical path ifdescr */
static struct sx ifdescr_sx;
SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
void (*bstp_linkstate_p)(struct ifnet *ifp, int state);
void (*ng_ether_link_state_p)(struct ifnet *ifp, int state);
void (*lagg_linkstate_p)(struct ifnet *ifp, int state);
struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
/*
* XXX: Style; these should be sorted alphabetically, and unprototyped
* static functions should be prototyped. Currently they are sorted by
* declaration order.
*/
static void if_attachdomain(void *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, caddr_t);
static void if_freemulti(struct ifmultiaddr *);
static void if_init(void *);
static void if_grow(void);
static void if_check(void *);
static void if_route(struct ifnet *, int flag, int fam);
static int if_setflag(struct ifnet *, int, int, int *, int);
static void if_slowtimo(void *);
static int if_transmit(struct ifnet *ifp, struct mbuf *m);
static void if_unroute(struct ifnet *, int flag, int fam);
static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int if_rtdel(struct radix_node *, void *);
static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void do_link_state_change(void *, int);
static int if_getgroup(struct ifgroupreq *, struct ifnet *);
static int if_getgroupmembers(struct ifgroupreq *);
static void if_delgroups(struct ifnet *);
static void if_attach_internal(struct ifnet *, int);
static void if_detach_internal(struct ifnet *, int);
#ifdef INET6
/*
* XXX: declare here to avoid to include many inet6 related files..
* should be more generalized?
*/
extern void nd6_setmtu(struct ifnet *);
#endif
VNET_DEFINE(int, if_index);
int ifqmaxlen = IFQ_MAXLEN;
VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */
VNET_DEFINE(struct ifgrouphead, ifg_head);
static VNET_DEFINE(int, if_indexlim) = 8;
/* Table of ifnet by index. */
static VNET_DEFINE(struct ifindex_entry *, ifindex_table);
#define V_if_indexlim VNET(if_indexlim)
#define V_ifindex_table VNET(ifindex_table)
/*
* The global network interface list (V_ifnet) and related state (such as
* if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
* an rwlock. Either may be acquired shared to stablize the list, but both
* must be acquired writable to modify the list. This model allows us to
* both stablize the interface list during interrupt thread processing, but
* also to stablize it over long-running ioctls, without introducing priority
* inversions and deadlocks.
*/
struct rwlock ifnet_rwlock;
struct sx ifnet_sxlock;
/*
* The allocation of network interfaces is a rather non-atomic affair; we
* need to select an index before we are ready to expose the interface for
* use, so will use this pointer value to indicate reservation.
*/
#define IFNET_HOLD (void *)(uintptr_t)(-1)
static if_com_alloc_t *if_com_alloc[256];
static if_com_free_t *if_com_free[256];
/*
* System initialization
*/
SYSINIT(interface_check, SI_SUB_PROTO_IF, SI_ORDER_FIRST, if_check, NULL);
MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
struct ifnet *
ifnet_byindex_locked(u_short idx)
{
if (idx > V_if_index)
return (NULL);
if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD)
return (NULL);
return (V_ifindex_table[idx].ife_ifnet);
}
struct ifnet *
ifnet_byindex(u_short idx)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
ifp = ifnet_byindex_locked(idx);
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
struct ifnet *
ifnet_byindex_ref(u_short idx)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
ifp = ifnet_byindex_locked(idx);
if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
IFNET_RUNLOCK_NOSLEEP();
return (NULL);
}
if_ref(ifp);
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
/*
* Allocate an ifindex array entry; return 0 on success or an error on
* failure.
*/
static int
ifindex_alloc_locked(u_short *idxp)
{
u_short idx;
IFNET_WLOCK_ASSERT();
/*
* Try to find an empty slot below V_if_index. If we fail, take the
* next slot.
*/
for (idx = 1; idx <= V_if_index; idx++) {
if (V_ifindex_table[idx].ife_ifnet == NULL)
break;
}
/* Catch if_index overflow. */
if (idx < 1)
return (ENOSPC);
if (idx > V_if_index)
V_if_index = idx;
if (V_if_index >= V_if_indexlim)
if_grow();
*idxp = idx;
return (0);
}
static void
ifindex_free_locked(u_short idx)
{
IFNET_WLOCK_ASSERT();
V_ifindex_table[idx].ife_ifnet = NULL;
while (V_if_index > 0 &&
V_ifindex_table[V_if_index].ife_ifnet == NULL)
V_if_index--;
}
static void
ifindex_free(u_short idx)
{
IFNET_WLOCK();
ifindex_free_locked(idx);
IFNET_WUNLOCK();
}
static void
ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp)
{
IFNET_WLOCK_ASSERT();
V_ifindex_table[idx].ife_ifnet = ifp;
}
static void
ifnet_setbyindex(u_short idx, struct ifnet *ifp)
{
IFNET_WLOCK();
ifnet_setbyindex_locked(idx, ifp);
IFNET_WUNLOCK();
}
struct ifaddr *
ifaddr_byindex(u_short idx)
{
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
ifa = ifnet_byindex_locked(idx)->if_addr;
if (ifa != NULL)
ifa_ref(ifa);
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
static void
vnet_if_init(const void *unused __unused)
{
TAILQ_INIT(&V_ifnet);
TAILQ_INIT(&V_ifg_head);
if_grow(); /* create initial table */
vnet_if_clone_init();
}
VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_init,
NULL);
/* ARGSUSED*/
static void
if_init(void *dummy __unused)
{
IFNET_LOCK_INIT();
if_clone_init();
}
SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL);
#ifdef VIMAGE
static void
vnet_if_uninit(const void *unused __unused)
{
VNET_ASSERT(TAILQ_EMPTY(&V_ifnet));
VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head));
free((caddr_t)V_ifindex_table, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
vnet_if_uninit, NULL);
#endif
static void
if_grow(void)
{
u_int n;
struct ifindex_entry *e;
V_if_indexlim <<= 1;
n = V_if_indexlim * sizeof(*e);
e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
if (V_ifindex_table != NULL) {
memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
free((caddr_t)V_ifindex_table, M_IFNET);
}
V_ifindex_table = e;
}
static void
if_check(void *dummy __unused)
{
/*
* If at least one interface added during boot uses
* if_watchdog then start the timer.
*/
if (slowtimo_started)
if_slowtimo(0);
}
/*
* Allocate a struct ifnet and an index for an interface. A layer 2
* common structure will also be allocated if an allocation routine is
* registered for the passed type.
*/
struct ifnet *
if_alloc(u_char type)
{
struct ifnet *ifp;
u_short idx;
ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
IFNET_WLOCK();
if (ifindex_alloc_locked(&idx) != 0) {
IFNET_WUNLOCK();
free(ifp, M_IFNET);
return (NULL);
}
ifnet_setbyindex_locked(idx, IFNET_HOLD);
IFNET_WUNLOCK();
ifp->if_index = idx;
ifp->if_type = type;
ifp->if_alloctype = type;
if (if_com_alloc[type] != NULL) {
ifp->if_l2com = if_com_alloc[type](type, ifp);
if (ifp->if_l2com == NULL) {
free(ifp, M_IFNET);
ifindex_free(idx);
return (NULL);
}
}
IF_ADDR_LOCK_INIT(ifp);
TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
ifp->if_afdata_initialized = 0;
IF_AFDATA_LOCK_INIT(ifp);
TAILQ_INIT(&ifp->if_addrhead);
TAILQ_INIT(&ifp->if_prefixhead);
TAILQ_INIT(&ifp->if_multiaddrs);
TAILQ_INIT(&ifp->if_groups);
#ifdef MAC
mac_ifnet_init(ifp);
#endif
ifq_init(&ifp->if_snd, ifp);
refcount_init(&ifp->if_refcount, 1); /* Index reference. */
ifnet_setbyindex(ifp->if_index, ifp);
return (ifp);
}
/*
* Do the actual work of freeing a struct ifnet, associated index, and layer
* 2 common structure. This call is made when the last reference to an
* interface is released.
*/
static void
if_free_internal(struct ifnet *ifp)
{
KASSERT((ifp->if_flags & IFF_DYING),
("if_free_internal: interface not dying"));
IFNET_WLOCK();
KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
("%s: freeing unallocated ifnet", ifp->if_xname));
ifindex_free_locked(ifp->if_index);
IFNET_WUNLOCK();
if (if_com_free[ifp->if_alloctype] != NULL)
if_com_free[ifp->if_alloctype](ifp->if_l2com,
ifp->if_alloctype);
#ifdef MAC
mac_ifnet_destroy(ifp);
#endif /* MAC */
if (ifp->if_description != NULL)
free(ifp->if_description, M_IFDESCR);
IF_AFDATA_DESTROY(ifp);
IF_ADDR_LOCK_DESTROY(ifp);
ifq_delete(&ifp->if_snd);
free(ifp, M_IFNET);
}
/*
* This version should only be called by intefaces that switch their type
* after calling if_alloc(). if_free_type() will go away again now that we
* have if_alloctype to cache the original allocation type. For now, assert
* that they match, since we require that in practice.
*/
void
if_free_type(struct ifnet *ifp, u_char type)
{
KASSERT(ifp->if_alloctype == type,
("if_free_type: type (%d) != alloctype (%d)", type,
ifp->if_alloctype));
ifp->if_flags |= IFF_DYING; /* XXX: Locking */
if (!refcount_release(&ifp->if_refcount))
return;
if_free_internal(ifp);
}
/*
* This is the normal version of if_free(), used by device drivers to free a
* detached network interface. The contents of if_free_type() will move into
* here when if_free_type() goes away.
*/
void
if_free(struct ifnet *ifp)
{
if_free_type(ifp, ifp->if_alloctype);
}
/*
* Interfaces to keep an ifnet type-stable despite the possibility of the
* driver calling if_free(). If there are additional references, we defer
* freeing the underlying data structure.
*/
void
if_ref(struct ifnet *ifp)
{
/* We don't assert the ifnet list lock here, but arguably should. */
refcount_acquire(&ifp->if_refcount);
}
void
if_rele(struct ifnet *ifp)
{
if (!refcount_release(&ifp->if_refcount))
return;
if_free_internal(ifp);
}
void
ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
{
mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
if (ifq->ifq_maxlen == 0)
ifq->ifq_maxlen = ifqmaxlen;
ifq->altq_type = 0;
ifq->altq_disc = NULL;
ifq->altq_flags &= ALTQF_CANTCHANGE;
ifq->altq_tbr = NULL;
ifq->altq_ifp = ifp;
}
void
ifq_delete(struct ifaltq *ifq)
{
mtx_destroy(&ifq->ifq_mtx);
}
/*
* Perform generic interface initalization tasks and attach the interface
* to the list of "active" interfaces. If vmove flag is set on entry
* to if_attach_internal(), perform only a limited subset of initialization
* tasks, given that we are moving from one vnet to another an ifnet which
* has already been fully initialized.
*
* XXX:
* - The decision to return void and thus require this function to
* succeed is questionable.
* - We should probably do more sanity checking. For instance we don't
* do anything to insure if_xname is unique or non-empty.
*/
void
if_attach(struct ifnet *ifp)
{
if_attach_internal(ifp, 0);
}
static void
if_attach_internal(struct ifnet *ifp, int vmove)
{
unsigned socksize, ifasize;
int namelen, masklen;
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
ifp->if_xname);
#ifdef VIMAGE
ifp->if_vnet = curvnet;
if (ifp->if_home_vnet == NULL)
ifp->if_home_vnet = curvnet;
#endif
if_addgroup(ifp, IFG_ALL);
getmicrotime(&ifp->if_lastchange);
ifp->if_data.ifi_epoch = time_uptime;
ifp->if_data.ifi_datalen = sizeof(struct if_data);
KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
(ifp->if_transmit != NULL && ifp->if_qflush != NULL),
("transmit and qflush must both either be set or both be NULL"));
if (ifp->if_transmit == NULL) {
ifp->if_transmit = if_transmit;
ifp->if_qflush = if_qflush;
}
if (!vmove) {
#ifdef MAC
mac_ifnet_create(ifp);
#endif
/*
* Create a Link Level name for this device.
*/
namelen = strlen(ifp->if_xname);
/*
* Always save enough space for any possiable name so we
* can do a rename in place later.
*/
masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
socksize = masklen + ifp->if_addrlen;
if (socksize < sizeof(*sdl))
socksize = sizeof(*sdl);
socksize = roundup2(socksize, sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
ifa_init(ifa);
sdl = (struct sockaddr_dl *)(ifa + 1);
sdl->sdl_len = socksize;
sdl->sdl_family = AF_LINK;
bcopy(ifp->if_xname, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = ifp->if_type;
ifp->if_addr = ifa;
ifa->ifa_ifp = ifp;
ifa->ifa_rtrequest = link_rtrequest;
ifa->ifa_addr = (struct sockaddr *)sdl;
sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
ifa->ifa_netmask = (struct sockaddr *)sdl;
sdl->sdl_len = masklen;
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
/* Reliably crash if used uninitialized. */
ifp->if_broadcastaddr = NULL;
}
#ifdef VIMAGE
else {
/*
* Update the interface index in the link layer address
* of the interface.
*/
for (ifa = ifp->if_addr; ifa != NULL;
ifa = TAILQ_NEXT(ifa, ifa_link)) {
if (ifa->ifa_addr->sa_family == AF_LINK) {
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
sdl->sdl_index = ifp->if_index;
}
}
}
#endif
IFNET_WLOCK();
TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
#ifdef VIMAGE
curvnet->vnet_ifcnt++;
#endif
IFNET_WUNLOCK();
if (domain_init_status >= 2)
if_attachdomain1(ifp);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
if (!vmove && ifp->if_watchdog != NULL) {
if_printf(ifp,
"WARNING: using obsoleted if_watchdog interface\n");
/*
* Note that we need if_slowtimo(). If this happens after
* boot, then call if_slowtimo() directly.
*/
if (atomic_cmpset_int(&slowtimo_started, 0, 1) && !cold)
if_slowtimo(0);
}
}
static void
if_attachdomain(void *dummy)
{
struct ifnet *ifp;
int s;
s = splnet();
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
if_attachdomain1(ifp);
splx(s);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
if_attachdomain, NULL);
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
int s;
s = splnet();
/*
* Since dp->dom_ifattach calls malloc() with M_WAITOK, we
* cannot lock ifp->if_afdata initialization, entirely.
*/
if (IF_AFDATA_TRYLOCK(ifp) == 0) {
splx(s);
return;
}
if (ifp->if_afdata_initialized >= domain_init_status) {
IF_AFDATA_UNLOCK(ifp);
splx(s);
printf("if_attachdomain called more than once on %s\n",
ifp->if_xname);
return;
}
ifp->if_afdata_initialized = domain_init_status;
IF_AFDATA_UNLOCK(ifp);
/* address family dependent data region */
bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_ifattach)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
splx(s);
}
/*
* Remove any unicast or broadcast network addresses from an interface.
*/
void
if_purgeaddrs(struct ifnet *ifp)
{
struct ifaddr *ifa, *next;
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
if (ifa->ifa_addr->sa_family == AF_LINK)
continue;
#ifdef INET
/* XXX: Ugly!! ad hoc just for INET */
if (ifa->ifa_addr->sa_family == AF_INET) {
struct ifaliasreq ifr;
bzero(&ifr, sizeof(ifr));
ifr.ifra_addr = *ifa->ifa_addr;
if (ifa->ifa_dstaddr)
ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
NULL) == 0)
continue;
}
#endif /* INET */
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6) {
in6_purgeaddr(ifa);
/* ifp_addrhead is already updated */
continue;
}
#endif /* INET6 */
TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
ifa_free(ifa);
}
}
/*
* Remove any multicast network addresses from an interface when an ifnet
* is going away.
*/
static void
if_purgemaddrs(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
struct ifmultiaddr *next;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
if_delmulti_locked(ifp, ifma, 1);
IF_ADDR_UNLOCK(ifp);
}
/*
* Detach an interface, removing it from the list of "active" interfaces.
* If vmove flag is set on entry to if_detach_internal(), perform only a
* limited subset of cleanup tasks, given that we are moving an ifnet from
* one vnet to another, where it must be fully operational.
*
* XXXRW: There are some significant questions about event ordering, and
* how to prevent things from starting to use the interface during detach.
*/
void
if_detach(struct ifnet *ifp)
{
if_detach_internal(ifp, 0);
}
static void
if_detach_internal(struct ifnet *ifp, int vmove)
{
struct ifaddr *ifa;
struct radix_node_head *rnh;
int i, j;
struct domain *dp;
struct ifnet *iter;
int found = 0;
IFNET_WLOCK();
TAILQ_FOREACH(iter, &V_ifnet, if_link)
if (iter == ifp) {
TAILQ_REMOVE(&V_ifnet, ifp, if_link);
found = 1;
break;
}
#ifdef VIMAGE
if (found)
curvnet->vnet_ifcnt--;
#endif
IFNET_WUNLOCK();
if (!found) {
if (vmove)
panic("%s: ifp=%p not on the ifnet tailq %p",
__func__, ifp, &V_ifnet);
else
return; /* XXX this should panic as well? */
}
/*
* Remove/wait for pending events.
*/
taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
/*
* Remove routes and flush queues.
*/
if_down(ifp);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
if_purgeaddrs(ifp);
#ifdef INET
in_ifdetach(ifp);
#endif
#ifdef INET6
/*
* Remove all IPv6 kernel structs related to ifp. This should be done
* before removing routing entries below, since IPv6 interface direct
* routes are expected to be removed by the IPv6-specific kernel API.
* Otherwise, the kernel will detect some inconsistency and bark it.
*/
in6_ifdetach(ifp);
#endif
if_purgemaddrs(ifp);
if (!vmove) {
/*
* Prevent further calls into the device driver via ifnet.
*/
if_dead(ifp);
/*
* Remove link ifaddr pointer and maybe decrement if_index.
* Clean up all addresses.
*/
ifp->if_addr = NULL;
/* We can now free link ifaddr. */
if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
ifa = TAILQ_FIRST(&ifp->if_addrhead);
TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
ifa_free(ifa);
}
}
/*
* Delete all remaining routes using this interface
* Unfortuneatly the only way to do this is to slog through
* the entire routing table looking for routes which point
* to this interface...oh well...
*/
for (i = 1; i <= AF_MAX; i++) {
for (j = 0; j < rt_numfibs; j++) {
rnh = rt_tables_get_rnh(j, i);
if (rnh == NULL)
continue;
RADIX_NODE_HEAD_LOCK(rnh);
(void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
}
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
if_delgroups(ifp);
/*
* We cannot hold the lock over dom_ifdetach calls as they might
* sleep, for example trying to drain a callout, thus open up the
* theoretical race with re-attaching.
*/
IF_AFDATA_LOCK(ifp);
i = ifp->if_afdata_initialized;
ifp->if_afdata_initialized = 0;
IF_AFDATA_UNLOCK(ifp);
for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
(*dp->dom_ifdetach)(ifp,
ifp->if_afdata[dp->dom_family]);
}
}
#ifdef VIMAGE
/*
* if_vmove() performs a limited version of if_detach() in current
* vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
* An attempt is made to shrink if_index in current vnet, find an
* unused if_index in target vnet and calls if_grow() if necessary,
* and finally find an unused if_xname for the target vnet.
*/
void
if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
{
u_short idx;
/*
* Detach from current vnet, but preserve LLADDR info, do not
* mark as dead etc. so that the ifnet can be reattached later.
*/
if_detach_internal(ifp, 1);
/*
* Unlink the ifnet from ifindex_table[] in current vnet, and shrink
* the if_index for that vnet if possible.
*
* NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
* or we'd lock on one vnet and unlock on another.
*/
IFNET_WLOCK();
ifindex_free_locked(ifp->if_index);
/*
* Switch to the context of the target vnet.
*/
CURVNET_SET_QUIET(new_vnet);
if (ifindex_alloc_locked(&idx) != 0) {
IFNET_WUNLOCK();
panic("if_index overflow");
}
ifp->if_index = idx;
ifnet_setbyindex_locked(ifp->if_index, ifp);
IFNET_WUNLOCK();
if_attach_internal(ifp, 1);
CURVNET_RESTORE();
}
/*
* Move an ifnet to or from another child prison/vnet, specified by the jail id.
*/
static int
if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
{
struct prison *pr;
struct ifnet *difp;
/* Try to find the prison within our visibility. */
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, jid);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENXIO);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
/* Do not try to move the iface from and to the same prison. */
if (pr->pr_vnet == ifp->if_vnet) {
prison_free(pr);
return (EEXIST);
}
/* Make sure the named iface does not exists in the dst. prison/vnet. */
/* XXX Lock interfaces to avoid races. */
CURVNET_SET_QUIET(pr->pr_vnet);
difp = ifunit(ifname);
CURVNET_RESTORE();
if (difp != NULL) {
prison_free(pr);
return (EEXIST);
}
/* Move the interface into the child jail/vnet. */
if_vmove(ifp, pr->pr_vnet);
/* Report the new if_xname back to the userland. */
sprintf(ifname, "%s", ifp->if_xname);
prison_free(pr);
return (0);
}
static int
if_vmove_reclaim(struct thread *td, char *ifname, int jid)
{
struct prison *pr;
struct vnet *vnet_dst;
struct ifnet *ifp;
/* Try to find the prison within our visibility. */
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, jid);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENXIO);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
/* Make sure the named iface exists in the source prison/vnet. */
CURVNET_SET(pr->pr_vnet);
ifp = ifunit(ifname); /* XXX Lock to avoid races. */
if (ifp == NULL) {
CURVNET_RESTORE();
prison_free(pr);
return (ENXIO);
}
/* Do not try to move the iface from and to the same prison. */
vnet_dst = TD_TO_VNET(td);
if (vnet_dst == ifp->if_vnet) {
CURVNET_RESTORE();
prison_free(pr);
return (EEXIST);
}
/* Get interface back from child jail/vnet. */
if_vmove(ifp, vnet_dst);
CURVNET_RESTORE();
/* Report the new if_xname back to the userland. */
sprintf(ifname, "%s", ifp->if_xname);
prison_free(pr);
return (0);
}
#endif /* VIMAGE */
/*
* Add a group to an interface
*/
int
if_addgroup(struct ifnet *ifp, const char *groupname)
{
struct ifg_list *ifgl;
struct ifg_group *ifg = NULL;
struct ifg_member *ifgm;
if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
groupname[strlen(groupname) - 1] <= '9')
return (EINVAL);
IFNET_WLOCK();
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
IFNET_WUNLOCK();
return (EEXIST);
}
if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
M_NOWAIT)) == NULL) {
IFNET_WUNLOCK();
return (ENOMEM);
}
if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, groupname))
break;
if (ifg == NULL) {
if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
free(ifgm, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
ifg->ifg_refcnt = 0;
TAILQ_INIT(&ifg->ifg_members);
EVENTHANDLER_INVOKE(group_attach_event, ifg);
TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
}
ifg->ifg_refcnt++;
ifgl->ifgl_group = ifg;
ifgm->ifgm_ifp = ifp;
IF_ADDR_LOCK(ifp);
TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
IFNET_WUNLOCK();
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
/*
* Remove a group from an interface
*/
int
if_delgroup(struct ifnet *ifp, const char *groupname)
{
struct ifg_list *ifgl;
struct ifg_member *ifgm;
IFNET_WLOCK();
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
break;
if (ifgl == NULL) {
IFNET_WUNLOCK();
return (ENOENT);
}
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
if (ifgm->ifgm_ifp == ifp)
break;
if (ifgm != NULL) {
TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
free(ifgm, M_TEMP);
}
if (--ifgl->ifgl_group->ifg_refcnt == 0) {
TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
free(ifgl->ifgl_group, M_TEMP);
}
IFNET_WUNLOCK();
free(ifgl, M_TEMP);
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
/*
* Remove an interface from all groups
*/
static void
if_delgroups(struct ifnet *ifp)
{
struct ifg_list *ifgl;
struct ifg_member *ifgm;
char groupname[IFNAMSIZ];
IFNET_WLOCK();
while (!TAILQ_EMPTY(&ifp->if_groups)) {
ifgl = TAILQ_FIRST(&ifp->if_groups);
strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
if (ifgm->ifgm_ifp == ifp)
break;
if (ifgm != NULL) {
TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
ifgm_next);
free(ifgm, M_TEMP);
}
if (--ifgl->ifgl_group->ifg_refcnt == 0) {
TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
EVENTHANDLER_INVOKE(group_detach_event,
ifgl->ifgl_group);
free(ifgl->ifgl_group, M_TEMP);
}
IFNET_WUNLOCK();
free(ifgl, M_TEMP);
EVENTHANDLER_INVOKE(group_change_event, groupname);
IFNET_WLOCK();
}
IFNET_WUNLOCK();
}
/*
* Stores all groups from an interface in memory pointed
* to by data
*/
static int
if_getgroup(struct ifgroupreq *data, struct ifnet *ifp)
{
int len, error;
struct ifg_list *ifgl;
struct ifg_req ifgrq, *ifgp;
struct ifgroupreq *ifgr = data;
if (ifgr->ifgr_len == 0) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
ifgr->ifgr_len += sizeof(struct ifg_req);
IF_ADDR_UNLOCK(ifp);
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr->ifgr_groups;
/* XXX: wire */
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
if (len < sizeof(ifgrq)) {
IF_ADDR_UNLOCK(ifp);
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
sizeof(ifgrq.ifgrq_group));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IF_ADDR_UNLOCK(ifp);
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IF_ADDR_UNLOCK(ifp);
return (0);
}
/*
* Stores all members of a group in memory pointed to by data
*/
static int
if_getgroupmembers(struct ifgroupreq *data)
{
struct ifgroupreq *ifgr = data;
struct ifg_group *ifg;
struct ifg_member *ifgm;
struct ifg_req ifgrq, *ifgp;
int len, error;
IFNET_RLOCK();
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
break;
if (ifg == NULL) {
IFNET_RUNLOCK();
return (ENOENT);
}
if (ifgr->ifgr_len == 0) {
TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
ifgr->ifgr_len += sizeof(ifgrq);
IFNET_RUNLOCK();
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr->ifgr_groups;
TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
if (len < sizeof(ifgrq)) {
IFNET_RUNLOCK();
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
sizeof(ifgrq.ifgrq_member));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IFNET_RUNLOCK();
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IFNET_RUNLOCK();
return (0);
}
/*
* Delete Routes for a Network Interface
*
* Called for each routing entry via the rnh->rnh_walktree() call above
* to delete all route entries referencing a detaching network interface.
*
* Arguments:
* rn pointer to node in the routing table
* arg argument passed to rnh->rnh_walktree() - detaching interface
*
* Returns:
* 0 successful
* errno failed - reason indicated
*
*/
static int
if_rtdel(struct radix_node *rn, void *arg)
{
struct rtentry *rt = (struct rtentry *)rn;
struct ifnet *ifp = arg;
int err;
if (rt->rt_ifp == ifp) {
/*
* Protect (sorta) against walktree recursion problems
* with cloned routes
*/
if ((rt->rt_flags & RTF_UP) == 0)
return (0);
err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway,
rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED,
(struct rtentry **) NULL, rt->rt_fibnum);
if (err) {
log(LOG_WARNING, "if_rtdel: error %d\n", err);
}
}
return (0);
}
/*
* Wrapper functions for struct ifnet address list locking macros. These are
* used by kernel modules to avoid encoding programming interface or binary
* interface assumptions that may be violated when kernel-internal locking
* approaches change.
*/
void
if_addr_rlock(struct ifnet *ifp)
{
IF_ADDR_LOCK(ifp);
}
void
if_addr_runlock(struct ifnet *ifp)
{
IF_ADDR_UNLOCK(ifp);
}
void
if_maddr_rlock(struct ifnet *ifp)
{
IF_ADDR_LOCK(ifp);
}
void
if_maddr_runlock(struct ifnet *ifp)
{
IF_ADDR_UNLOCK(ifp);
}
/*
* Reference count functions for ifaddrs.
*/
void
ifa_init(struct ifaddr *ifa)
{
mtx_init(&ifa->ifa_mtx, "ifaddr", NULL, MTX_DEF);
refcount_init(&ifa->ifa_refcnt, 1);
}
void
ifa_ref(struct ifaddr *ifa)
{
refcount_acquire(&ifa->ifa_refcnt);
}
void
ifa_free(struct ifaddr *ifa)
{
if (refcount_release(&ifa->ifa_refcnt)) {
mtx_destroy(&ifa->ifa_mtx);
free(ifa, M_IFADDR);
}
}
int
ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{
int error = 0;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
bzero(&info, sizeof(info));
info.rti_ifp = V_loif;
info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
info.rti_info[RTAX_DST] = ia;
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
error = rtrequest1_fib(RTM_ADD, &info, &rt, 0);
if (error == 0 && rt != NULL) {
RT_LOCK(rt);
((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
ifa->ifa_ifp->if_type;
((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
ifa->ifa_ifp->if_index;
RT_REMREF(rt);
RT_UNLOCK(rt);
} else if (error != 0)
log(LOG_INFO, "ifa_add_loopback_route: insertion failed\n");
return (error);
}
int
ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{
int error = 0;
struct rt_addrinfo info;
struct sockaddr_dl null_sdl;
bzero(&null_sdl, sizeof(null_sdl));
null_sdl.sdl_len = sizeof(null_sdl);
null_sdl.sdl_family = AF_LINK;
null_sdl.sdl_type = ifa->ifa_ifp->if_type;
null_sdl.sdl_index = ifa->ifa_ifp->if_index;
bzero(&info, sizeof(info));
info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
info.rti_info[RTAX_DST] = ia;
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
error = rtrequest1_fib(RTM_DELETE, &info, NULL, 0);
if (error != 0)
log(LOG_INFO, "ifa_del_loopback_route: deletion failed\n");
return (error);
}
/*
* XXX: Because sockaddr_dl has deeper structure than the sockaddr
* structs used to represent other address families, it is necessary
* to perform a different comparison.
*/
#define sa_equal(a1, a2) \
(bcmp((a1), (a2), ((a1))->sa_len) == 0)
#define sa_dl_equal(a1, a2) \
((((struct sockaddr_dl *)(a1))->sdl_len == \
((struct sockaddr_dl *)(a2))->sdl_len) && \
(bcmp(LLADDR((struct sockaddr_dl *)(a1)), \
LLADDR((struct sockaddr_dl *)(a2)), \
((struct sockaddr_dl *)(a1))->sdl_alen) == 0))
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
static struct ifaddr *
ifa_ifwithaddr_internal(struct sockaddr *addr, int getref)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (sa_equal(addr, ifa->ifa_addr)) {
if (getref)
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
/* IP6 doesn't have broadcast */
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr)) {
if (getref)
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
struct ifaddr *
ifa_ifwithaddr(struct sockaddr *addr)
{
return (ifa_ifwithaddr_internal(addr, 1));
}
int
ifa_ifwithaddr_check(struct sockaddr *addr)
{
return (ifa_ifwithaddr_internal(addr, 0) != NULL);
}
/*
* Locate an interface based on the broadcast address.
*/
/* ARGSUSED */
struct ifaddr *
ifa_ifwithbroadaddr(struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
/*
* Find an interface on a specific network. If many, choice
* is most specific found.
*/
struct ifaddr *
-ifa_ifwithnet(struct sockaddr *addr)
+ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifaddr *ifa_maybe = NULL;
u_int af = addr->sa_family;
char *addr_data = addr->sa_data, *cplim;
/*
* AF_LINK addresses can be looked up directly by their index number,
* so do that if we can.
*/
if (af == AF_LINK) {
struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
return (ifaddr_byindex(sdl->sdl_index));
}
/*
* Scan though each interface, looking for ones that have addresses
* in this address family. Maintain a reference on ifa_maybe once
* we find one, as we release the IF_ADDR_LOCK() that kept it stable
* when we move onto the next interface.
*/
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af)
next: continue;
- if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
+ if (af == AF_INET &&
+ ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
/*
* This is a bit broken as it doesn't
* take into account that the remote end may
* be a single node in the network we are
* looking for.
* The trouble is that we don't know the
* netmask for the remote end.
*/
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
} else {
/*
* if we have a special address handler,
* then use it instead of the generic one.
*/
if (ifa->ifa_claim_addr) {
if ((*ifa->ifa_claim_addr)(ifa, addr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
continue;
}
/*
* Scan all the bits in the ifa's address.
* If a bit dissagrees with what we are
* looking for, mask it with the netmask
* to see if it really matters.
* (A byte at a time)
*/
if (ifa->ifa_netmask == 0)
continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len
+ (char *)ifa->ifa_netmask;
while (cp3 < cplim)
if ((*cp++ ^ *cp2++) & *cp3++)
goto next; /* next address! */
/*
* If the netmask of what we just found
* is more specific than what we had before
* (if we had one) then remember the new one
* before continuing to search
* for an even better one.
*/
if (ifa_maybe == NULL ||
rn_refines((caddr_t)ifa->ifa_netmask,
(caddr_t)ifa_maybe->ifa_netmask)) {
if (ifa_maybe != NULL)
ifa_free(ifa_maybe);
ifa_maybe = ifa;
ifa_ref(ifa_maybe);
}
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = ifa_maybe;
ifa_maybe = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
if (ifa_maybe != NULL)
ifa_free(ifa_maybe);
return (ifa);
}
/*
* Find an interface address specific to an interface best matching
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
char *cp, *cp2, *cp3;
char *cplim;
struct ifaddr *ifa_maybe = NULL;
u_int af = addr->sa_family;
if (af >= AF_MAX)
return (0);
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != af)
continue;
if (ifa_maybe == NULL)
ifa_maybe = ifa;
if (ifa->ifa_netmask == 0) {
if (sa_equal(addr, ifa->ifa_addr) ||
(ifa->ifa_dstaddr &&
sa_equal(addr, ifa->ifa_dstaddr)))
goto done;
continue;
}
if (ifp->if_flags & IFF_POINTOPOINT) {
if (sa_equal(addr, ifa->ifa_dstaddr))
goto done;
} else {
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++)
if ((*cp++ ^ *cp2++) & *cp3)
break;
if (cp3 == cplim)
goto done;
}
}
ifa = ifa_maybe;
done:
if (ifa != NULL)
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
return (ifa);
}
#include <net/if_llatbl.h>
/*
* Default action when installing a route with a Link Level gateway.
* Lookup an appropriate real ifa to point to.
* This should be moved to /sys/net/link.c eventually.
*/
static void
link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
{
struct ifaddr *ifa, *oifa;
struct sockaddr *dst;
struct ifnet *ifp;
RT_LOCK_ASSERT(rt);
if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
return;
ifa = ifaof_ifpforaddr(dst, ifp);
if (ifa) {
oifa = rt->rt_ifa;
rt->rt_ifa = ifa;
ifa_free(oifa);
if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
ifa->ifa_rtrequest(cmd, rt, info);
}
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
static void
if_unroute(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
ifp->if_flags &= ~flag;
getmicrotime(&ifp->if_lastchange);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
ifp->if_qflush(ifp);
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
#endif
rt_ifmsg(ifp);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
static void
if_route(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
ifp->if_flags |= flag;
getmicrotime(&ifp->if_lastchange);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFUP, ifa->ifa_addr);
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
#endif
rt_ifmsg(ifp);
#ifdef INET6
in6_if_up(ifp);
#endif
}
void (*vlan_link_state_p)(struct ifnet *, int); /* XXX: private from if_vlan */
void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */
/*
* Handle a change in the interface link state. To avoid LORs
* between driver lock and upper layer locks, as well as possible
* recursions, we post event to taskqueue, and all job
* is done in static do_link_state_change().
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
/* Return if state hasn't changed. */
if (ifp->if_link_state == link_state)
return;
ifp->if_link_state = link_state;
taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
}
static void
do_link_state_change(void *arg, int pending)
{
struct ifnet *ifp = (struct ifnet *)arg;
int link_state = ifp->if_link_state;
CURVNET_SET(ifp->if_vnet);
/* Notify that the link state has changed. */
rt_ifmsg(ifp);
if (ifp->if_vlantrunk != NULL)
(*vlan_link_state_p)(ifp, 0);
if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
IFP2AC(ifp)->ac_netgraph != NULL)
(*ng_ether_link_state_p)(ifp, link_state);
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
#endif
if (ifp->if_bridge) {
KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!"));
(*bstp_linkstate_p)(ifp, link_state);
}
if (ifp->if_lagg) {
KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!"));
(*lagg_linkstate_p)(ifp, link_state);
}
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
NULL);
if (pending > 1)
if_printf(ifp, "%d link states coalesced\n", pending);
if (log_link_state_change)
log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
CURVNET_RESTORE();
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
void
if_down(struct ifnet *ifp)
{
if_unroute(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
void
if_up(struct ifnet *ifp)
{
if_route(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Flush an interface queue.
*/
void
if_qflush(struct ifnet *ifp)
{
struct mbuf *m, *n;
struct ifaltq *ifq;
ifq = &ifp->if_snd;
IFQ_LOCK(ifq);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(ifq))
ALTQ_PURGE(ifq);
#endif
n = ifq->ifq_head;
while ((m = n) != 0) {
n = m->m_act;
m_freem(m);
}
ifq->ifq_head = 0;
ifq->ifq_tail = 0;
ifq->ifq_len = 0;
IFQ_UNLOCK(ifq);
}
/*
* Handle interface watchdog timer routines. Called
* from softclock, we decrement timers (if set) and
* call the appropriate interface routine on expiration.
*
* XXXRW: Note that because timeouts run with Giant, if_watchdog() is called
* holding Giant.
*/
static void
if_slowtimo(void *arg)
{
VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
int s = splimp();
VNET_LIST_RLOCK_NOSLEEP();
IFNET_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifp->if_timer == 0 || --ifp->if_timer)
continue;
if (ifp->if_watchdog)
(*ifp->if_watchdog)(ifp);
}
CURVNET_RESTORE();
}
IFNET_RUNLOCK_NOSLEEP();
VNET_LIST_RUNLOCK_NOSLEEP();
splx(s);
timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ);
}
/*
* Map interface name to interface structure pointer, with or without
* returning a reference.
*/
struct ifnet *
ifunit_ref(const char *name)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
!(ifp->if_flags & IFF_DYING))
break;
}
if (ifp != NULL)
if_ref(ifp);
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
struct ifnet *
ifunit(const char *name)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
break;
}
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
/*
* Hardware specific interface ioctls.
*/
static int
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
{
struct ifreq *ifr;
struct ifstat *ifs;
int error = 0;
int new_flags, temp_flags;
size_t namelen, onamelen;
size_t descrlen;
char *descrbuf, *odescrbuf;
char new_name[IFNAMSIZ];
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifr = (struct ifreq *)data;
switch (cmd) {
case SIOCGIFINDEX:
ifr->ifr_index = ifp->if_index;
break;
case SIOCGIFFLAGS:
temp_flags = ifp->if_flags | ifp->if_drv_flags;
ifr->ifr_flags = temp_flags & 0xffff;
ifr->ifr_flagshigh = temp_flags >> 16;
break;
case SIOCGIFCAP:
ifr->ifr_reqcap = ifp->if_capabilities;
ifr->ifr_curcap = ifp->if_capenable;
break;
#ifdef MAC
case SIOCGIFMAC:
error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCGIFMETRIC:
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFPHYS:
ifr->ifr_phys = ifp->if_physical;
break;
case SIOCGIFDESCR:
error = 0;
sx_slock(&ifdescr_sx);
if (ifp->if_description == NULL)
error = ENOMSG;
else {
/* space for terminating nul */
descrlen = strlen(ifp->if_description) + 1;
if (ifr->ifr_buffer.length < descrlen)
ifr->ifr_buffer.buffer = NULL;
else
error = copyout(ifp->if_description,
ifr->ifr_buffer.buffer, descrlen);
ifr->ifr_buffer.length = descrlen;
}
sx_sunlock(&ifdescr_sx);
break;
case SIOCSIFDESCR:
error = priv_check(td, PRIV_NET_SETIFDESCR);
if (error)
return (error);
/*
* Copy only (length-1) bytes to make sure that
* if_description is always nul terminated. The
* length parameter is supposed to count the
* terminating nul in.
*/
if (ifr->ifr_buffer.length > ifdescr_maxlen)
return (ENAMETOOLONG);
else if (ifr->ifr_buffer.length == 0)
descrbuf = NULL;
else {
descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR,
M_WAITOK | M_ZERO);
error = copyin(ifr->ifr_buffer.buffer, descrbuf,
ifr->ifr_buffer.length - 1);
if (error) {
free(descrbuf, M_IFDESCR);
break;
}
}
sx_xlock(&ifdescr_sx);
odescrbuf = ifp->if_description;
ifp->if_description = descrbuf;
sx_xunlock(&ifdescr_sx);
getmicrotime(&ifp->if_lastchange);
free(odescrbuf, M_IFDESCR);
break;
case SIOCSIFFLAGS:
error = priv_check(td, PRIV_NET_SETIFFLAGS);
if (error)
return (error);
/*
* Currently, no driver owned flags pass the IFF_CANTCHANGE
* check, so we don't need special handling here yet.
*/
new_flags = (ifr->ifr_flags & 0xffff) |
(ifr->ifr_flagshigh << 16);
if (ifp->if_flags & IFF_SMART) {
/* Smart drivers twiddle their own routes */
} else if (ifp->if_flags & IFF_UP &&
(new_flags & IFF_UP) == 0) {
int s = splimp();
if_down(ifp);
splx(s);
} else if (new_flags & IFF_UP &&
(ifp->if_flags & IFF_UP) == 0) {
int s = splimp();
if_up(ifp);
splx(s);
}
/* See if permanently promiscuous mode bit is about to flip */
if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
if (new_flags & IFF_PPROMISC)
ifp->if_flags |= IFF_PROMISC;
else if (ifp->if_pcount == 0)
ifp->if_flags &= ~IFF_PROMISC;
log(LOG_INFO, "%s: permanently promiscuous mode %s\n",
ifp->if_xname,
(new_flags & IFF_PPROMISC) ? "enabled" : "disabled");
}
ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
(new_flags &~ IFF_CANTCHANGE);
if (ifp->if_ioctl) {
(void) (*ifp->if_ioctl)(ifp, cmd, data);
}
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFCAP:
error = priv_check(td, PRIV_NET_SETIFCAP);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
if (ifr->ifr_reqcap & ~ifp->if_capabilities)
return (EINVAL);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
#ifdef MAC
case SIOCSIFMAC:
error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCSIFNAME:
error = priv_check(td, PRIV_NET_SETIFNAME);
if (error)
return (error);
error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
if (error != 0)
return (error);
if (new_name[0] == '\0')
return (EINVAL);
if (ifunit(new_name) != NULL)
return (EEXIST);
/*
* XXX: Locking. Nothing else seems to lock if_flags,
* and there are numerous other races with the
* ifunit() checks not being atomic with namespace
* changes (renames, vmoves, if_attach, etc).
*/
ifp->if_flags |= IFF_RENAMING;
/* Announce the departure of the interface. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
log(LOG_INFO, "%s: changing name to '%s'\n",
ifp->if_xname, new_name);
strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
ifa = ifp->if_addr;
IFA_LOCK(ifa);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
namelen = strlen(new_name);
onamelen = sdl->sdl_nlen;
/*
* Move the address if needed. This is safe because we
* allocate space for a name of length IFNAMSIZ when we
* create this in if_attach().
*/
if (namelen != onamelen) {
bcopy(sdl->sdl_data + onamelen,
sdl->sdl_data + namelen, sdl->sdl_alen);
}
bcopy(new_name, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
bzero(sdl->sdl_data, onamelen);
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
IFA_UNLOCK(ifa);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
/* Announce the return of the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
ifp->if_flags &= ~IFF_RENAMING;
break;
#ifdef VIMAGE
case SIOCSIFVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
if (error)
return (error);
error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
break;
#endif
case SIOCSIFMETRIC:
error = priv_check(td, PRIV_NET_SETIFMETRIC);
if (error)
return (error);
ifp->if_metric = ifr->ifr_metric;
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYS:
error = priv_check(td, PRIV_NET_SETIFPHYS);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
{
u_long oldmtu = ifp->if_mtu;
error = priv_check(td, PRIV_NET_SETIFMTU);
if (error)
return (error);
if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
return (EINVAL);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0) {
getmicrotime(&ifp->if_lastchange);
rt_ifmsg(ifp);
}
/*
* If the link MTU changed, do network layer specific procedure.
*/
if (ifp->if_mtu != oldmtu) {
#ifdef INET6
nd6_setmtu(ifp);
#endif
}
break;
}
case SIOCADDMULTI:
case SIOCDELMULTI:
if (cmd == SIOCADDMULTI)
error = priv_check(td, PRIV_NET_ADDMULTI);
else
error = priv_check(td, PRIV_NET_DELMULTI);
if (error)
return (error);
/* Don't allow group membership on non-multicast interfaces. */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EOPNOTSUPP);
/* Don't let users screw up protocols' entries. */
if (ifr->ifr_addr.sa_family != AF_LINK)
return (EINVAL);
if (cmd == SIOCADDMULTI) {
struct ifmultiaddr *ifma;
/*
* Userland is only permitted to join groups once
* via the if_addmulti() KPI, because it cannot hold
* struct ifmultiaddr * between calls. It may also
* lose a race while we check if the membership
* already exists.
*/
IF_ADDR_LOCK(ifp);
ifma = if_findmulti(ifp, &ifr->ifr_addr);
IF_ADDR_UNLOCK(ifp);
if (ifma != NULL)
error = EADDRINUSE;
else
error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
} else {
error = if_delmulti(ifp, &ifr->ifr_addr);
}
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSLIFPHYADDR:
case SIOCSIFMEDIA:
case SIOCSIFGENERIC:
error = priv_check(td, PRIV_NET_HWIOCTL);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCGIFSTATUS:
ifs = (struct ifstat *)data;
ifs->ascii[0] = '\0';
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
case SIOCGLIFPHYADDR:
case SIOCGIFMEDIA:
case SIOCGIFGENERIC:
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
break;
case SIOCSIFLLADDR:
error = priv_check(td, PRIV_NET_SETLLADDR);
if (error)
return (error);
error = if_setlladdr(ifp,
ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
EVENTHANDLER_INVOKE(iflladdr_event, ifp);
break;
case SIOCAIFGROUP:
{
struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
error = priv_check(td, PRIV_NET_ADDIFGROUP);
if (error)
return (error);
if ((error = if_addgroup(ifp, ifgr->ifgr_group)))
return (error);
break;
}
case SIOCGIFGROUP:
if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp)))
return (error);
break;
case SIOCDIFGROUP:
{
struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
error = priv_check(td, PRIV_NET_DELIFGROUP);
if (error)
return (error);
if ((error = if_delgroup(ifp, ifgr->ifgr_group)))
return (error);
break;
}
default:
error = ENOIOCTL;
break;
}
return (error);
}
/*
* Interface ioctls.
*/
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
{
struct ifnet *ifp;
struct ifreq *ifr;
int error;
int oif_flags;
switch (cmd) {
case SIOCGIFCONF:
case OSIOCGIFCONF:
#ifdef __amd64__
case SIOCGIFCONF32:
#endif
return (ifconf(cmd, data));
}
ifr = (struct ifreq *)data;
switch (cmd) {
#ifdef VIMAGE
case SIOCSIFRVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
if (error)
return (error);
return (if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid));
#endif
case SIOCIFCREATE:
case SIOCIFCREATE2:
error = priv_check(td, PRIV_NET_IFCREATE);
if (error)
return (error);
return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
case SIOCIFDESTROY:
error = priv_check(td, PRIV_NET_IFDESTROY);
if (error)
return (error);
return if_clone_destroy(ifr->ifr_name);
case SIOCIFGCLONERS:
return (if_clone_list((struct if_clonereq *)data));
case SIOCGIFGMEMB:
return (if_getgroupmembers((struct ifgroupreq *)data));
}
ifp = ifunit_ref(ifr->ifr_name);
if (ifp == NULL)
return (ENXIO);
error = ifhwioctl(cmd, ifp, data, td);
if (error != ENOIOCTL) {
if_rele(ifp);
return (error);
}
oif_flags = ifp->if_flags;
if (so->so_proto == NULL) {
if_rele(ifp);
return (EOPNOTSUPP);
}
#ifndef COMPAT_43
error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd,
data,
ifp, td));
if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL)
error = (*ifp->if_ioctl)(ifp, cmd, data);
#else
{
u_long ocmd = cmd;
switch (cmd) {
case SIOCSIFDSTADDR:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
#if BYTE_ORDER != BIG_ENDIAN
if (ifr->ifr_addr.sa_family == 0 &&
ifr->ifr_addr.sa_len < 16) {
ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
ifr->ifr_addr.sa_len = 16;
}
#else
if (ifr->ifr_addr.sa_len == 0)
ifr->ifr_addr.sa_len = 16;
#endif
break;
case OSIOCGIFADDR:
cmd = SIOCGIFADDR;
break;
case OSIOCGIFDSTADDR:
cmd = SIOCGIFDSTADDR;
break;
case OSIOCGIFBRDADDR:
cmd = SIOCGIFBRDADDR;
break;
case OSIOCGIFNETMASK:
cmd = SIOCGIFNETMASK;
}
error = ((*so->so_proto->pr_usrreqs->pru_control)(so,
cmd,
data,
ifp, td));
if (error == EOPNOTSUPP && ifp != NULL &&
ifp->if_ioctl != NULL)
error = (*ifp->if_ioctl)(ifp, cmd, data);
switch (ocmd) {
case OSIOCGIFADDR:
case OSIOCGIFDSTADDR:
case OSIOCGIFBRDADDR:
case OSIOCGIFNETMASK:
*(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
}
}
#endif /* COMPAT_43 */
if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
if (ifp->if_flags & IFF_UP) {
int s = splimp();
in6_if_up(ifp);
splx(s);
}
#endif
}
if_rele(ifp);
return (error);
}
/*
* The code common to handling reference counted flags,
* e.g., in ifpromisc() and if_allmulti().
* The "pflag" argument can specify a permanent mode flag to check,
* such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
*
* Only to be used on stack-owned flags, not driver-owned flags.
*/
static int
if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
{
struct ifreq ifr;
int error;
int oldflags, oldcount;
/* Sanity checks to catch programming errors */
KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
("%s: setting driver-owned flag %d", __func__, flag));
if (onswitch)
KASSERT(*refcount >= 0,
("%s: increment negative refcount %d for flag %d",
__func__, *refcount, flag));
else
KASSERT(*refcount > 0,
("%s: decrement non-positive refcount %d for flag %d",
__func__, *refcount, flag));
/* In case this mode is permanent, just touch refcount */
if (ifp->if_flags & pflag) {
*refcount += onswitch ? 1 : -1;
return (0);
}
/* Save ifnet parameters for if_ioctl() may fail */
oldcount = *refcount;
oldflags = ifp->if_flags;
/*
* See if we aren't the only and touching refcount is enough.
* Actually toggle interface flag if we are the first or last.
*/
if (onswitch) {
if ((*refcount)++)
return (0);
ifp->if_flags |= flag;
} else {
if (--(*refcount))
return (0);
ifp->if_flags &= ~flag;
}
/* Call down the driver since we've changed interface flags */
if (ifp->if_ioctl == NULL) {
error = EOPNOTSUPP;
goto recover;
}
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
if (error)
goto recover;
/* Notify userland that interface flags have changed */
rt_ifmsg(ifp);
return (0);
recover:
/* Recover after driver error */
*refcount = oldcount;
ifp->if_flags = oldflags;
return (error);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int error;
int oldflags = ifp->if_flags;
error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
&ifp->if_pcount, pswitch);
/* If promiscuous mode status has changed, log a message */
if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC))
log(LOG_INFO, "%s: promiscuous mode %s\n",
ifp->if_xname,
(ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
return (error);
}
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, caddr_t data)
{
struct ifconf *ifc = (struct ifconf *)data;
#ifdef __amd64__
struct ifconf32 *ifc32 = (struct ifconf32 *)data;
struct ifconf ifc_swab;
#endif
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr;
struct sbuf *sb;
int error, full = 0, valid_len, max_len;
#ifdef __amd64__
if (cmd == SIOCGIFCONF32) {
ifc_swab.ifc_len = ifc32->ifc_len;
ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf;
ifc = &ifc_swab;
}
#endif
/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
max_len = MAXPHYS - 1;
/* Prevent hostile input from being able to crash the system */
if (ifc->ifc_len <= 0)
return (EINVAL);
again:
if (ifc->ifc_len <= max_len) {
max_len = ifc->ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs;
/*
* Zero the ifr_name buffer to make sure we don't
* disclose the contents of the stack.
*/
memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name));
if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
>= sizeof(ifr.ifr_name)) {
sbuf_delete(sb);
IFNET_RUNLOCK();
return (ENAMETOOLONG);
}
addrs = 0;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (prison_if(curthread->td_ucred, sa) != 0)
continue;
addrs++;
#ifdef COMPAT_43
if (cmd == OSIOCGIFCONF) {
struct osockaddr *osa =
(struct osockaddr *)&ifr.ifr_addr;
ifr.ifr_addr = *sa;
osa->sa_family = sa->sa_family;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else
#endif
if (sa->sa_len <= sizeof(*sa)) {
ifr.ifr_addr = *sa;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else {
sbuf_bcat(sb, &ifr,
offsetof(struct ifreq, ifr_addr));
max_len += offsetof(struct ifreq, ifr_addr);
sbuf_bcat(sb, sa, sa->sa_len);
max_len += sa->sa_len;
}
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
IF_ADDR_UNLOCK(ifp);
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
/*
* If we didn't allocate enough space (uncommon), try again. If
* we have already allocated as much space as we are allowed,
* return what we've got.
*/
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc->ifc_len = valid_len;
#ifdef __amd64__
if (cmd == SIOCGIFCONF32)
ifc32->ifc_len = valid_len;
#endif
sbuf_finish(sb);
error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
sbuf_delete(sb);
return (error);
}
/*
* Just like ifpromisc(), but for all-multicast-reception mode.
*/
int
if_allmulti(struct ifnet *ifp, int onswitch)
{
return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
}
struct ifmultiaddr *
if_findmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
IF_ADDR_LOCK_ASSERT(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (sa->sa_family == AF_LINK) {
if (sa_dl_equal(ifma->ifma_addr, sa))
break;
} else {
if (sa_equal(ifma->ifma_addr, sa))
break;
}
}
return ifma;
}
/*
* Allocate a new ifmultiaddr and initialize based on passed arguments. We
* make copies of passed sockaddrs. The ifmultiaddr will not be added to
* the ifnet multicast address list here, so the caller must do that and
* other setup work (such as notifying the device driver). The reference
* count is initialized to 1.
*/
static struct ifmultiaddr *
if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
int mflags)
{
struct ifmultiaddr *ifma;
struct sockaddr *dupsa;
ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
M_ZERO);
if (ifma == NULL)
return (NULL);
dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
free(ifma, M_IFMADDR);
return (NULL);
}
bcopy(sa, dupsa, sa->sa_len);
ifma->ifma_addr = dupsa;
ifma->ifma_ifp = ifp;
ifma->ifma_refcount = 1;
ifma->ifma_protospec = NULL;
if (llsa == NULL) {
ifma->ifma_lladdr = NULL;
return (ifma);
}
dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
free(ifma->ifma_addr, M_IFMADDR);
free(ifma, M_IFMADDR);
return (NULL);
}
bcopy(llsa, dupsa, llsa->sa_len);
ifma->ifma_lladdr = dupsa;
return (ifma);
}
/*
* if_freemulti: free ifmultiaddr structure and possibly attached related
* addresses. The caller is responsible for implementing reference
* counting, notifying the driver, handling routing messages, and releasing
* any dependent link layer state.
*/
static void
if_freemulti(struct ifmultiaddr *ifma)
{
KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
ifma->ifma_refcount));
KASSERT(ifma->ifma_protospec == NULL,
("if_freemulti: protospec not NULL"));
if (ifma->ifma_lladdr != NULL)
free(ifma->ifma_lladdr, M_IFMADDR);
free(ifma->ifma_addr, M_IFMADDR);
free(ifma, M_IFMADDR);
}
/*
* Register an additional multicast address with a network interface.
*
* - If the address is already present, bump the reference count on the
* address and return.
* - If the address is not link-layer, look up a link layer address.
* - Allocate address structures for one or both addresses, and attach to the
* multicast address list on the interface. If automatically adding a link
* layer address, the protocol address will own a reference to the link
* layer address, to be freed when it is freed.
* - Notify the network device driver of an addition to the multicast address
* list.
*
* 'sa' points to caller-owned memory with the desired multicast address.
*
* 'retifma' will be used to return a pointer to the resulting multicast
* address reference, if desired.
*/
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
struct ifmultiaddr **retifma)
{
struct ifmultiaddr *ifma, *ll_ifma;
struct sockaddr *llsa;
int error;
/*
* If the address is already present, return a new reference to it;
* otherwise, allocate storage and set up a new address.
*/
IF_ADDR_LOCK(ifp);
ifma = if_findmulti(ifp, sa);
if (ifma != NULL) {
ifma->ifma_refcount++;
if (retifma != NULL)
*retifma = ifma;
IF_ADDR_UNLOCK(ifp);
return (0);
}
/*
* The address isn't already present; resolve the protocol address
* into a link layer address, and then look that up, bump its
* refcount or allocate an ifma for that also. If 'llsa' was
* returned, we will need to free it later.
*/
llsa = NULL;
ll_ifma = NULL;
if (ifp->if_resolvemulti != NULL) {
error = ifp->if_resolvemulti(ifp, &llsa, sa);
if (error)
goto unlock_out;
}
/*
* Allocate the new address. Don't hook it up yet, as we may also
* need to allocate a link layer multicast address.
*/
ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
if (ifma == NULL) {
error = ENOMEM;
goto free_llsa_out;
}
/*
* If a link layer address is found, we'll need to see if it's
* already present in the address list, or allocate is as well.
* When this block finishes, the link layer address will be on the
* list.
*/
if (llsa != NULL) {
ll_ifma = if_findmulti(ifp, llsa);
if (ll_ifma == NULL) {
ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
if (ll_ifma == NULL) {
--ifma->ifma_refcount;
if_freemulti(ifma);
error = ENOMEM;
goto free_llsa_out;
}
TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
} else
ll_ifma->ifma_refcount++;
ifma->ifma_llifma = ll_ifma;
}
/*
* We now have a new multicast address, ifma, and possibly a new or
* referenced link layer address. Add the primary address to the
* ifnet address list.
*/
TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
if (retifma != NULL)
*retifma = ifma;
/*
* Must generate the message while holding the lock so that 'ifma'
* pointer is still valid.
*/
rt_newmaddrmsg(RTM_NEWMADDR, ifma);
IF_ADDR_UNLOCK(ifp);
/*
* We are certain we have added something, so call down to the
* interface to let them know about it.
*/
if (ifp->if_ioctl != NULL) {
(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
}
if (llsa != NULL)
free(llsa, M_IFMADDR);
return (0);
free_llsa_out:
if (llsa != NULL)
free(llsa, M_IFMADDR);
unlock_out:
IF_ADDR_UNLOCK(ifp);
return (error);
}
/*
* Delete a multicast group membership by network-layer group address.
*
* Returns ENOENT if the entry could not be found. If ifp no longer
* exists, results are undefined. This entry point should only be used
* from subsystems which do appropriate locking to hold ifp for the
* duration of the call.
* Network-layer protocol domains must use if_delmulti_ifma().
*/
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
int lastref;
#ifdef INVARIANTS
struct ifnet *oifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
if (ifp != oifp)
ifp = NULL;
IFNET_RUNLOCK_NOSLEEP();
KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
#endif
if (ifp == NULL)
return (ENOENT);
IF_ADDR_LOCK(ifp);
lastref = 0;
ifma = if_findmulti(ifp, sa);
if (ifma != NULL)
lastref = if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_UNLOCK(ifp);
if (ifma == NULL)
return (ENOENT);
if (lastref && ifp->if_ioctl != NULL) {
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
}
return (0);
}
/*
* Delete all multicast group membership for an interface.
* Should be used to quickly flush all multicast filters.
*/
void
if_delallmulti(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
struct ifmultiaddr *next;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_UNLOCK(ifp);
}
/*
* Delete a multicast group membership by group membership pointer.
* Network-layer protocol domains must use this routine.
*
* It is safe to call this routine if the ifp disappeared.
*/
void
if_delmulti_ifma(struct ifmultiaddr *ifma)
{
struct ifnet *ifp;
int lastref;
ifp = ifma->ifma_ifp;
#ifdef DIAGNOSTIC
if (ifp == NULL) {
printf("%s: ifma_ifp seems to be detached\n", __func__);
} else {
struct ifnet *oifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
if (ifp != oifp) {
printf("%s: ifnet %p disappeared\n", __func__, ifp);
ifp = NULL;
}
IFNET_RUNLOCK_NOSLEEP();
}
#endif
/*
* If and only if the ifnet instance exists: Acquire the address lock.
*/
if (ifp != NULL)
IF_ADDR_LOCK(ifp);
lastref = if_delmulti_locked(ifp, ifma, 0);
if (ifp != NULL) {
/*
* If and only if the ifnet instance exists:
* Release the address lock.
* If the group was left: update the hardware hash filter.
*/
IF_ADDR_UNLOCK(ifp);
if (lastref && ifp->if_ioctl != NULL) {
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
}
}
}
/*
* Perform deletion of network-layer and/or link-layer multicast address.
*
* Return 0 if the reference count was decremented.
* Return 1 if the final reference was released, indicating that the
* hardware hash filter should be reprogrammed.
*/
static int
if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
{
struct ifmultiaddr *ll_ifma;
if (ifp != NULL && ifma->ifma_ifp != NULL) {
KASSERT(ifma->ifma_ifp == ifp,
("%s: inconsistent ifp %p", __func__, ifp));
IF_ADDR_LOCK_ASSERT(ifp);
}
ifp = ifma->ifma_ifp;
/*
* If the ifnet is detaching, null out references to ifnet,
* so that upper protocol layers will notice, and not attempt
* to obtain locks for an ifnet which no longer exists. The
* routing socket announcement must happen before the ifnet
* instance is detached from the system.
*/
if (detaching) {
#ifdef DIAGNOSTIC
printf("%s: detaching ifnet instance %p\n", __func__, ifp);
#endif
/*
* ifp may already be nulled out if we are being reentered
* to delete the ll_ifma.
*/
if (ifp != NULL) {
rt_newmaddrmsg(RTM_DELMADDR, ifma);
ifma->ifma_ifp = NULL;
}
}
if (--ifma->ifma_refcount > 0)
return 0;
/*
* If this ifma is a network-layer ifma, a link-layer ifma may
* have been associated with it. Release it first if so.
*/
ll_ifma = ifma->ifma_llifma;
if (ll_ifma != NULL) {
KASSERT(ifma->ifma_lladdr != NULL,
("%s: llifma w/o lladdr", __func__));
if (detaching)
ll_ifma->ifma_ifp = NULL; /* XXX */
if (--ll_ifma->ifma_refcount == 0) {
if (ifp != NULL) {
TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
}
if_freemulti(ll_ifma);
}
}
if (ifp != NULL)
TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
if_freemulti(ifma);
/*
* The last reference to this instance of struct ifmultiaddr
* was released; the hardware should be notified of this change.
*/
return 1;
}
/*
* Set the link layer address on an interface.
*
* At this time we only support certain types of interfaces,
* and we don't allow the length of the address to change.
*/
int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
struct ifreq ifr;
IF_ADDR_LOCK(ifp);
ifa = ifp->if_addr;
if (ifa == NULL) {
IF_ADDR_UNLOCK(ifp);
return (EINVAL);
}
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if (sdl == NULL) {
ifa_free(ifa);
return (EINVAL);
}
if (len != sdl->sdl_alen) { /* don't allow length to change */
ifa_free(ifa);
return (EINVAL);
}
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_XETHER:
case IFT_ISO88025:
case IFT_L2VLAN:
case IFT_BRIDGE:
case IFT_ARCNET:
case IFT_IEEE8023ADLAG:
case IFT_IEEE80211:
bcopy(lladdr, LLADDR(sdl), len);
ifa_free(ifa);
break;
default:
ifa_free(ifa);
return (ENODEV);
}
/*
* If the interface is already up, we need
* to re-init it in order to reprogram its
* address filter.
*/
if ((ifp->if_flags & IFF_UP) != 0) {
if (ifp->if_ioctl) {
ifp->if_flags &= ~IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
ifp->if_flags |= IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
}
#ifdef INET
/*
* Also send gratuitous ARPs to notify other nodes about
* the address change.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == AF_INET)
arp_ifinit(ifp, ifa);
}
#endif
}
return (0);
}
/*
* The name argument must be a pointer to storage which will last as
* long as the interface does. For physical devices, the result of
* device_get_name(dev) is a good choice and for pseudo-devices a
* static string works well.
*/
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
ifp->if_dname = name;
ifp->if_dunit = unit;
if (unit != IF_DUNIT_NONE)
snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
else
strlcpy(ifp->if_xname, name, IFNAMSIZ);
}
int
if_printf(struct ifnet *ifp, const char * fmt, ...)
{
va_list ap;
int retval;
retval = printf("%s: ", ifp->if_xname);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
void
if_start(struct ifnet *ifp)
{
(*(ifp)->if_start)(ifp);
}
/*
* Backwards compatibility interface for drivers
* that have not implemented it
*/
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
int error;
IFQ_HANDOFF(ifp, m, error);
return (error);
}
int
if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
{
int active = 0;
IF_LOCK(ifq);
if (_IF_QFULL(ifq)) {
_IF_DROP(ifq);
IF_UNLOCK(ifq);
m_freem(m);
return (0);
}
if (ifp != NULL) {
ifp->if_obytes += m->m_pkthdr.len + adjust;
if (m->m_flags & (M_BCAST|M_MCAST))
ifp->if_omcasts++;
active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
}
_IF_ENQUEUE(ifq, m);
IF_UNLOCK(ifq);
if (ifp != NULL && !active)
(*(ifp)->if_start)(ifp);
return (1);
}
void
if_register_com_alloc(u_char type,
if_com_alloc_t *a, if_com_free_t *f)
{
KASSERT(if_com_alloc[type] == NULL,
("if_register_com_alloc: %d already registered", type));
KASSERT(if_com_free[type] == NULL,
("if_register_com_alloc: %d free already registered", type));
if_com_alloc[type] = a;
if_com_free[type] = f;
}
void
if_deregister_com_alloc(u_char type)
{
KASSERT(if_com_alloc[type] != NULL,
("if_deregister_com_alloc: %d not registered", type));
KASSERT(if_com_free[type] != NULL,
("if_deregister_com_alloc: %d free not registered", type));
if_com_alloc[type] = NULL;
if_com_free[type] = NULL;
}
#ifdef DDB
static void
if_show_ifnet(struct ifnet *ifp)
{
if (ifp == NULL)
return;
db_printf("%s:\n", ifp->if_xname);
#define IF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, ifp->e);
IF_DB_PRINTF("%s", if_dname);
IF_DB_PRINTF("%d", if_dunit);
IF_DB_PRINTF("%s", if_description);
IF_DB_PRINTF("%u", if_index);
IF_DB_PRINTF("%u", if_refcount);
IF_DB_PRINTF("%p", if_softc);
IF_DB_PRINTF("%p", if_l2com);
IF_DB_PRINTF("%p", if_vnet);
IF_DB_PRINTF("%p", if_home_vnet);
IF_DB_PRINTF("%p", if_addr);
IF_DB_PRINTF("%p", if_llsoftc);
IF_DB_PRINTF("%p", if_label);
IF_DB_PRINTF("%u", if_pcount);
IF_DB_PRINTF("0x%08x", if_flags);
IF_DB_PRINTF("0x%08x", if_drv_flags);
IF_DB_PRINTF("0x%08x", if_capabilities);
IF_DB_PRINTF("0x%08x", if_capenable);
IF_DB_PRINTF("%p", if_snd.ifq_head);
IF_DB_PRINTF("%p", if_snd.ifq_tail);
IF_DB_PRINTF("%d", if_snd.ifq_len);
IF_DB_PRINTF("%d", if_snd.ifq_maxlen);
IF_DB_PRINTF("%d", if_snd.ifq_drops);
IF_DB_PRINTF("%p", if_snd.ifq_drv_head);
IF_DB_PRINTF("%p", if_snd.ifq_drv_tail);
IF_DB_PRINTF("%d", if_snd.ifq_drv_len);
IF_DB_PRINTF("%d", if_snd.ifq_drv_maxlen);
IF_DB_PRINTF("%d", if_snd.altq_type);
IF_DB_PRINTF("%x", if_snd.altq_flags);
#undef IF_DB_PRINTF
}
DB_SHOW_COMMAND(ifnet, db_show_ifnet)
{
if (!have_addr) {
db_printf("usage: show ifnet <struct ifnet *>\n");
return;
}
if_show_ifnet((struct ifnet *)addr);
}
DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets)
{
VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
u_short idx;
VNET_FOREACH(vnet_iter) {
CURVNET_SET_QUIET(vnet_iter);
#ifdef VIMAGE
db_printf("vnet=%p\n", curvnet);
#endif
for (idx = 1; idx <= V_if_index; idx++) {
ifp = V_ifindex_table[idx].ife_ifnet;
if (ifp == NULL)
continue;
db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp);
if (db_pager_quit)
break;
}
CURVNET_RESTORE();
}
}
#endif
Index: stable/8/sys/net/if_var.h
===================================================================
--- stable/8/sys/net/if_var.h (revision 209276)
+++ stable/8/sys/net/if_var.h (revision 209277)
@@ -1,904 +1,904 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* From: @(#)if.h 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#ifndef _NET_IF_VAR_H_
#define _NET_IF_VAR_H_
/*
* Structures defining a network interface, providing a packet
* transport mechanism (ala level 0 of the PUP protocols).
*
* Each interface accepts output datagrams of a specified maximum
* length, and provides higher level routines with input datagrams
* received from its medium.
*
* Output occurs when the routine if_output is called, with three parameters:
* (*ifp->if_output)(ifp, m, dst, rt)
* Here m is the mbuf chain to be sent and dst is the destination address.
* The output routine encapsulates the supplied datagram if necessary,
* and then transmits it on its medium.
*
* On input, each interface unwraps the data received by it, and either
* places it on the input queue of an internetwork datagram routine
* and posts the associated software interrupt, or passes the datagram to a raw
* packet input routine.
*
* Routines exist for locating interfaces by their addresses
* or for locating an interface on a certain network, as well as more general
* routing and gateway routines maintaining information used to locate
* interfaces. These routines live in the files if.c and route.c
*/
#ifdef __STDC__
/*
* Forward structure declarations for function prototypes [sic].
*/
struct mbuf;
struct thread;
struct rtentry;
struct rt_addrinfo;
struct socket;
struct ether_header;
struct carp_if;
struct ifvlantrunk;
struct route;
struct vnet;
#endif
#include <sys/queue.h> /* get TAILQ macros */
#ifdef _KERNEL
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <sys/buf_ring.h>
#include <net/vnet.h>
#endif /* _KERNEL */
#include <sys/lock.h> /* XXX */
#include <sys/mutex.h> /* XXX */
#include <sys/rwlock.h> /* XXX */
#include <sys/sx.h> /* XXX */
#include <sys/event.h> /* XXX */
#include <sys/_task.h>
#define IF_DUNIT_NONE -1
#include <altq/if_altq.h>
TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */
TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */
TAILQ_HEAD(ifprefixhead, ifprefix);
TAILQ_HEAD(ifmultihead, ifmultiaddr);
TAILQ_HEAD(ifgrouphead, ifg_group);
/*
* Structure defining a queue for a network interface.
*/
struct ifqueue {
struct mbuf *ifq_head;
struct mbuf *ifq_tail;
int ifq_len;
int ifq_maxlen;
int ifq_drops;
struct mtx ifq_mtx;
};
/*
* Structure defining a network interface.
*
* (Would like to call this struct ``if'', but C isn't PL/1.)
*/
struct ifnet {
void *if_softc; /* pointer to driver state */
void *if_l2com; /* pointer to protocol bits */
struct vnet *if_vnet; /* pointer to network stack instance */
TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */
char if_xname[IFNAMSIZ]; /* external name (name + unit) */
const char *if_dname; /* driver name */
int if_dunit; /* unit or IF_DUNIT_NONE */
u_int if_refcount; /* reference count */
struct ifaddrhead if_addrhead; /* linked list of addresses per if */
/*
* if_addrhead is the list of all addresses associated to
* an interface.
* Some code in the kernel assumes that first element
* of the list has type AF_LINK, and contains sockaddr_dl
* addresses which store the link-level address and the name
* of the interface.
* However, access to the AF_LINK address through this
* field is deprecated. Use if_addr or ifaddr_byindex() instead.
*/
int if_pcount; /* number of promiscuous listeners */
struct carp_if *if_carp; /* carp interface structure */
struct bpf_if *if_bpf; /* packet filter structure */
u_short if_index; /* numeric abbreviation for this if */
short if_timer; /* time 'til if_watchdog called */
struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
int if_flags; /* up/down, broadcast, etc. */
int if_capabilities; /* interface features & capabilities */
int if_capenable; /* enabled features & capabilities */
void *if_linkmib; /* link-type-specific MIB data */
size_t if_linkmiblen; /* length of above data */
struct if_data if_data;
struct ifmultihead if_multiaddrs; /* multicast addresses configured */
int if_amcount; /* number of all-multicast requests */
/* procedure handles */
int (*if_output) /* output routine (enqueue) */
(struct ifnet *, struct mbuf *, struct sockaddr *,
struct route *);
void (*if_input) /* input routine (from h/w driver) */
(struct ifnet *, struct mbuf *);
void (*if_start) /* initiate output routine */
(struct ifnet *);
int (*if_ioctl) /* ioctl routine */
(struct ifnet *, u_long, caddr_t);
void (*if_watchdog) /* timer routine */
(struct ifnet *);
void (*if_init) /* Init routine */
(void *);
int (*if_resolvemulti) /* validate/resolve multicast */
(struct ifnet *, struct sockaddr **, struct sockaddr *);
void (*if_qflush) /* flush any queues */
(struct ifnet *);
int (*if_transmit) /* initiate output routine */
(struct ifnet *, struct mbuf *);
void (*if_reassign) /* reassign to vnet routine */
(struct ifnet *, struct vnet *, char *);
struct vnet *if_home_vnet; /* where this ifnet originates from */
struct ifaddr *if_addr; /* pointer to link-level address */
void *if_llsoftc; /* link layer softc */
int if_drv_flags; /* driver-managed status flags */
struct ifaltq if_snd; /* output queue (includes altq) */
const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
void *if_bridge; /* bridge glue */
struct label *if_label; /* interface MAC label */
/* these are only used by IPv6 */
struct ifprefixhead if_prefixhead; /* list of prefixes per if */
void *if_afdata[AF_MAX];
int if_afdata_initialized;
struct rwlock if_afdata_lock;
struct task if_linktask; /* task for link change events */
struct mtx if_addr_mtx; /* mutex to protect address lists */
LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */
TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */
/* protected by if_addr_mtx */
void *if_pf_kif;
void *if_lagg; /* lagg glue */
u_char if_alloctype; /* if_type at time of allocation */
/*
* Spare fields are added so that we can modify sensitive data
* structures without changing the kernel binary interface, and must
* be used with care where binary compatibility is required.
*/
char if_cspare[3];
char *if_description; /* interface description */
void *if_pspare[7];
int if_ispare[4];
};
typedef void if_init_f_t(void *);
/*
* XXX These aliases are terribly dangerous because they could apply
* to anything.
*/
#define if_mtu if_data.ifi_mtu
#define if_type if_data.ifi_type
#define if_physical if_data.ifi_physical
#define if_addrlen if_data.ifi_addrlen
#define if_hdrlen if_data.ifi_hdrlen
#define if_metric if_data.ifi_metric
#define if_link_state if_data.ifi_link_state
#define if_baudrate if_data.ifi_baudrate
#define if_hwassist if_data.ifi_hwassist
#define if_ipackets if_data.ifi_ipackets
#define if_ierrors if_data.ifi_ierrors
#define if_opackets if_data.ifi_opackets
#define if_oerrors if_data.ifi_oerrors
#define if_collisions if_data.ifi_collisions
#define if_ibytes if_data.ifi_ibytes
#define if_obytes if_data.ifi_obytes
#define if_imcasts if_data.ifi_imcasts
#define if_omcasts if_data.ifi_omcasts
#define if_iqdrops if_data.ifi_iqdrops
#define if_noproto if_data.ifi_noproto
#define if_lastchange if_data.ifi_lastchange
/* for compatibility with other BSDs */
#define if_addrlist if_addrhead
#define if_list if_link
#define if_name(ifp) ((ifp)->if_xname)
/*
* Locks for address lists on the network interface.
*/
#define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_mtx, \
"if_addr_mtx", NULL, MTX_DEF)
#define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_mtx)
#define IF_ADDR_LOCK(if) mtx_lock(&(if)->if_addr_mtx)
#define IF_ADDR_UNLOCK(if) mtx_unlock(&(if)->if_addr_mtx)
#define IF_ADDR_LOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED)
/*
* Function variations on locking macros intended to be used by loadable
* kernel modules in order to divorce them from the internals of address list
* locking.
*/
void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */
void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */
void if_maddr_rlock(struct ifnet *ifp); /* if_multiaddrs */
void if_maddr_runlock(struct ifnet *ifp); /* if_multiaddrs */
/*
* Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq)
* are queues of messages stored on ifqueue structures
* (defined above). Entries are added to and deleted from these structures
* by these macros, which should be called with ipl raised to splimp().
*/
#define IF_LOCK(ifq) mtx_lock(&(ifq)->ifq_mtx)
#define IF_UNLOCK(ifq) mtx_unlock(&(ifq)->ifq_mtx)
#define IF_LOCK_ASSERT(ifq) mtx_assert(&(ifq)->ifq_mtx, MA_OWNED)
#define _IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen)
#define _IF_DROP(ifq) ((ifq)->ifq_drops++)
#define _IF_QLEN(ifq) ((ifq)->ifq_len)
#define _IF_ENQUEUE(ifq, m) do { \
(m)->m_nextpkt = NULL; \
if ((ifq)->ifq_tail == NULL) \
(ifq)->ifq_head = m; \
else \
(ifq)->ifq_tail->m_nextpkt = m; \
(ifq)->ifq_tail = m; \
(ifq)->ifq_len++; \
} while (0)
#define IF_ENQUEUE(ifq, m) do { \
IF_LOCK(ifq); \
_IF_ENQUEUE(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_PREPEND(ifq, m) do { \
(m)->m_nextpkt = (ifq)->ifq_head; \
if ((ifq)->ifq_tail == NULL) \
(ifq)->ifq_tail = (m); \
(ifq)->ifq_head = (m); \
(ifq)->ifq_len++; \
} while (0)
#define IF_PREPEND(ifq, m) do { \
IF_LOCK(ifq); \
_IF_PREPEND(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_DEQUEUE(ifq, m) do { \
(m) = (ifq)->ifq_head; \
if (m) { \
if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \
(ifq)->ifq_tail = NULL; \
(m)->m_nextpkt = NULL; \
(ifq)->ifq_len--; \
} \
} while (0)
#define IF_DEQUEUE(ifq, m) do { \
IF_LOCK(ifq); \
_IF_DEQUEUE(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head)
#define IF_POLL(ifq, m) _IF_POLL(ifq, m)
#define _IF_DRAIN(ifq) do { \
struct mbuf *m; \
for (;;) { \
_IF_DEQUEUE(ifq, m); \
if (m == NULL) \
break; \
m_freem(m); \
} \
} while (0)
#define IF_DRAIN(ifq) do { \
IF_LOCK(ifq); \
_IF_DRAIN(ifq); \
IF_UNLOCK(ifq); \
} while(0)
#ifdef _KERNEL
/* interface link layer address change event */
typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t);
/* interface address change event */
typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
/* new interface arrival event */
typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
/* interface departure event */
typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
/*
* interface groups
*/
struct ifg_group {
char ifg_group[IFNAMSIZ];
u_int ifg_refcnt;
void *ifg_pf_kif;
TAILQ_HEAD(, ifg_member) ifg_members;
TAILQ_ENTRY(ifg_group) ifg_next;
};
struct ifg_member {
TAILQ_ENTRY(ifg_member) ifgm_next;
struct ifnet *ifgm_ifp;
};
struct ifg_list {
struct ifg_group *ifgl_group;
TAILQ_ENTRY(ifg_list) ifgl_next;
};
/* group attach event */
typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
/* group detach event */
typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
/* group change event */
typedef void (*group_change_event_handler_t)(void *, const char *);
EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
#define IF_AFDATA_LOCK_INIT(ifp) \
rw_init(&(ifp)->if_afdata_lock, "if_afdata")
#define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_WUNLOCK(ifp) rw_wunlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp)
#define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp)
#define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED)
#define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED)
int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp,
int adjust);
#define IF_HANDOFF(ifq, m, ifp) \
if_handoff((struct ifqueue *)ifq, m, ifp, 0)
#define IF_HANDOFF_ADJ(ifq, m, ifp, adj) \
if_handoff((struct ifqueue *)ifq, m, ifp, adj)
void if_start(struct ifnet *);
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
IF_LOCK(ifq); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_ENQUEUE(ifq, m, NULL, err); \
else { \
if (_IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
_IF_ENQUEUE(ifq, m); \
(err) = 0; \
} \
} \
if (err) \
(ifq)->ifq_drops++; \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_DEQUEUE_NOLOCK(ifq, m) \
do { \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue_ptr(ifq, ALTDQ_REMOVE); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_DEQUEUE(ifq, m); \
else \
_IF_DEQUEUE(ifq, m); \
} while (0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
IF_LOCK(ifq); \
IFQ_DEQUEUE_NOLOCK(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_POLL_NOLOCK(ifq, m) \
do { \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue_ptr(ifq, ALTDQ_POLL); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_POLL(ifq, m); \
else \
_IF_POLL(ifq, m); \
} while (0)
#define IFQ_POLL(ifq, m) \
do { \
IF_LOCK(ifq); \
IFQ_POLL_NOLOCK(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_PURGE_NOLOCK(ifq) \
do { \
if (ALTQ_IS_ENABLED(ifq)) { \
ALTQ_PURGE(ifq); \
} else \
_IF_DRAIN(ifq); \
} while (0)
#define IFQ_PURGE(ifq) \
do { \
IF_LOCK(ifq); \
IFQ_PURGE_NOLOCK(ifq); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_SET_READY(ifq) \
do { ((ifq)->altq_flags |= ALTQF_READY); } while (0)
#define IFQ_LOCK(ifq) IF_LOCK(ifq)
#define IFQ_UNLOCK(ifq) IF_UNLOCK(ifq)
#define IFQ_LOCK_ASSERT(ifq) IF_LOCK_ASSERT(ifq)
#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0)
#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++)
#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len)
#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++)
#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len))
/*
* The IFF_DRV_OACTIVE test should really occur in the device driver, not in
* the handoff logic, as that flag is locked by the device driver.
*/
#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \
do { \
int len; \
short mflags; \
\
len = (m)->m_pkthdr.len; \
mflags = (m)->m_flags; \
IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \
if ((err) == 0) { \
(ifp)->if_obytes += len + (adj); \
if (mflags & M_MCAST) \
(ifp)->if_omcasts++; \
if (((ifp)->if_drv_flags & IFF_DRV_OACTIVE) == 0) \
if_start(ifp); \
} \
} while (0)
#define IFQ_HANDOFF(ifp, m, err) \
IFQ_HANDOFF_ADJ(ifp, m, 0, err)
#define IFQ_DRV_DEQUEUE(ifq, m) \
do { \
(m) = (ifq)->ifq_drv_head; \
if (m) { \
if (((ifq)->ifq_drv_head = (m)->m_nextpkt) == NULL) \
(ifq)->ifq_drv_tail = NULL; \
(m)->m_nextpkt = NULL; \
(ifq)->ifq_drv_len--; \
} else { \
IFQ_LOCK(ifq); \
IFQ_DEQUEUE_NOLOCK(ifq, m); \
while ((ifq)->ifq_drv_len < (ifq)->ifq_drv_maxlen) { \
struct mbuf *m0; \
IFQ_DEQUEUE_NOLOCK(ifq, m0); \
if (m0 == NULL) \
break; \
m0->m_nextpkt = NULL; \
if ((ifq)->ifq_drv_tail == NULL) \
(ifq)->ifq_drv_head = m0; \
else \
(ifq)->ifq_drv_tail->m_nextpkt = m0; \
(ifq)->ifq_drv_tail = m0; \
(ifq)->ifq_drv_len++; \
} \
IFQ_UNLOCK(ifq); \
} \
} while (0)
#define IFQ_DRV_PREPEND(ifq, m) \
do { \
(m)->m_nextpkt = (ifq)->ifq_drv_head; \
if ((ifq)->ifq_drv_tail == NULL) \
(ifq)->ifq_drv_tail = (m); \
(ifq)->ifq_drv_head = (m); \
(ifq)->ifq_drv_len++; \
} while (0)
#define IFQ_DRV_IS_EMPTY(ifq) \
(((ifq)->ifq_drv_len == 0) && ((ifq)->ifq_len == 0))
#define IFQ_DRV_PURGE(ifq) \
do { \
struct mbuf *m, *n = (ifq)->ifq_drv_head; \
while((m = n) != NULL) { \
n = m->m_nextpkt; \
m_freem(m); \
} \
(ifq)->ifq_drv_head = (ifq)->ifq_drv_tail = NULL; \
(ifq)->ifq_drv_len = 0; \
IFQ_PURGE(ifq); \
} while (0)
#ifdef _KERNEL
static __inline void
drbr_stats_update(struct ifnet *ifp, int len, int mflags)
{
#ifndef NO_SLOW_STATS
ifp->if_obytes += len;
if (mflags & M_MCAST)
ifp->if_omcasts++;
#endif
}
static __inline int
drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
{
int error = 0;
int len = m->m_pkthdr.len;
int mflags = m->m_flags;
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_ENQUEUE(&ifp->if_snd, m, error);
return (error);
}
#endif
if ((error = buf_ring_enqueue_bytes(br, m, len)) == ENOBUFS) {
br->br_drops++;
m_freem(m);
} else
drbr_stats_update(ifp, len, mflags);
return (error);
}
static __inline void
drbr_flush(struct ifnet *ifp, struct buf_ring *br)
{
struct mbuf *m;
#ifdef ALTQ
if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd))
IFQ_PURGE(&ifp->if_snd);
#endif
while ((m = buf_ring_dequeue_sc(br)) != NULL)
m_freem(m);
}
static __inline void
drbr_free(struct buf_ring *br, struct malloc_type *type)
{
drbr_flush(NULL, br);
buf_ring_free(br, type);
}
static __inline struct mbuf *
drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
struct mbuf *m;
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_DEQUEUE(&ifp->if_snd, m);
return (m);
}
#endif
return (buf_ring_dequeue_sc(br));
}
static __inline struct mbuf *
drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
int (*func) (struct mbuf *, void *), void *arg)
{
struct mbuf *m;
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_LOCK(&ifp->if_snd);
IFQ_POLL_NOLOCK(&ifp->if_snd, m);
if (m != NULL && func(m, arg) == 0) {
IFQ_UNLOCK(&ifp->if_snd);
return (NULL);
}
IFQ_DEQUEUE_NOLOCK(&ifp->if_snd, m);
IFQ_UNLOCK(&ifp->if_snd);
return (m);
}
#endif
m = buf_ring_peek(br);
if (m == NULL || func(m, arg) == 0)
return (NULL);
return (buf_ring_dequeue_sc(br));
}
static __inline int
drbr_empty(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
return (IFQ_IS_EMPTY(&ifp->if_snd));
#endif
return (buf_ring_empty(br));
}
static __inline int
drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
return (1);
#endif
return (!buf_ring_empty(br));
}
static __inline int
drbr_inuse(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
return (ifp->if_snd.ifq_len);
#endif
return (buf_ring_count(br));
}
#endif
/*
* 72 was chosen below because it is the size of a TCP/IP
* header (40) + the minimum mss (32).
*/
#define IF_MINMTU 72
#define IF_MAXMTU 65535
#endif /* _KERNEL */
/*
* The ifaddr structure contains information about one address
* of an interface. They are maintained by the different address families,
* are allocated and attached when an address is set, and are linked
* together so all addresses for an interface can be located.
*
* NOTE: a 'struct ifaddr' is always at the beginning of a larger
* chunk of malloc'ed memory, where we store the three addresses
* (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
*/
struct ifaddr {
struct sockaddr *ifa_addr; /* address of interface */
struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */
#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */
struct sockaddr *ifa_netmask; /* used to determine subnet */
struct if_data if_data; /* not all members are meaningful */
struct ifnet *ifa_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
(int, struct rtentry *, struct rt_addrinfo *);
u_short ifa_flags; /* mostly rt_flags for cloning */
u_int ifa_refcnt; /* references to this structure */
int ifa_metric; /* cost of going out this interface */
int (*ifa_claim_addr) /* check if an addr goes to this if */
(struct ifaddr *, struct sockaddr *);
struct mtx ifa_mtx;
};
#define IFA_ROUTE RTF_UP /* route installed */
#define IFA_RTSELF RTF_HOST /* loopback route to self installed */
/* for compatibility with other BSDs */
#define ifa_list ifa_link
#ifdef _KERNEL
#define IFA_LOCK(ifa) mtx_lock(&(ifa)->ifa_mtx)
#define IFA_UNLOCK(ifa) mtx_unlock(&(ifa)->ifa_mtx)
void ifa_free(struct ifaddr *ifa);
void ifa_init(struct ifaddr *ifa);
void ifa_ref(struct ifaddr *ifa);
#endif
/*
* The prefix structure contains information about one prefix
* of an interface. They are maintained by the different address families,
* are allocated and attached when a prefix or an address is set,
* and are linked together so all prefixes for an interface can be located.
*/
struct ifprefix {
struct sockaddr *ifpr_prefix; /* prefix of interface */
struct ifnet *ifpr_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifprefix) ifpr_list; /* queue macro glue */
u_char ifpr_plen; /* prefix length in bits */
u_char ifpr_type; /* protocol dependent prefix type */
};
/*
* Multicast address structure. This is analogous to the ifaddr
* structure except that it keeps track of multicast addresses.
*/
struct ifmultiaddr {
TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
struct sockaddr *ifma_addr; /* address this membership is for */
struct sockaddr *ifma_lladdr; /* link-layer translation, if any */
struct ifnet *ifma_ifp; /* back-pointer to interface */
u_int ifma_refcount; /* reference count */
void *ifma_protospec; /* protocol-specific state, if any */
struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
};
#ifdef _KERNEL
extern struct rwlock ifnet_rwlock;
extern struct sx ifnet_sxlock;
#define IFNET_LOCK_INIT() do { \
rw_init_flags(&ifnet_rwlock, "ifnet_rw", RW_RECURSE); \
sx_init_flags(&ifnet_sxlock, "ifnet_sx", SX_RECURSE); \
} while(0)
#define IFNET_WLOCK() do { \
sx_xlock(&ifnet_sxlock); \
rw_wlock(&ifnet_rwlock); \
} while (0)
#define IFNET_WUNLOCK() do { \
rw_wunlock(&ifnet_rwlock); \
sx_xunlock(&ifnet_sxlock); \
} while (0)
/*
* To assert the ifnet lock, you must know not only whether it's for read or
* write, but also whether it was acquired with sleep support or not.
*/
#define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED)
#define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED)
#define IFNET_WLOCK_ASSERT() do { \
sx_assert(&ifnet_sxlock, SA_XLOCKED); \
rw_assert(&ifnet_rwlock, RA_WLOCKED); \
} while (0)
#define IFNET_RLOCK() sx_slock(&ifnet_sxlock)
#define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock)
#define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock)
#define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock)
/*
* Look up an ifnet given its index; the _ref variant also acquires a
* reference that must be freed using if_rele(). It is almost always a bug
* to call ifnet_byindex() instead if ifnet_byindex_ref().
*/
struct ifnet *ifnet_byindex(u_short idx);
struct ifnet *ifnet_byindex_locked(u_short idx);
struct ifnet *ifnet_byindex_ref(u_short idx);
/*
* Given the index, ifaddr_byindex() returns the one and only
* link-level ifaddr for the interface. You are not supposed to use
* it to traverse the list of addresses associated to the interface.
*/
struct ifaddr *ifaddr_byindex(u_short idx);
VNET_DECLARE(struct ifnethead, ifnet);
VNET_DECLARE(struct ifgrouphead, ifg_head);
VNET_DECLARE(int, if_index);
VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */
VNET_DECLARE(int, useloopback);
#define V_ifnet VNET(ifnet)
#define V_ifg_head VNET(ifg_head)
#define V_if_index VNET(if_index)
#define V_loif VNET(loif)
#define V_useloopback VNET(useloopback)
extern int ifqmaxlen;
int if_addgroup(struct ifnet *, const char *);
int if_delgroup(struct ifnet *, const char *);
int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
int if_allmulti(struct ifnet *, int);
struct ifnet* if_alloc(u_char);
void if_attach(struct ifnet *);
void if_dead(struct ifnet *);
int if_delmulti(struct ifnet *, struct sockaddr *);
void if_delmulti_ifma(struct ifmultiaddr *);
void if_detach(struct ifnet *);
void if_vmove(struct ifnet *, struct vnet *);
void if_purgeaddrs(struct ifnet *);
void if_delallmulti(struct ifnet *);
void if_down(struct ifnet *);
struct ifmultiaddr *
if_findmulti(struct ifnet *, struct sockaddr *);
void if_free(struct ifnet *);
void if_free_type(struct ifnet *, u_char);
void if_initname(struct ifnet *, const char *, int);
void if_link_state_change(struct ifnet *, int);
int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
void if_qflush(struct ifnet *);
void if_ref(struct ifnet *);
void if_rele(struct ifnet *);
int if_setlladdr(struct ifnet *, const u_char *, int);
void if_up(struct ifnet *);
int ifioctl(struct socket *, u_long, caddr_t, struct thread *);
int ifpromisc(struct ifnet *, int);
struct ifnet *ifunit(const char *);
struct ifnet *ifunit_ref(const char *);
void ifq_init(struct ifaltq *, struct ifnet *ifp);
void ifq_delete(struct ifaltq *);
int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *);
int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *);
struct ifaddr *ifa_ifwithaddr(struct sockaddr *);
int ifa_ifwithaddr_check(struct sockaddr *);
struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *);
struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *);
-struct ifaddr *ifa_ifwithnet(struct sockaddr *);
+struct ifaddr *ifa_ifwithnet(struct sockaddr *, int);
struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *);
struct ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int);
struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *);
int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp);
typedef void if_com_free_t(void *com, u_char type);
void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
void if_deregister_com_alloc(u_char type);
#define IF_LLADDR(ifp) \
LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
#ifdef DEVICE_POLLING
enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
typedef int poll_handler_t(struct ifnet *ifp, enum poll_cmd cmd, int count);
int ether_poll_register(poll_handler_t *h, struct ifnet *ifp);
int ether_poll_deregister(struct ifnet *ifp);
#endif /* DEVICE_POLLING */
#endif /* _KERNEL */
#endif /* !_NET_IF_VAR_H_ */
Index: stable/8/sys/net/route.c
===================================================================
--- stable/8/sys/net/route.c (revision 209276)
+++ stable/8/sys/net/route.c (revision 209277)
@@ -1,1599 +1,1599 @@
/*-
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3.1.1 (Berkeley) 2/23/95
* $FreeBSD$
*/
/************************************************************************
* Note: In this file a 'fib' is a "forwarding information base" *
* Which is the new name for an in kernel routing (next hop) table. *
***********************************************************************/
#include "opt_inet.h"
#include "opt_route.h"
#include "opt_mrouting.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <net/flowtable.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <vm/uma.h>
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
* Allow the boot code to allow LESS than RT_MAXFIBS to be used.
* We can't do more because storage is statically allocated for now.
* (for compatibility reasons.. this will change).
*/
TUNABLE_INT("net.fibs", &rt_numfibs);
/*
* By default add routes to all fibs for new interfaces.
* Once this is set to 0 then only allocate routes on interface
* changes for the FIB of the caller when adding a new set of addresses
* to an interface. XXX this is a shotgun aproach to a problem that needs
* a more fine grained solution.. that will come.
*/
u_int rt_add_addr_allfibs = 1;
SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
&rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
VNET_DEFINE(struct rtstat, rtstat);
#define V_rtstat VNET(rtstat)
VNET_DEFINE(struct radix_node_head *, rt_tables);
#define V_rt_tables VNET(rt_tables)
VNET_DEFINE(int, rttrash); /* routes not in table but not freed */
#define V_rttrash VNET(rttrash)
/* compare two sockaddr structures */
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
/*
* Convert a 'struct radix_node *' to a 'struct rtentry *'.
* The operation can be done safely (in this code) because a
* 'struct rtentry' starts with two 'struct radix_node''s, the first
* one representing leaf nodes in the routing tree, which is
* what the code in radix.c passes us as a 'struct radix_node'.
*
* But because there are a lot of assumptions in this conversion,
* do not cast explicitly, but always use the macro below.
*/
#define RNTORT(p) ((struct rtentry *)(p))
static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */
#define V_rtzone VNET(rtzone)
#if 0
/* default fib for tunnels to use */
u_int tunnel_fib = 0;
SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, "");
#endif
/*
* handler for net.my_fibnum
*/
static int
sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
{
int fibnum;
int error;
fibnum = curthread->td_proc->p_fibnum;
error = sysctl_handle_int(oidp, &fibnum, 0, req);
return (error);
}
SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
static __inline struct radix_node_head **
rt_tables_get_rnh_ptr(int table, int fam)
{
struct radix_node_head **rnh;
KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
__func__));
KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
__func__));
/* rnh is [fib=0][af=0]. */
rnh = (struct radix_node_head **)V_rt_tables;
/* Get the offset to the requested table and fam. */
rnh += table * (AF_MAX+1) + fam;
return (rnh);
}
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
return (*rt_tables_get_rnh_ptr(table, fam));
}
/*
* route initialization must occur before ip6_init2(), which happenas at
* SI_ORDER_MIDDLE.
*/
static void
route_init(void)
{
struct domain *dom;
int max_keylen = 0;
/* whack the tunable ints into line. */
if (rt_numfibs > RT_MAXFIBS)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
for (dom = domains; dom; dom = dom->dom_next)
if (dom->dom_maxrtkey > max_keylen)
max_keylen = dom->dom_maxrtkey;
rn_init(max_keylen); /* init all zeroes, all ones, mask table */
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
static void
vnet_route_init(const void *unused __unused)
{
struct domain *dom;
struct radix_node_head **rnh;
int table;
int fam;
V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtattach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* for now only AF_INET has > 1 table */
/* XXX MRT
* rtattach will be also called
* from vfs_export.c but the
* offset will be 0
* (only for AF_INET and AF_INET6
* which don't need it anyhow)
*/
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtattach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
vnet_route_init, 0);
#ifdef VIMAGE
static void
vnet_route_uninit(const void *unused __unused)
{
int table;
int fam;
struct domain *dom;
struct radix_node_head **rnh;
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtdetach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* For now only AF_INET has > 1 tbl. */
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtdetach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
vnet_route_uninit, 0);
#endif
#ifndef _SYS_SYSPROTO_H_
struct setfib_args {
int fibnum;
};
#endif
int
setfib(struct thread *td, struct setfib_args *uap)
{
if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
return EINVAL;
td->td_proc->p_fibnum = uap->fibnum;
return (0);
}
/*
* Packet routing routines.
*/
void
rtalloc(struct route *ro)
{
rtalloc_ign_fib(ro, 0UL, 0);
}
void
rtalloc_fib(struct route *ro, u_int fibnum)
{
rtalloc_ign_fib(ro, 0UL, fibnum);
}
void
rtalloc_ign(struct route *ro, u_long ignore)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
void
rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
/*
* Look up the route that matches the address given
* Or, at least try.. Create a cloned route if needed.
*
* The returned route, if any, is locked.
*/
struct rtentry *
rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
{
return (rtalloc1_fib(dst, report, ignflags, 0));
}
struct rtentry *
rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
u_int fibnum)
{
struct radix_node_head *rnh;
struct rtentry *rt;
struct radix_node *rn;
struct rtentry *newrt;
struct rt_addrinfo info;
int err = 0, msgtype = RTM_MISS;
int needlock;
KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
newrt = NULL;
/*
* Look up the address in the table for that Address Family
*/
if (rnh == NULL) {
V_rtstat.rts_unreach++;
goto miss;
}
needlock = !(ignflags & RTF_RNH_LOCKED);
if (needlock)
RADIX_NODE_HEAD_RLOCK(rnh);
#ifdef INVARIANTS
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#endif
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
newrt = rt = RNTORT(rn);
RT_LOCK(newrt);
RT_ADDREF(newrt);
if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
goto done;
} else if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
/*
* Either we hit the root or couldn't find any match,
* Which basically means
* "caint get there frm here"
*/
V_rtstat.rts_unreach++;
miss:
if (report) {
/*
* If required, report the failure to the supervising
* Authorities.
* For a delete, this is not an error. (report == 0)
*/
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
rt_missmsg(msgtype, &info, 0, err);
}
done:
if (newrt)
RT_LOCK_ASSERT(newrt);
return (newrt);
}
/*
* Remove a reference count from an rtentry.
* If the count gets low enough, take it out of the routing table
*/
void
rtfree(struct rtentry *rt)
{
struct radix_node_head *rnh;
KASSERT(rt != NULL,("%s: NULL rt", __func__));
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
RT_LOCK_ASSERT(rt);
/*
* The callers should use RTFREE_LOCKED() or RTFREE(), so
* we should come here exactly with the last reference.
*/
RT_REMREF(rt);
if (rt->rt_refcnt > 0) {
log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
goto done;
}
/*
* On last reference give the "close method" a chance
* to cleanup private state. This also permits (for
* IPv4 and IPv6) a chance to decide if the routing table
* entry should be purged immediately or at a later time.
* When an immediate purge is to happen the close routine
* typically calls rtexpunge which clears the RTF_UP flag
* on the entry so that the code below reclaims the storage.
*/
if (rt->rt_refcnt == 0 && rnh->rnh_close)
rnh->rnh_close((struct radix_node *)rt, rnh);
/*
* If we are no longer "up" (and ref == 0)
* then we can free the resources associated
* with the route.
*/
if ((rt->rt_flags & RTF_UP) == 0) {
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
/*
* the rtentry must have been removed from the routing table
* so it is represented in rttrash.. remove that now.
*/
V_rttrash--;
#ifdef DIAGNOSTIC
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
goto done;
}
#endif
/*
* release references on items we hold them on..
* e.g other routes and ifaddrs.
*/
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
/*
* The key is separatly alloc'd so free it (see rt_setgate()).
* This also frees the gateway, as they are always malloc'd
* together.
*/
Free(rt_key(rt));
/*
* and the rtentry itself of course
*/
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
return;
}
done:
RT_UNLOCK(rt);
}
/*
* Force a routing table entry to the specified
* destination to go through the given gateway.
* Normally called as a result of a routing redirect
* message from the network layer.
*/
void
rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src)
{
rtredirect_fib(dst, gateway, netmask, flags, src, 0);
}
void
rtredirect_fib(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src,
u_int fibnum)
{
struct rtentry *rt, *rt0 = NULL;
int error = 0;
short *stat = NULL;
struct rt_addrinfo info;
struct ifaddr *ifa;
struct radix_node_head *rnh;
ifa = NULL;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL) {
error = EAFNOSUPPORT;
goto out;
}
/* verify the gateway is directly reachable */
- if ((ifa = ifa_ifwithnet(gateway)) == NULL) {
+ if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
error = ENETUNREACH;
goto out;
}
rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */
/*
* If the redirect isn't from our current router for this dst,
* it's either old or wrong. If it redirects us to ourselves,
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
if (!(flags & RTF_DONE) && rt &&
(!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
error = EINVAL;
else if (ifa_ifwithaddr_check(gateway))
error = EHOSTUNREACH;
if (error)
goto done;
/*
* Create a new entry if we just got back a wildcard entry
* or the the lookup failed. This is necessary for hosts
* which use routing redirects generated by smart gateways
* to dynamically build the routing tables.
*/
if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
goto create;
/*
* Don't listen to the redirect if it's
* for a route to an interface.
*/
if (rt->rt_flags & RTF_GATEWAY) {
if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
/*
* Changing from route to net => route to host.
* Create new route, rather than smashing route to net.
*/
create:
rt0 = rt;
rt = NULL;
flags |= RTF_GATEWAY | RTF_DYNAMIC;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_ifa = ifa;
info.rti_flags = flags;
if (rt0 != NULL)
RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */
error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
if (rt != NULL) {
RT_LOCK(rt);
if (rt0 != NULL)
EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
flags = rt->rt_flags;
}
if (rt0 != NULL)
RTFREE(rt0);
stat = &V_rtstat.rts_dynamic;
} else {
struct rtentry *gwrt;
/*
* Smash the current notion of the gateway to
* this destination. Should check about netmask!!!
*/
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
stat = &V_rtstat.rts_newgateway;
/*
* add the key and gateway (in one malloc'd chunk).
*/
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
rt_setgate(rt, rt_key(rt), gateway);
gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
RADIX_NODE_HEAD_UNLOCK(rnh);
EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
RTFREE_LOCKED(gwrt);
}
} else
error = EHOSTUNREACH;
done:
if (rt)
RTFREE_LOCKED(rt);
out:
if (error)
V_rtstat.rts_badredirect++;
else if (stat != NULL)
(*stat)++;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_info[RTAX_AUTHOR] = src;
rt_missmsg(RTM_REDIRECT, &info, flags, error);
if (ifa != NULL)
ifa_free(ifa);
}
int
rtioctl(u_long req, caddr_t data)
{
return (rtioctl_fib(req, data, 0));
}
/*
* Routing table ioctl interface.
*/
int
rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
{
/*
* If more ioctl commands are added here, make sure the proper
* super-user checks are being performed because it is possible for
* prison-root to make it this far if raw sockets have been enabled
* in jails.
*/
#ifdef INET
/* Multicast goop, grrr... */
return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
#else /* INET */
return ENXIO;
#endif /* INET */
}
/*
* For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
*/
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
{
return (ifa_ifwithroute_fib(flags, dst, gateway, 0));
}
struct ifaddr *
ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
u_int fibnum)
{
register struct ifaddr *ifa;
int not_found = 0;
if ((flags & RTF_GATEWAY) == 0) {
/*
* If we are adding a route to an interface,
* and the interface is a pt to pt link
* we should search for the destination
* as our clue to the interface. Otherwise
* we can use the local address.
*/
ifa = NULL;
if (flags & RTF_HOST)
ifa = ifa_ifwithdstaddr(dst);
if (ifa == NULL)
ifa = ifa_ifwithaddr(gateway);
} else {
/*
* If we are adding a route to a remote net
* or host, the gateway may still be on the
* other end of a pt to pt link.
*/
ifa = ifa_ifwithdstaddr(gateway);
}
if (ifa == NULL)
- ifa = ifa_ifwithnet(gateway);
+ ifa = ifa_ifwithnet(gateway, 0);
if (ifa == NULL) {
struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
if (rt == NULL)
return (NULL);
/*
* dismiss a gateway that is reachable only
* through the default router
*/
switch (gateway->sa_family) {
case AF_INET:
if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
not_found = 1;
break;
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
not_found = 1;
break;
default:
break;
}
if (!not_found && rt->rt_ifa != NULL) {
ifa = rt->rt_ifa;
ifa_ref(ifa);
}
RT_REMREF(rt);
RT_UNLOCK(rt);
if (not_found || ifa == NULL)
return (NULL);
}
if (ifa->ifa_addr->sa_family != dst->sa_family) {
struct ifaddr *oifa = ifa;
ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
if (ifa == NULL)
ifa = oifa;
else
ifa_free(oifa);
}
return (ifa);
}
/*
* Do appropriate manipulations of a routing tree given
* all the bits of info needed
*/
int
rtrequest(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt)
{
return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0));
}
int
rtrequest_fib(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt,
u_int fibnum)
{
struct rt_addrinfo info;
if (dst->sa_len == 0)
return(EINVAL);
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
return rtrequest1_fib(req, &info, ret_nrt, fibnum);
}
/*
* These (questionable) definitions of apparent local variables apply
* to the next two functions. XXXXXX!!!
*/
#define dst info->rti_info[RTAX_DST]
#define gateway info->rti_info[RTAX_GATEWAY]
#define netmask info->rti_info[RTAX_NETMASK]
#define ifaaddr info->rti_info[RTAX_IFA]
#define ifpaddr info->rti_info[RTAX_IFP]
#define flags info->rti_flags
int
rt_getifa(struct rt_addrinfo *info)
{
return (rt_getifa_fib(info, 0));
}
/*
* Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined,
* it will be referenced so the caller must free it.
*/
int
rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
{
struct ifaddr *ifa;
int error = 0;
/*
* ifp may be specified by sockaddr_dl
* when protocol address is ambiguous.
*/
if (info->rti_ifp == NULL && ifpaddr != NULL &&
ifpaddr->sa_family == AF_LINK &&
- (ifa = ifa_ifwithnet(ifpaddr)) != NULL) {
+ (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
info->rti_ifp = ifa->ifa_ifp;
ifa_free(ifa);
}
if (info->rti_ifa == NULL && ifaaddr != NULL)
info->rti_ifa = ifa_ifwithaddr(ifaaddr);
if (info->rti_ifa == NULL) {
struct sockaddr *sa;
sa = ifaaddr != NULL ? ifaaddr :
(gateway != NULL ? gateway : dst);
if (sa != NULL && info->rti_ifp != NULL)
info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
else if (dst != NULL && gateway != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
fibnum);
else if (sa != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
fibnum);
}
if ((ifa = info->rti_ifa) != NULL) {
if (info->rti_ifp == NULL)
info->rti_ifp = ifa->ifa_ifp;
} else
error = ENETUNREACH;
return (error);
}
/*
* Expunges references to a route that's about to be reclaimed.
* The route must be locked.
*/
int
rtexpunge(struct rtentry *rt)
{
#if !defined(RADIX_MPATH)
struct radix_node *rn;
#else
struct rt_addrinfo info;
int fib;
struct rtentry *rt0;
#endif
struct radix_node_head *rnh;
struct ifaddr *ifa;
int error = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
RT_LOCK_ASSERT(rt);
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#ifdef RADIX_MPATH
fib = rt->rt_fibnum;
bzero(&info, sizeof(info));
info.rti_ifp = rt->rt_ifp;
info.rti_flags = RTF_RNH_LOCKED;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
RT_UNLOCK(rt);
error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
if (error == 0 && rt0 != NULL) {
rt = rt0;
RT_LOCK(rt);
} else if (error != 0) {
RT_LOCK(rt);
return (error);
}
#else
/*
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
*/
rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
if (rn == NULL) {
error = ESRCH;
goto bad;
}
KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
#endif /* RADIX_MPATH */
rt->rt_flags &= ~RTF_UP;
/*
* Give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
struct rt_addrinfo info;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
}
/*
* one more rtentry floating around that is not
* linked to the routing table.
*/
V_rttrash++;
#if !defined(RADIX_MPATH)
bad:
#endif
return (error);
}
#ifdef RADIX_MPATH
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
/*
* if we got multipath routes, we require users to specify
* a matching RTAX_GATEWAY.
*/
struct rtentry *rt, *rto = NULL;
register struct radix_node *rn;
int error = 0;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
return (ESRCH);
rto = rt = RNTORT(rn);
rt = rt_mpath_matchgate(rt, gateway);
if (rt == NULL)
return (ESRCH);
/*
* this is the first entry in the chain
*/
if (rto == rt) {
rn = rn_mpath_next((struct radix_node *)rt);
/*
* there is another entry, now it's active
*/
if (rn) {
rto = RNTORT(rn);
RT_LOCK(rto);
rto->rt_flags |= RTF_UP;
RT_UNLOCK(rto);
} else if (rt->rt_flags & RTF_GATEWAY) {
/*
* For gateway routes, we need to
* make sure that we we are deleting
* the correct gateway.
* rt_mpath_matchgate() does not
* check the case when there is only
* one route in the chain.
*/
if (gateway &&
(rt->rt_gateway->sa_len != gateway->sa_len ||
memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
error = ESRCH;
else {
/*
* remove from tree before returning it
* to the caller
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
goto gwdelete;
}
}
/*
* use the normal delete code to remove
* the first entry
*/
if (req != RTM_DELETE)
goto nondelete;
error = ENOENT;
goto done;
}
/*
* if the entry is 2nd and on up
*/
if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
panic ("rtrequest1: rt_mpath_deldup");
gwdelete:
RT_LOCK(rt);
RT_ADDREF(rt);
if (req == RTM_DELETE) {
rt->rt_flags &= ~RTF_UP;
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
}
nondelete:
if (req != RTM_DELETE)
panic("unrecognized request %d", req);
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
done:
return (error);
}
#endif
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
int error = 0, needlock = 0;
register struct rtentry *rt;
#ifdef FLOWTABLE
register struct rtentry *rt0;
#endif
register struct radix_node *rn;
register struct radix_node_head *rnh;
struct ifaddr *ifa;
struct sockaddr *ndst;
#define senderr(x) { error = x ; goto bad; }
KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
return (EAFNOSUPPORT);
needlock = ((flags & RTF_RNH_LOCKED) == 0);
flags &= ~RTF_RNH_LOCKED;
if (needlock)
RADIX_NODE_HEAD_LOCK(rnh);
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* If we are adding a host route then we don't want to put
* a netmask in the tree, nor do we want to clone it.
*/
if (flags & RTF_HOST)
netmask = NULL;
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
* "bad" holds true for the success case
* as well
*/
if (error != ENOENT)
goto bad;
error = 0;
}
#endif
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
if (rn == NULL)
senderr(ESRCH);
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtrequest delete");
rt = RNTORT(rn);
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
/*
* give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
break;
case RTM_RESOLVE:
/*
* resolve was only used for route cloning
* here for compat
*/
break;
case RTM_ADD:
if ((flags & RTF_GATEWAY) && !gateway)
senderr(EINVAL);
if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
(gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
senderr(EINVAL);
if (info->rti_ifa == NULL) {
error = rt_getifa_fib(info, fibnum);
if (error)
senderr(error);
} else
ifa_ref(info->rti_ifa);
ifa = info->rti_ifa;
rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
if (rt == NULL) {
if (ifa != NULL)
ifa_free(ifa);
senderr(ENOBUFS);
}
RT_LOCK_INIT(rt);
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = fibnum;
/*
* Add the gateway. Possibly re-malloc-ing the storage for it
*
*/
RT_LOCK(rt);
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
RT_LOCK_DESTROY(rt);
if (ifa != NULL)
ifa_free(ifa);
uma_zfree(V_rtzone, rt);
senderr(error);
}
/*
* point to the (possibly newly malloc'd) dest address.
*/
ndst = (struct sockaddr *)rt_key(rt);
/*
* make sure it contains the value we want (masked if needed).
*/
if (netmask) {
rt_maskedcopy(dst, ndst, netmask);
} else
bcopy(dst, ndst, dst->sa_len);
/*
* We use the ifa reference returned by rt_getifa_fib().
* This moved from below so that rnh->rnh_addaddr() can
* examine the ifa and ifa->ifa_ifp if it so desires.
*/
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
if (rn_mpath_capable(rnh) &&
rt_mpath_conflict(rnh, rt, netmask)) {
if (rt->rt_ifa) {
ifa_free(rt->rt_ifa);
}
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
senderr(EEXIST);
}
#endif
#ifdef FLOWTABLE
rt0 = NULL;
/* XXX
* "flow-table" only support IPv4 at the moment.
*/
#ifdef INET
if (dst->sa_family == AF_INET) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
struct sockaddr *mask;
u_char *m, *n;
int len;
/*
* compare mask to see if the new route is
* more specific than the existing one
*/
rt0 = RNTORT(rn);
RT_LOCK(rt0);
RT_ADDREF(rt0);
RT_UNLOCK(rt0);
/*
* A host route is already present, so
* leave the flow-table entries as is.
*/
if (rt0->rt_flags & RTF_HOST) {
RTFREE(rt0);
rt0 = NULL;
} else if (!(flags & RTF_HOST) && netmask) {
mask = rt_mask(rt0);
len = mask->sa_len;
m = (u_char *)mask;
n = (u_char *)netmask;
while (len-- > 0) {
if (*n != *m)
break;
n++;
m++;
}
if (len == 0 || (*n < *m)) {
RTFREE(rt0);
rt0 = NULL;
}
}
}
}
#endif
#endif
/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
/*
* If it still failed to go into the tree,
* then un-make it (this should be a function)
*/
if (rn == NULL) {
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
#ifdef FLOWTABLE
if (rt0 != NULL)
RTFREE(rt0);
#endif
senderr(EEXIST);
}
#ifdef FLOWTABLE
else if (rt0 != NULL) {
#ifdef INET
flowtable_route_flush(V_ip_ft, rt0);
#endif
RTFREE(rt0);
}
#endif
/*
* If this protocol has something to add to this then
* allow it to do that as well.
*/
if (ifa->ifa_rtrequest)
ifa->ifa_rtrequest(req, rt, info);
/*
* actually return a resultant rtentry and
* give the caller a single reference.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_ADDREF(rt);
}
RT_UNLOCK(rt);
break;
default:
error = EOPNOTSUPP;
}
bad:
if (needlock)
RADIX_NODE_HEAD_UNLOCK(rnh);
return (error);
#undef senderr
}
#undef dst
#undef gateway
#undef netmask
#undef ifaaddr
#undef ifpaddr
#undef flags
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
{
/* XXX dst may be overwritten, can we move this to below */
int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
#ifdef INVARIANTS
struct radix_node_head *rnh;
rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
#endif
RT_LOCK_ASSERT(rt);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* Prepare to store the gateway in rt->rt_gateway.
* Both dst and gateway are stored one after the other in the same
* malloc'd chunk. If we have room, we can reuse the old buffer,
* rt_gateway already points to the right place.
* Otherwise, malloc a new block and update the 'dst' address.
*/
if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
caddr_t new;
R_Malloc(new, caddr_t, dlen + glen);
if (new == NULL)
return ENOBUFS;
/*
* XXX note, we copy from *dst and not *rt_key(rt) because
* rt_setgate() can be called to initialize a newly
* allocated route entry, in which case rt_key(rt) == NULL
* (and also rt->rt_gateway == NULL).
* Free()/free() handle a NULL argument just fine.
*/
bcopy(dst, new, dlen);
Free(rt_key(rt)); /* free old block, if any */
rt_key(rt) = (struct sockaddr *)new;
rt->rt_gateway = (struct sockaddr *)(new + dlen);
}
/*
* Copy the new gateway value into the memory chunk.
*/
bcopy(gate, rt->rt_gateway, glen);
return (0);
}
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
register u_char *cp1 = (u_char *)src;
register u_char *cp2 = (u_char *)dst;
register u_char *cp3 = (u_char *)netmask;
u_char *cplim = cp2 + *cp3;
u_char *cplim2 = cp2 + *cp1;
*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
cp3 += 2;
if (cplim > cplim2)
cplim = cplim2;
while (cp2 < cplim)
*cp2++ = *cp1++ & *cp3++;
if (cp2 < cplim2)
bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
static inline int
rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
{
struct sockaddr *dst;
struct sockaddr *netmask;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
int error = 0;
int startfib, endfib;
char tempbuf[_SOCKADDR_TMPSIZE];
int didwork = 0;
int a_failure = 0;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
netmask = NULL;
} else {
dst = ifa->ifa_addr;
netmask = ifa->ifa_netmask;
}
if ( dst->sa_family != AF_INET)
fibnum = 0;
if (fibnum == -1) {
if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
startfib = endfib = curthread->td_proc->p_fibnum;
} else {
startfib = 0;
endfib = rt_numfibs - 1;
}
} else {
KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
startfib = fibnum;
endfib = fibnum;
}
if (dst->sa_len == 0)
return(EINVAL);
/*
* If it's a delete, check that if it exists,
* it's on the correct interface or we might scrub
* a route to another ifa which would
* be confusing at best and possibly worse.
*/
if (cmd == RTM_DELETE) {
/*
* It's a delete, so it should already exist..
* If it's a net, mask off the host bits
* (Assuming we have a mask)
* XXX this is kinda inet specific..
*/
if (netmask != NULL) {
rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
dst = (struct sockaddr *)tempbuf;
}
}
/*
* Now go through all the requested tables (fibs) and do the
* requested action. Realistically, this will either be fib 0
* for protocols that don't do multiple tables or all the
* tables for those that do. XXX For this version only AF_INET.
* When that changes code should be refactored to protocol
* independent parts and protocol dependent parts.
*/
for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
if (cmd == RTM_DELETE) {
struct radix_node_head *rnh;
struct radix_node *rn;
/*
* Look up an rtentry that is in the routing tree and
* contains the correct info.
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
/* this table doesn't exist but others might */
continue;
RADIX_NODE_HEAD_LOCK(rnh);
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
error = ESRCH;
else {
rt = RNTORT(rn);
/*
* for interface route the
* rt->rt_gateway is sockaddr_intf
* for cloning ARP entries, so
* rt_mpath_matchgate must use the
* interface address
*/
rt = rt_mpath_matchgate(rt,
ifa->ifa_addr);
if (!rt)
error = ESRCH;
}
}
else
#endif
rn = rnh->rnh_lookup(dst, netmask, rnh);
error = (rn == NULL ||
(rn->rn_flags & RNF_ROOT) ||
RNTORT(rn)->rt_ifa != ifa ||
!sa_equal((struct sockaddr *)rn->rn_key, dst));
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error) {
/* this is only an error if bad on ALL tables */
continue;
}
}
/*
* Do the actual request
*/
bzero((caddr_t)&info, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags | ifa->ifa_flags;
info.rti_info[RTAX_DST] = dst;
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD)
info.rti_info[RTAX_GATEWAY] =
(struct sockaddr *)&null_sdl;
else
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = netmask;
error = rtrequest1_fib(cmd, &info, &rt, fibnum);
if (error == 0 && rt != NULL) {
/*
* notify any listening routing agents of the change
*/
RT_LOCK(rt);
#ifdef RADIX_MPATH
/*
* in case address alias finds the first address
* e.g. ifconfig bge0 192.103.54.246/24
* e.g. ifconfig bge0 192.103.54.247/24
* the address set in the route is 192.103.54.246
* so we need to replace it with 192.103.54.247
*/
if (memcmp(rt->rt_ifa->ifa_addr,
ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
ifa_free(rt->rt_ifa);
ifa_ref(ifa);
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_ifa = ifa;
}
#endif
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD) {
((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
rt->rt_ifp->if_type;
((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
rt->rt_ifp->if_index;
}
RT_ADDREF(rt);
RT_UNLOCK(rt);
rt_newaddrmsg(cmd, ifa, error, rt);
RT_LOCK(rt);
RT_REMREF(rt);
if (cmd == RTM_DELETE) {
/*
* If we are deleting, and we found an entry,
* then it's been removed from the tree..
* now throw it away.
*/
RTFREE_LOCKED(rt);
} else {
if (cmd == RTM_ADD) {
/*
* We just wanted to add it..
* we don't actually need a reference.
*/
RT_REMREF(rt);
}
RT_UNLOCK(rt);
}
didwork = 1;
}
if (error)
a_failure = error;
}
if (cmd == RTM_DELETE) {
if (didwork) {
error = 0;
} else {
/* we only give an error if it wasn't in any table */
error = ((flags & RTF_HOST) ?
EHOSTUNREACH : ENETUNREACH);
}
} else {
if (a_failure) {
/* return an error if any of them failed */
error = a_failure;
}
}
return (error);
}
/* special one for inet internal use. may not use. */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
return (rtinit1(ifa, cmd, flags, -1));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
int
rtinit(struct ifaddr *ifa, int cmd, int flags)
{
struct sockaddr *dst;
int fib = 0;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
} else {
dst = ifa->ifa_addr;
}
if (dst->sa_family == AF_INET)
fib = -1;
return (rtinit1(ifa, cmd, flags, fib));
}
Index: stable/8/sys/net/rtsock.c
===================================================================
--- stable/8/sys/net/rtsock.c (revision 209276)
+++ stable/8/sys/net/rtsock.c (revision 209277)
@@ -1,1683 +1,1694 @@
/*-
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_sctp.h"
#include "opt_mpath.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/domain.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_llatbl.h>
+#include <net/if_types.h>
#include <net/netisr.h>
#include <net/raw_cb.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#ifdef INET6
#include <netinet6/scope6_var.h>
#endif
#if defined(INET) || defined(INET6)
#ifdef SCTP
extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
#endif /* SCTP */
#endif
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>
struct if_data32 {
uint8_t ifi_type;
uint8_t ifi_physical;
uint8_t ifi_addrlen;
uint8_t ifi_hdrlen;
uint8_t ifi_link_state;
uint8_t ifi_spare_char1;
uint8_t ifi_spare_char2;
uint8_t ifi_datalen;
uint32_t ifi_mtu;
uint32_t ifi_metric;
uint32_t ifi_baudrate;
uint32_t ifi_ipackets;
uint32_t ifi_ierrors;
uint32_t ifi_opackets;
uint32_t ifi_oerrors;
uint32_t ifi_collisions;
uint32_t ifi_ibytes;
uint32_t ifi_obytes;
uint32_t ifi_imcasts;
uint32_t ifi_omcasts;
uint32_t ifi_iqdrops;
uint32_t ifi_noproto;
uint32_t ifi_hwassist;
int32_t ifi_epoch;
struct timeval32 ifi_lastchange;
};
struct if_msghdr32 {
uint16_t ifm_msglen;
uint8_t ifm_version;
uint8_t ifm_type;
int32_t ifm_addrs;
int32_t ifm_flags;
uint16_t ifm_index;
struct if_data32 ifm_data;
};
#endif
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
/* NB: these are not modified */
static struct sockaddr route_src = { 2, PF_ROUTE, };
static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, };
static struct {
int ip_count; /* attached w/ AF_INET */
int ip6_count; /* attached w/ AF_INET6 */
int ipx_count; /* attached w/ AF_IPX */
int any_count; /* total attached */
} route_cb;
struct mtx rtsock_mtx;
MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
#define RTSOCK_LOCK() mtx_lock(&rtsock_mtx)
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
struct walkarg {
int w_tmemsize;
int w_op, w_arg;
caddr_t w_tmem;
struct sysctl_req *w_req;
};
static void rts_input(struct mbuf *m);
static struct mbuf *rt_msg1(int type, struct rt_addrinfo *rtinfo);
static int rt_msg2(int type, struct rt_addrinfo *rtinfo,
caddr_t cp, struct walkarg *w);
static int rt_xaddrs(caddr_t cp, caddr_t cplim,
struct rt_addrinfo *rtinfo);
static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *m, struct socket *so);
static void rt_setmetrics(u_long which, const struct rt_metrics *in,
struct rt_metrics_lite *out);
static void rt_getmetrics(const struct rt_metrics_lite *in,
struct rt_metrics *out);
static void rt_dispatch(struct mbuf *, const struct sockaddr *);
static struct netisr_handler rtsock_nh = {
.nh_name = "rtsock",
.nh_handler = rts_input,
.nh_proto = NETISR_ROUTE,
.nh_policy = NETISR_POLICY_SOURCE,
};
static int
sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
{
int error, qlimit;
netisr_getqlimit(&rtsock_nh, &qlimit);
error = sysctl_handle_int(oidp, &qlimit, 0, req);
if (error || !req->newptr)
return (error);
if (qlimit < 1)
return (EINVAL);
return (netisr_setqlimit(&rtsock_nh, qlimit));
}
SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_route_netisr_maxqlen, "I",
"maximum routing socket dispatch queue length");
static void
rts_init(void)
{
int tmp;
if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
rtsock_nh.nh_qlimit = tmp;
netisr_register(&rtsock_nh);
}
SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
static void
rts_input(struct mbuf *m)
{
struct sockproto route_proto;
unsigned short *family;
struct m_tag *tag;
route_proto.sp_family = PF_ROUTE;
tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
if (tag != NULL) {
family = (unsigned short *)(tag + 1);
route_proto.sp_protocol = *family;
m_tag_delete(m, tag);
} else
route_proto.sp_protocol = 0;
raw_input(m, &route_proto, &route_src);
}
/*
* It really doesn't make any sense at all for this code to share much
* with raw_usrreq.c, since its functionality is so restricted. XXX
*/
static void
rts_abort(struct socket *so)
{
raw_usrreqs.pru_abort(so);
}
static void
rts_close(struct socket *so)
{
raw_usrreqs.pru_close(so);
}
/* pru_accept is EOPNOTSUPP */
static int
rts_attach(struct socket *so, int proto, struct thread *td)
{
struct rawcb *rp;
int s, error;
KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
/* XXX */
rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
if (rp == NULL)
return ENOBUFS;
/*
* The splnet() is necessary to block protocols from sending
* error notifications (like RTM_REDIRECT or RTM_LOSING) while
* this PCB is extant but incompletely initialized.
* Probably we should try to do more of this work beforehand and
* eliminate the spl.
*/
s = splnet();
so->so_pcb = (caddr_t)rp;
so->so_fibnum = td->td_proc->p_fibnum;
error = raw_attach(so, proto);
rp = sotorawcb(so);
if (error) {
splx(s);
so->so_pcb = NULL;
free(rp, M_PCB);
return error;
}
RTSOCK_LOCK();
switch(rp->rcb_proto.sp_protocol) {
case AF_INET:
route_cb.ip_count++;
break;
case AF_INET6:
route_cb.ip6_count++;
break;
case AF_IPX:
route_cb.ipx_count++;
break;
}
route_cb.any_count++;
RTSOCK_UNLOCK();
soisconnected(so);
so->so_options |= SO_USELOOPBACK;
splx(s);
return 0;
}
static int
rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
}
static int
rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
}
/* pru_connect2 is EOPNOTSUPP */
/* pru_control is EOPNOTSUPP */
static void
rts_detach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
RTSOCK_LOCK();
switch(rp->rcb_proto.sp_protocol) {
case AF_INET:
route_cb.ip_count--;
break;
case AF_INET6:
route_cb.ip6_count--;
break;
case AF_IPX:
route_cb.ipx_count--;
break;
}
route_cb.any_count--;
RTSOCK_UNLOCK();
raw_usrreqs.pru_detach(so);
}
static int
rts_disconnect(struct socket *so)
{
return (raw_usrreqs.pru_disconnect(so));
}
/* pru_listen is EOPNOTSUPP */
static int
rts_peeraddr(struct socket *so, struct sockaddr **nam)
{
return (raw_usrreqs.pru_peeraddr(so, nam));
}
/* pru_rcvd is EOPNOTSUPP */
/* pru_rcvoob is EOPNOTSUPP */
static int
rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
}
/* pru_sense is null */
static int
rts_shutdown(struct socket *so)
{
return (raw_usrreqs.pru_shutdown(so));
}
static int
rts_sockaddr(struct socket *so, struct sockaddr **nam)
{
return (raw_usrreqs.pru_sockaddr(so, nam));
}
static struct pr_usrreqs route_usrreqs = {
.pru_abort = rts_abort,
.pru_attach = rts_attach,
.pru_bind = rts_bind,
.pru_connect = rts_connect,
.pru_detach = rts_detach,
.pru_disconnect = rts_disconnect,
.pru_peeraddr = rts_peeraddr,
.pru_send = rts_send,
.pru_shutdown = rts_shutdown,
.pru_sockaddr = rts_sockaddr,
.pru_close = rts_close,
};
#ifndef _SOCKADDR_UNION_DEFINED
#define _SOCKADDR_UNION_DEFINED
/*
* The union of all possible address formats we handle.
*/
union sockaddr_union {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
#endif /* _SOCKADDR_UNION_DEFINED */
static int
rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
{
/* First, see if the returned address is part of the jail. */
if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
return (0);
}
switch (info->rti_info[RTAX_DST]->sa_family) {
#ifdef INET
case AF_INET:
{
struct in_addr ia;
struct ifaddr *ifa;
int found;
found = 0;
/*
* Try to find an address on the given outgoing interface
* that belongs to the jail.
*/
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa;
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
ia = ((struct sockaddr_in *)sa)->sin_addr;
if (prison_check_ip4(cred, &ia) == 0) {
found = 1;
break;
}
}
IF_ADDR_UNLOCK(ifp);
if (!found) {
/*
* As a last resort return the 'default' jail address.
*/
ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
sin_addr;
if (prison_get_ip4(cred, &ia) != 0)
return (ESRCH);
}
bzero(&saun->sin, sizeof(struct sockaddr_in));
saun->sin.sin_len = sizeof(struct sockaddr_in);
saun->sin.sin_family = AF_INET;
saun->sin.sin_addr.s_addr = ia.s_addr;
info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
break;
}
#endif
#ifdef INET6
case AF_INET6:
{
struct in6_addr ia6;
struct ifaddr *ifa;
int found;
found = 0;
/*
* Try to find an address on the given outgoing interface
* that belongs to the jail.
*/
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa;
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET6)
continue;
bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
&ia6, sizeof(struct in6_addr));
if (prison_check_ip6(cred, &ia6) == 0) {
found = 1;
break;
}
}
IF_ADDR_UNLOCK(ifp);
if (!found) {
/*
* As a last resort return the 'default' jail address.
*/
ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)->
sin6_addr;
if (prison_get_ip6(cred, &ia6) != 0)
return (ESRCH);
}
bzero(&saun->sin6, sizeof(struct sockaddr_in6));
saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
saun->sin6.sin6_family = AF_INET6;
bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
if (sa6_recoverscope(&saun->sin6) != 0)
return (ESRCH);
info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
break;
}
#endif
default:
return (ESRCH);
}
return (0);
}
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so)
{
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
struct rt_msghdr *rtm = NULL;
struct rtentry *rt = NULL;
struct radix_node_head *rnh;
struct rt_addrinfo info;
int len, error = 0;
struct ifnet *ifp = NULL;
union sockaddr_union saun;
#define senderr(e) { error = e; goto flush;}
if (m == NULL || ((m->m_len < sizeof(long)) &&
(m = m_pullup(m, sizeof(long))) == NULL))
return (ENOBUFS);
if ((m->m_flags & M_PKTHDR) == 0)
panic("route_output");
len = m->m_pkthdr.len;
if (len < sizeof(*rtm) ||
len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
info.rti_info[RTAX_DST] = NULL;
senderr(EINVAL);
}
R_Malloc(rtm, struct rt_msghdr *, len);
if (rtm == NULL) {
info.rti_info[RTAX_DST] = NULL;
senderr(ENOBUFS);
}
m_copydata(m, 0, len, (caddr_t)rtm);
if (rtm->rtm_version != RTM_VERSION) {
info.rti_info[RTAX_DST] = NULL;
senderr(EPROTONOSUPPORT);
}
rtm->rtm_pid = curproc->p_pid;
bzero(&info, sizeof(info));
info.rti_addrs = rtm->rtm_addrs;
if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) {
info.rti_info[RTAX_DST] = NULL;
senderr(EINVAL);
}
info.rti_flags = rtm->rtm_flags;
if (info.rti_info[RTAX_DST] == NULL ||
info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
(info.rti_info[RTAX_GATEWAY] != NULL &&
info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
senderr(EINVAL);
/*
* Verify that the caller has the appropriate privilege; RTM_GET
* is the only operation the non-superuser is allowed.
*/
if (rtm->rtm_type != RTM_GET) {
error = priv_check(curthread, PRIV_NET_ROUTE);
if (error)
senderr(error);
}
/*
* The given gateway address may be an interface address.
* For example, issuing a "route change" command on a route
* entry that was created from a tunnel, and the gateway
* address given is the local end point. In this case the
* RTF_GATEWAY flag must be cleared or the destination will
* not be reachable even though there is no error message.
*/
if (info.rti_info[RTAX_GATEWAY] != NULL &&
info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
struct route gw_ro;
bzero(&gw_ro, sizeof(gw_ro));
gw_ro.ro_dst = *info.rti_info[RTAX_GATEWAY];
rtalloc_ign_fib(&gw_ro, 0, so->so_fibnum);
/*
* A host route through the loopback interface is
* installed for each interface adddress. In pre 8.0
* releases the interface address of a PPP link type
* is not reachable locally. This behavior is fixed as
* part of the new L2/L3 redesign and rewrite work. The
* signature of this interface address route is the
* AF_LINK sa_family type of the rt_gateway, and the
* rt_ifp has the IFF_LOOPBACK flag set.
*/
if (gw_ro.ro_rt != NULL &&
gw_ro.ro_rt->rt_gateway->sa_family == AF_LINK &&
gw_ro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)
info.rti_flags &= ~RTF_GATEWAY;
if (gw_ro.ro_rt != NULL)
RTFREE(gw_ro.ro_rt);
}
switch (rtm->rtm_type) {
struct rtentry *saved_nrt;
case RTM_ADD:
if (info.rti_info[RTAX_GATEWAY] == NULL)
senderr(EINVAL);
saved_nrt = NULL;
/* support for new ARP code */
if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
(rtm->rtm_flags & RTF_LLDATA) != 0) {
error = lla_rt_output(rtm, &info);
break;
}
error = rtrequest1_fib(RTM_ADD, &info, &saved_nrt,
so->so_fibnum);
if (error == 0 && saved_nrt) {
RT_LOCK(saved_nrt);
rt_setmetrics(rtm->rtm_inits,
&rtm->rtm_rmx, &saved_nrt->rt_rmx);
rtm->rtm_index = saved_nrt->rt_ifp->if_index;
RT_REMREF(saved_nrt);
RT_UNLOCK(saved_nrt);
}
break;
case RTM_DELETE:
saved_nrt = NULL;
/* support for new ARP code */
if (info.rti_info[RTAX_GATEWAY] &&
(info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) &&
(rtm->rtm_flags & RTF_LLDATA) != 0) {
error = lla_rt_output(rtm, &info);
break;
}
error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt,
so->so_fibnum);
if (error == 0) {
RT_LOCK(saved_nrt);
rt = saved_nrt;
goto report;
}
break;
case RTM_GET:
case RTM_CHANGE:
case RTM_LOCK:
rnh = rt_tables_get_rnh(so->so_fibnum,
info.rti_info[RTAX_DST]->sa_family);
if (rnh == NULL)
senderr(EAFNOSUPPORT);
RADIX_NODE_HEAD_RLOCK(rnh);
rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST],
info.rti_info[RTAX_NETMASK], rnh);
if (rt == NULL) { /* XXX looks bogus */
RADIX_NODE_HEAD_RUNLOCK(rnh);
senderr(ESRCH);
}
#ifdef RADIX_MPATH
/*
* for RTM_CHANGE/LOCK, if we got multipath routes,
* we require users to specify a matching RTAX_GATEWAY.
*
* for RTM_GET, gate is optional even with multipath.
* if gate == NULL the first match is returned.
* (no need to call rt_mpath_matchgate if gate == NULL)
*/
if (rn_mpath_capable(rnh) &&
(rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
if (!rt) {
RADIX_NODE_HEAD_RUNLOCK(rnh);
senderr(ESRCH);
}
}
#endif
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
* another search to retrieve the prefix route of
* the local end point of the PPP link.
*/
- if ((rtm->rtm_flags & RTF_ANNOUNCE) &&
- (rt->rt_ifp->if_flags & IFF_POINTOPOINT)) {
+ if (rtm->rtm_flags & RTF_ANNOUNCE) {
struct sockaddr laddr;
- rt_maskedcopy(rt->rt_ifa->ifa_addr,
- &laddr,
- rt->rt_ifa->ifa_netmask);
+
+ if (rt->rt_ifp != NULL &&
+ rt->rt_ifp->if_type == IFT_PROPVIRTUAL) {
+ struct ifaddr *ifa;
+
+ ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1);
+ if (ifa != NULL)
+ rt_maskedcopy(ifa->ifa_addr,
+ &laddr,
+ ifa->ifa_netmask);
+ } else
+ rt_maskedcopy(rt->rt_ifa->ifa_addr,
+ &laddr,
+ rt->rt_ifa->ifa_netmask);
/*
* refactor rt and no lock operation necessary
*/
rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh);
if (rt == NULL) {
RADIX_NODE_HEAD_RUNLOCK(rnh);
senderr(ESRCH);
}
}
RT_LOCK(rt);
RT_ADDREF(rt);
RADIX_NODE_HEAD_RUNLOCK(rnh);
/*
* Fix for PR: 82974
*
* RTM_CHANGE/LOCK need a perfect match, rn_lookup()
* returns a perfect match in case a netmask is
* specified. For host routes only a longest prefix
* match is returned so it is necessary to compare the
* existence of the netmask. If both have a netmask
* rnh_lookup() did a perfect match and if none of them
* have a netmask both are host routes which is also a
* perfect match.
*/
if (rtm->rtm_type != RTM_GET &&
(!rt_mask(rt) != !info.rti_info[RTAX_NETMASK])) {
RT_UNLOCK(rt);
senderr(ESRCH);
}
switch(rtm->rtm_type) {
case RTM_GET:
report:
RT_LOCK_ASSERT(rt);
if ((rt->rt_flags & RTF_HOST) == 0
? jailed_without_vnet(curthread->td_ucred)
: prison_if(curthread->td_ucred,
rt_key(rt)) != 0) {
RT_UNLOCK(rt);
senderr(ESRCH);
}
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_GENMASK] = 0;
if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
ifp = rt->rt_ifp;
if (ifp) {
info.rti_info[RTAX_IFP] =
ifp->if_addr->ifa_addr;
error = rtm_get_jailed(&info, ifp, rt,
&saun, curthread->td_ucred);
if (error != 0) {
RT_UNLOCK(rt);
senderr(error);
}
if (ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] =
rt->rt_ifa->ifa_dstaddr;
rtm->rtm_index = ifp->if_index;
} else {
info.rti_info[RTAX_IFP] = NULL;
info.rti_info[RTAX_IFA] = NULL;
}
} else if ((ifp = rt->rt_ifp) != NULL) {
rtm->rtm_index = ifp->if_index;
}
len = rt_msg2(rtm->rtm_type, &info, NULL, NULL);
if (len > rtm->rtm_msglen) {
struct rt_msghdr *new_rtm;
R_Malloc(new_rtm, struct rt_msghdr *, len);
if (new_rtm == NULL) {
RT_UNLOCK(rt);
senderr(ENOBUFS);
}
bcopy(rtm, new_rtm, rtm->rtm_msglen);
Free(rtm); rtm = new_rtm;
}
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL);
rtm->rtm_flags = rt->rt_flags;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
case RTM_CHANGE:
/*
* New gateway could require new ifaddr, ifp;
* flags may also be different; ifp may be specified
* by ll sockaddr when protocol address is ambiguous
*/
if (((rt->rt_flags & RTF_GATEWAY) &&
info.rti_info[RTAX_GATEWAY] != NULL) ||
info.rti_info[RTAX_IFP] != NULL ||
(info.rti_info[RTAX_IFA] != NULL &&
!sa_equal(info.rti_info[RTAX_IFA],
rt->rt_ifa->ifa_addr))) {
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
error = rt_getifa_fib(&info, rt->rt_fibnum);
/*
* XXXRW: Really we should release this
* reference later, but this maintains
* historical behavior.
*/
if (info.rti_ifa != NULL)
ifa_free(info.rti_ifa);
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error != 0)
senderr(error);
RT_LOCK(rt);
}
if (info.rti_ifa != NULL &&
info.rti_ifa != rt->rt_ifa &&
rt->rt_ifa != NULL &&
rt->rt_ifa->ifa_rtrequest != NULL) {
rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt,
&info);
ifa_free(rt->rt_ifa);
}
if (info.rti_info[RTAX_GATEWAY] != NULL) {
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
error = rt_setgate(rt, rt_key(rt),
info.rti_info[RTAX_GATEWAY]);
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error != 0) {
RT_UNLOCK(rt);
senderr(error);
}
rt->rt_flags |= (RTF_GATEWAY & info.rti_flags);
}
if (info.rti_ifa != NULL &&
info.rti_ifa != rt->rt_ifa) {
ifa_ref(info.rti_ifa);
rt->rt_ifa = info.rti_ifa;
rt->rt_ifp = info.rti_ifp;
}
/* Allow some flags to be toggled on change. */
rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) |
(rtm->rtm_flags & RTF_FMASK);
rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
&rt->rt_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, &info);
/* FALLTHROUGH */
case RTM_LOCK:
/* We don't support locks anymore */
break;
}
RT_UNLOCK(rt);
break;
default:
senderr(EOPNOTSUPP);
}
flush:
if (rtm) {
if (error)
rtm->rtm_errno = error;
else
rtm->rtm_flags |= RTF_DONE;
}
if (rt) /* XXX can this be true? */
RTFREE(rt);
{
struct rawcb *rp = NULL;
/*
* Check to see if we don't want our own messages.
*/
if ((so->so_options & SO_USELOOPBACK) == 0) {
if (route_cb.any_count <= 1) {
if (rtm)
Free(rtm);
m_freem(m);
return (error);
}
/* There is another listener, so construct message */
rp = sotorawcb(so);
}
if (rtm) {
m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
if (m->m_pkthdr.len < rtm->rtm_msglen) {
m_freem(m);
m = NULL;
} else if (m->m_pkthdr.len > rtm->rtm_msglen)
m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
Free(rtm);
}
if (m) {
if (rp) {
/*
* XXX insure we don't get a copy by
* invalidating our protocol
*/
unsigned short family = rp->rcb_proto.sp_family;
rp->rcb_proto.sp_family = 0;
rt_dispatch(m, info.rti_info[RTAX_DST]);
rp->rcb_proto.sp_family = family;
} else
rt_dispatch(m, info.rti_info[RTAX_DST]);
}
}
return (error);
#undef sa_equal
}
static void
rt_setmetrics(u_long which, const struct rt_metrics *in,
struct rt_metrics_lite *out)
{
#define metric(f, e) if (which & (f)) out->e = in->e;
/*
* Only these are stored in the routing entry since introduction
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
metric(RTV_WEIGHT, rmx_weight);
/* Userland -> kernel timebase conversion. */
if (which & RTV_EXPIRE)
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_second + time_uptime : 0;
#undef metric
}
static void
rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out)
{
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
metric(rmx_weight);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_uptime + time_second : 0;
#undef metric
}
/*
* Extract the addresses of the passed sockaddrs.
* Do a little sanity checking so as to avoid bad memory references.
* This data is derived straight from userland.
*/
static int
rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
{
struct sockaddr *sa;
int i;
for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
if ((rtinfo->rti_addrs & (1 << i)) == 0)
continue;
sa = (struct sockaddr *)cp;
/*
* It won't fit.
*/
if (cp + sa->sa_len > cplim)
return (EINVAL);
/*
* there are no more.. quit now
* If there are more bits, they are in error.
* I've seen this. route(1) can evidently generate these.
* This causes kernel to core dump.
* for compatibility, If we see this, point to a safe address.
*/
if (sa->sa_len == 0) {
rtinfo->rti_info[i] = &sa_zero;
return (0); /* should be EINVAL but for compat */
}
/* accept it */
rtinfo->rti_info[i] = sa;
cp += SA_SIZE(sa);
}
return (0);
}
static struct mbuf *
rt_msg1(int type, struct rt_addrinfo *rtinfo)
{
struct rt_msghdr *rtm;
struct mbuf *m;
int i;
struct sockaddr *sa;
int len, dlen;
switch (type) {
case RTM_DELADDR:
case RTM_NEWADDR:
len = sizeof(struct ifa_msghdr);
break;
case RTM_DELMADDR:
case RTM_NEWMADDR:
len = sizeof(struct ifma_msghdr);
break;
case RTM_IFINFO:
len = sizeof(struct if_msghdr);
break;
case RTM_IFANNOUNCE:
case RTM_IEEE80211:
len = sizeof(struct if_announcemsghdr);
break;
default:
len = sizeof(struct rt_msghdr);
}
if (len > MCLBYTES)
panic("rt_msg1");
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m && len > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return (m);
m->m_pkthdr.len = m->m_len = len;
m->m_pkthdr.rcvif = NULL;
rtm = mtod(m, struct rt_msghdr *);
bzero((caddr_t)rtm, len);
for (i = 0; i < RTAX_MAX; i++) {
if ((sa = rtinfo->rti_info[i]) == NULL)
continue;
rtinfo->rti_addrs |= (1 << i);
dlen = SA_SIZE(sa);
m_copyback(m, len, dlen, (caddr_t)sa);
len += dlen;
}
if (m->m_pkthdr.len != len) {
m_freem(m);
return (NULL);
}
rtm->rtm_msglen = len;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
return (m);
}
static int
rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w)
{
int i;
int len, dlen, second_time = 0;
caddr_t cp0;
rtinfo->rti_addrs = 0;
again:
switch (type) {
case RTM_DELADDR:
case RTM_NEWADDR:
len = sizeof(struct ifa_msghdr);
break;
case RTM_IFINFO:
#ifdef COMPAT_FREEBSD32
if (w != NULL && w->w_req->flags & SCTL_MASK32) {
len = sizeof(struct if_msghdr32);
break;
}
#endif
len = sizeof(struct if_msghdr);
break;
case RTM_NEWMADDR:
len = sizeof(struct ifma_msghdr);
break;
default:
len = sizeof(struct rt_msghdr);
}
cp0 = cp;
if (cp0)
cp += len;
for (i = 0; i < RTAX_MAX; i++) {
struct sockaddr *sa;
if ((sa = rtinfo->rti_info[i]) == NULL)
continue;
rtinfo->rti_addrs |= (1 << i);
dlen = SA_SIZE(sa);
if (cp) {
bcopy((caddr_t)sa, cp, (unsigned)dlen);
cp += dlen;
}
len += dlen;
}
len = ALIGN(len);
if (cp == NULL && w != NULL && !second_time) {
struct walkarg *rw = w;
if (rw->w_req) {
if (rw->w_tmemsize < len) {
if (rw->w_tmem)
free(rw->w_tmem, M_RTABLE);
rw->w_tmem = (caddr_t)
malloc(len, M_RTABLE, M_NOWAIT);
if (rw->w_tmem)
rw->w_tmemsize = len;
}
if (rw->w_tmem) {
cp = rw->w_tmem;
second_time = 1;
goto again;
}
}
}
if (cp) {
struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
rtm->rtm_msglen = len;
}
return (len);
}
/*
* This routine is called to generate a message from the routing
* socket indicating that a redirect has occured, a routing lookup
* has failed, or that a protocol has detected timeouts to a particular
* destination.
*/
void
rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
{
struct rt_msghdr *rtm;
struct mbuf *m;
struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
if (route_cb.any_count == 0)
return;
m = rt_msg1(type, rtinfo);
if (m == NULL)
return;
rtm = mtod(m, struct rt_msghdr *);
rtm->rtm_flags = RTF_DONE | flags;
rtm->rtm_errno = error;
rtm->rtm_addrs = rtinfo->rti_addrs;
rt_dispatch(m, sa);
}
/*
* This routine is called to generate a message from the routing
* socket indicating that the status of a network interface has changed.
*/
void
rt_ifmsg(struct ifnet *ifp)
{
struct if_msghdr *ifm;
struct mbuf *m;
struct rt_addrinfo info;
if (route_cb.any_count == 0)
return;
bzero((caddr_t)&info, sizeof(info));
m = rt_msg1(RTM_IFINFO, &info);
if (m == NULL)
return;
ifm = mtod(m, struct if_msghdr *);
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
ifm->ifm_data = ifp->if_data;
ifm->ifm_addrs = 0;
rt_dispatch(m, NULL);
}
/*
* This is called to generate messages from the routing socket
* indicating a network interface has had addresses associated with it.
* if we ever reverse the logic and replace messages TO the routing
* socket indicate a request to configure interfaces, then it will
* be unnecessary as the routing socket will automatically generate
* copies of it.
*/
void
rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
{
struct rt_addrinfo info;
struct sockaddr *sa = NULL;
int pass;
struct mbuf *m = NULL;
struct ifnet *ifp = ifa->ifa_ifp;
KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
("unexpected cmd %u", cmd));
#if defined(INET) || defined(INET6)
#ifdef SCTP
/*
* notify the SCTP stack
* this will only get called when an address is added/deleted
* XXX pass the ifaddr struct instead if ifa->ifa_addr...
*/
sctp_addr_change(ifa, cmd);
#endif /* SCTP */
#endif
if (route_cb.any_count == 0)
return;
for (pass = 1; pass < 3; pass++) {
bzero((caddr_t)&info, sizeof(info));
if ((cmd == RTM_ADD && pass == 1) ||
(cmd == RTM_DELETE && pass == 2)) {
struct ifa_msghdr *ifam;
int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
if ((m = rt_msg1(ncmd, &info)) == NULL)
continue;
ifam = mtod(m, struct ifa_msghdr *);
ifam->ifam_index = ifp->if_index;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_addrs = info.rti_addrs;
}
if ((cmd == RTM_ADD && pass == 2) ||
(cmd == RTM_DELETE && pass == 1)) {
struct rt_msghdr *rtm;
if (rt == NULL)
continue;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_DST] = sa = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
if ((m = rt_msg1(cmd, &info)) == NULL)
continue;
rtm = mtod(m, struct rt_msghdr *);
rtm->rtm_index = ifp->if_index;
rtm->rtm_flags |= rt->rt_flags;
rtm->rtm_errno = error;
rtm->rtm_addrs = info.rti_addrs;
}
rt_dispatch(m, sa);
}
}
/*
* This is the analogue to the rt_newaddrmsg which performs the same
* function but for multicast group memberhips. This is easier since
* there is no route state to worry about.
*/
void
rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
{
struct rt_addrinfo info;
struct mbuf *m = NULL;
struct ifnet *ifp = ifma->ifma_ifp;
struct ifma_msghdr *ifmam;
if (route_cb.any_count == 0)
return;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_IFA] = ifma->ifma_addr;
info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
/*
* If a link-layer address is present, present it as a ``gateway''
* (similarly to how ARP entries, e.g., are presented).
*/
info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
m = rt_msg1(cmd, &info);
if (m == NULL)
return;
ifmam = mtod(m, struct ifma_msghdr *);
KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
__func__));
ifmam->ifmam_index = ifp->if_index;
ifmam->ifmam_addrs = info.rti_addrs;
rt_dispatch(m, ifma->ifma_addr);
}
static struct mbuf *
rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
struct rt_addrinfo *info)
{
struct if_announcemsghdr *ifan;
struct mbuf *m;
if (route_cb.any_count == 0)
return NULL;
bzero((caddr_t)info, sizeof(*info));
m = rt_msg1(type, info);
if (m != NULL) {
ifan = mtod(m, struct if_announcemsghdr *);
ifan->ifan_index = ifp->if_index;
strlcpy(ifan->ifan_name, ifp->if_xname,
sizeof(ifan->ifan_name));
ifan->ifan_what = what;
}
return m;
}
/*
* This is called to generate routing socket messages indicating
* IEEE80211 wireless events.
* XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
*/
void
rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
{
struct mbuf *m;
struct rt_addrinfo info;
m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
if (m != NULL) {
/*
* Append the ieee80211 data. Try to stick it in the
* mbuf containing the ifannounce msg; otherwise allocate
* a new mbuf and append.
*
* NB: we assume m is a single mbuf.
*/
if (data_len > M_TRAILINGSPACE(m)) {
struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
if (n == NULL) {
m_freem(m);
return;
}
bcopy(data, mtod(n, void *), data_len);
n->m_len = data_len;
m->m_next = n;
} else if (data_len > 0) {
bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
m->m_len += data_len;
}
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len += data_len;
mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
rt_dispatch(m, NULL);
}
}
/*
* This is called to generate routing socket messages indicating
* network interface arrival and departure.
*/
void
rt_ifannouncemsg(struct ifnet *ifp, int what)
{
struct mbuf *m;
struct rt_addrinfo info;
m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
if (m != NULL)
rt_dispatch(m, NULL);
}
static void
rt_dispatch(struct mbuf *m, const struct sockaddr *sa)
{
struct m_tag *tag;
/*
* Preserve the family from the sockaddr, if any, in an m_tag for
* use when injecting the mbuf into the routing socket buffer from
* the netisr.
*/
if (sa != NULL) {
tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
M_NOWAIT);
if (tag == NULL) {
m_freem(m);
return;
}
*(unsigned short *)(tag + 1) = sa->sa_family;
m_tag_prepend(m, tag);
}
#ifdef VIMAGE
if (V_loif)
m->m_pkthdr.rcvif = V_loif;
else {
m_freem(m);
return;
}
#endif
netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */
}
/*
* This is used in dumping the kernel table via sysctl().
*/
static int
sysctl_dumpentry(struct radix_node *rn, void *vw)
{
struct walkarg *w = vw;
struct rtentry *rt = (struct rtentry *)rn;
int error = 0, size;
struct rt_addrinfo info;
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
if ((rt->rt_flags & RTF_HOST) == 0
? jailed_without_vnet(w->w_req->td->td_ucred)
: prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
return (0);
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_GENMASK] = 0;
if (rt->rt_ifp) {
info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
}
size = rt_msg2(RTM_GET, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
/*
* let's be honest about this being a retarded hack
*/
rtm->rtm_fmask = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
return (error);
}
return (error);
}
#ifdef COMPAT_FREEBSD32
static void
copy_ifdata32(struct if_data *src, struct if_data32 *dst)
{
bzero(dst, sizeof(*dst));
CP(*src, *dst, ifi_type);
CP(*src, *dst, ifi_physical);
CP(*src, *dst, ifi_addrlen);
CP(*src, *dst, ifi_hdrlen);
CP(*src, *dst, ifi_link_state);
CP(*src, *dst, ifi_datalen);
CP(*src, *dst, ifi_mtu);
CP(*src, *dst, ifi_metric);
CP(*src, *dst, ifi_baudrate);
CP(*src, *dst, ifi_ipackets);
CP(*src, *dst, ifi_ierrors);
CP(*src, *dst, ifi_opackets);
CP(*src, *dst, ifi_oerrors);
CP(*src, *dst, ifi_collisions);
CP(*src, *dst, ifi_ibytes);
CP(*src, *dst, ifi_obytes);
CP(*src, *dst, ifi_imcasts);
CP(*src, *dst, ifi_omcasts);
CP(*src, *dst, ifi_iqdrops);
CP(*src, *dst, ifi_noproto);
CP(*src, *dst, ifi_hwassist);
CP(*src, *dst, ifi_epoch);
TV_CP(*src, *dst, ifi_lastchange);
}
#endif
static int
sysctl_iflist(int af, struct walkarg *w)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct rt_addrinfo info;
int len, error = 0;
bzero((caddr_t)&info, sizeof(info));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
ifa = ifp->if_addr;
info.rti_info[RTAX_IFP] = ifa->ifa_addr;
len = rt_msg2(RTM_IFINFO, &info, NULL, w);
info.rti_info[RTAX_IFP] = NULL;
if (w->w_req && w->w_tmem) {
struct if_msghdr *ifm;
#ifdef COMPAT_FREEBSD32
if (w->w_req->flags & SCTL_MASK32) {
struct if_msghdr32 *ifm32;
ifm32 = (struct if_msghdr32 *)w->w_tmem;
ifm32->ifm_index = ifp->if_index;
ifm32->ifm_flags = ifp->if_flags |
ifp->if_drv_flags;
copy_ifdata32(&ifp->if_data, &ifm32->ifm_data);
ifm32->ifm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)ifm32,
len);
goto sysctl_out;
}
#endif
ifm = (struct if_msghdr *)w->w_tmem;
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
ifm->ifm_data = ifp->if_data;
ifm->ifm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)ifm, len);
#ifdef COMPAT_FREEBSD32
sysctl_out:
#endif
if (error)
goto done;
}
while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
if (af && af != ifa->ifa_addr->sa_family)
continue;
if (prison_if(w->w_req->td->td_ucred,
ifa->ifa_addr) != 0)
continue;
info.rti_info[RTAX_IFA] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
len = rt_msg2(RTM_NEWADDR, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct ifa_msghdr *ifam;
ifam = (struct ifa_msghdr *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
if (error)
goto done;
}
}
info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
info.rti_info[RTAX_BRD] = NULL;
}
done:
IFNET_RUNLOCK();
return (error);
}
static int
sysctl_ifmalist(int af, struct walkarg *w)
{
struct ifnet *ifp;
struct ifmultiaddr *ifma;
struct rt_addrinfo info;
int len, error = 0;
struct ifaddr *ifa;
bzero((caddr_t)&info, sizeof(info));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
ifa = ifp->if_addr;
info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (af && af != ifma->ifma_addr->sa_family)
continue;
if (prison_if(w->w_req->td->td_ucred,
ifma->ifma_addr) != 0)
continue;
info.rti_info[RTAX_IFA] = ifma->ifma_addr;
info.rti_info[RTAX_GATEWAY] =
(ifma->ifma_addr->sa_family != AF_LINK) ?
ifma->ifma_lladdr : NULL;
len = rt_msg2(RTM_NEWMADDR, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct ifma_msghdr *ifmam;
ifmam = (struct ifma_msghdr *)w->w_tmem;
ifmam->ifmam_index = ifma->ifma_ifp->if_index;
ifmam->ifmam_flags = 0;
ifmam->ifmam_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
if (error) {
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
}
IF_ADDR_UNLOCK(ifp);
}
done:
IFNET_RUNLOCK();
return (error);
}
static int
sysctl_rtsock(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1;
u_int namelen = arg2;
struct radix_node_head *rnh = NULL; /* silence compiler. */
int i, lim, error = EINVAL;
u_char af;
struct walkarg w;
name ++;
namelen--;
if (req->newptr)
return (EPERM);
if (namelen != 3)
return ((namelen < 3) ? EISDIR : ENOTDIR);
af = name[0];
if (af > AF_MAX)
return (EINVAL);
bzero(&w, sizeof(w));
w.w_op = name[1];
w.w_arg = name[2];
w.w_req = req;
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
switch (w.w_op) {
case NET_RT_DUMP:
case NET_RT_FLAGS:
if (af == 0) { /* dump all tables */
i = 1;
lim = AF_MAX;
} else /* dump only one table */
i = lim = af;
/*
* take care of llinfo entries, the caller must
* specify an AF
*/
if (w.w_op == NET_RT_FLAGS &&
(w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
if (af != 0)
error = lltable_sysctl_dumparp(af, w.w_req);
else
error = EINVAL;
break;
}
/*
* take care of routing entries
*/
for (error = 0; error == 0 && i <= lim; i++) {
rnh = rt_tables_get_rnh(req->td->td_proc->p_fibnum, i);
if (rnh != NULL) {
RADIX_NODE_HEAD_LOCK(rnh);
error = rnh->rnh_walktree(rnh,
sysctl_dumpentry, &w);
RADIX_NODE_HEAD_UNLOCK(rnh);
} else if (af != 0)
error = EAFNOSUPPORT;
}
break;
case NET_RT_IFLIST:
error = sysctl_iflist(af, &w);
break;
case NET_RT_IFMALIST:
error = sysctl_ifmalist(af, &w);
break;
}
if (w.w_tmem)
free(w.w_tmem, M_RTABLE);
return (error);
}
SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
/*
* Definitions of protocols supported in the ROUTE domain.
*/
static struct domain routedomain; /* or at least forward */
static struct protosw routesw[] = {
{
.pr_type = SOCK_RAW,
.pr_domain = &routedomain,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_output = route_output,
.pr_ctlinput = raw_ctlinput,
.pr_init = raw_init,
.pr_usrreqs = &route_usrreqs
}
};
static struct domain routedomain = {
.dom_family = PF_ROUTE,
.dom_name = "route",
.dom_protosw = routesw,
.dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])]
};
VNET_DOMAIN_SET(route);
Index: stable/8/sys/netinet/in.c
===================================================================
--- stable/8/sys/netinet/in.c (revision 209276)
+++ stable/8/sys/netinet/in.c (revision 209277)
@@ -1,1585 +1,1586 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (C) 2001 WIDE Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.4 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_carp.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/socket.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/igmp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
static int in_mask2len(struct in_addr *);
static void in_len2mask(struct in_addr *, int);
static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t,
struct ifnet *, struct thread *);
static int in_addprefix(struct in_ifaddr *, int);
static int in_scrubprefix(struct in_ifaddr *);
static void in_socktrim(struct sockaddr_in *);
static int in_ifinit(struct ifnet *,
struct in_ifaddr *, struct sockaddr_in *, int);
static void in_purgemaddrs(struct ifnet *);
static VNET_DEFINE(int, subnetsarelocal);
#define V_subnetsarelocal VNET(subnetsarelocal)
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
&VNET_NAME(subnetsarelocal), 0,
"Treat all subnets as directly connected");
static VNET_DEFINE(int, sameprefixcarponly);
#define V_sameprefixcarponly VNET(sameprefixcarponly)
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW,
&VNET_NAME(sameprefixcarponly), 0,
"Refuse to create same prefixes on different interfaces");
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
#define V_ripcbinfo VNET(ripcbinfo)
/*
* Return 1 if an internet address is for a ``local'' host
* (one to which we have a connection). If subnetsarelocal
* is true, this includes other subnets of the local net.
* Otherwise, it includes only the directly-connected (sub)nets.
*/
int
in_localaddr(struct in_addr in)
{
register u_long i = ntohl(in.s_addr);
register struct in_ifaddr *ia;
IN_IFADDR_RLOCK();
if (V_subnetsarelocal) {
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((i & ia->ia_netmask) == ia->ia_net) {
IN_IFADDR_RUNLOCK();
return (1);
}
}
} else {
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
IN_IFADDR_RUNLOCK();
return (1);
}
}
}
IN_IFADDR_RUNLOCK();
return (0);
}
/*
* Return 1 if an internet address is for the local host and configured
* on one of its interfaces.
*/
int
in_localip(struct in_addr in)
{
struct in_ifaddr *ia;
IN_IFADDR_RLOCK();
LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) {
IN_IFADDR_RUNLOCK();
return (1);
}
}
IN_IFADDR_RUNLOCK();
return (0);
}
/*
* Determine whether an IP address is in a reserved set of addresses
* that may not be forwarded, or whether datagrams to that destination
* may be forwarded.
*/
int
in_canforward(struct in_addr in)
{
register u_long i = ntohl(in.s_addr);
register u_long net;
if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i))
return (0);
if (IN_CLASSA(i)) {
net = i & IN_CLASSA_NET;
if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
return (0);
}
return (1);
}
/*
* Trim a mask in a sockaddr
*/
static void
in_socktrim(struct sockaddr_in *ap)
{
register char *cplim = (char *) &ap->sin_addr;
register char *cp = (char *) (&ap->sin_addr + 1);
ap->sin_len = 0;
while (--cp >= cplim)
if (*cp) {
(ap)->sin_len = cp - (char *) (ap) + 1;
break;
}
}
static int
in_mask2len(mask)
struct in_addr *mask;
{
int x, y;
u_char *p;
p = (u_char *)mask;
for (x = 0; x < sizeof(*mask); x++) {
if (p[x] != 0xff)
break;
}
y = 0;
if (x < sizeof(*mask)) {
for (y = 0; y < 8; y++) {
if ((p[x] & (0x80 >> y)) == 0)
break;
}
}
return (x * 8 + y);
}
static void
in_len2mask(struct in_addr *mask, int len)
{
int i;
u_char *p;
p = (u_char *)mask;
bzero(mask, sizeof(*mask));
for (i = 0; i < len / 8; i++)
p[i] = 0xff;
if (len % 8)
p[i] = (0xff00 >> (len % 8)) & 0xff;
}
/*
* Generic internet control operations (ioctl's).
*
* ifp is NULL if not an interface-specific ioctl.
*/
/* ARGSUSED */
int
in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
struct thread *td)
{
register struct ifreq *ifr = (struct ifreq *)data;
register struct in_ifaddr *ia, *iap;
register struct ifaddr *ifa;
struct in_addr allhosts_addr;
struct in_addr dst;
struct in_ifinfo *ii;
struct in_aliasreq *ifra = (struct in_aliasreq *)data;
struct sockaddr_in oldaddr;
int error, hostIsNew, iaIsNew, maskIsNew;
int iaIsFirst;
ia = NULL;
iaIsFirst = 0;
iaIsNew = 0;
allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
/*
* Filter out ioctls we implement directly; forward the rest on to
* in_lifaddr_ioctl() and ifp->if_ioctl().
*/
switch (cmd) {
case SIOCAIFADDR:
case SIOCDIFADDR:
case SIOCGIFADDR:
case SIOCGIFBRDADDR:
case SIOCGIFDSTADDR:
case SIOCGIFNETMASK:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
case SIOCSIFNETMASK:
break;
case SIOCALIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
if (ifp == NULL)
return (EINVAL);
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCDLIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_DELIFADDR);
if (error)
return (error);
}
if (ifp == NULL)
return (EINVAL);
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCGLIFADDR:
if (ifp == NULL)
return (EINVAL);
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
default:
if (ifp == NULL || ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
return ((*ifp->if_ioctl)(ifp, cmd, data));
}
if (ifp == NULL)
return (EADDRNOTAVAIL);
/*
* Security checks before we get involved in any work.
*/
switch (cmd) {
case SIOCAIFADDR:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
case SIOCSIFDSTADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
break;
case SIOCDIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_DELIFADDR);
if (error)
return (error);
}
break;
}
/*
* Find address for this interface, if it exists.
*
* If an alias address was specified, find that one instead of the
* first one on the interface, if possible.
*/
dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr;
IN_IFADDR_RLOCK();
LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) {
if (iap->ia_ifp == ifp &&
iap->ia_addr.sin_addr.s_addr == dst.s_addr) {
if (td == NULL || prison_check_ip4(td->td_ucred,
&dst) == 0)
ia = iap;
break;
}
}
if (ia != NULL)
ifa_ref(&ia->ia_ifa);
IN_IFADDR_RUNLOCK();
if (ia == NULL) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
iap = ifatoia(ifa);
if (iap->ia_addr.sin_family == AF_INET) {
if (td != NULL &&
prison_check_ip4(td->td_ucred,
&iap->ia_addr.sin_addr) != 0)
continue;
ia = iap;
break;
}
}
if (ia != NULL)
ifa_ref(&ia->ia_ifa);
IF_ADDR_UNLOCK(ifp);
}
if (ia == NULL)
iaIsFirst = 1;
error = 0;
switch (cmd) {
case SIOCAIFADDR:
case SIOCDIFADDR:
if (ifra->ifra_addr.sin_family == AF_INET) {
struct in_ifaddr *oia;
IN_IFADDR_RLOCK();
for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) {
if (ia->ia_ifp == ifp &&
ia->ia_addr.sin_addr.s_addr ==
ifra->ifra_addr.sin_addr.s_addr)
break;
}
if (ia != NULL && ia != oia)
ifa_ref(&ia->ia_ifa);
if (oia != NULL && ia != oia)
ifa_free(&oia->ia_ifa);
IN_IFADDR_RUNLOCK();
if ((ifp->if_flags & IFF_POINTOPOINT)
&& (cmd == SIOCAIFADDR)
&& (ifra->ifra_dstaddr.sin_addr.s_addr
== INADDR_ANY)) {
error = EDESTADDRREQ;
goto out;
}
}
if (cmd == SIOCDIFADDR && ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
/* FALLTHROUGH */
case SIOCSIFADDR:
case SIOCSIFNETMASK:
case SIOCSIFDSTADDR:
if (ia == NULL) {
ia = (struct in_ifaddr *)
malloc(sizeof *ia, M_IFADDR, M_NOWAIT |
M_ZERO);
if (ia == NULL) {
error = ENOBUFS;
goto out;
}
ifa = &ia->ia_ifa;
ifa_init(ifa);
ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
ia->ia_sockmask.sin_len = 8;
ia->ia_sockmask.sin_family = AF_INET;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
ia->ia_broadaddr.sin_family = AF_INET;
}
ia->ia_ifp = ifp;
ifa_ref(ifa); /* if_addrhead */
IF_ADDR_LOCK(ifp);
TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
IF_ADDR_UNLOCK(ifp);
ifa_ref(ifa); /* in_ifaddrhead */
IN_IFADDR_WLOCK();
TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
IN_IFADDR_WUNLOCK();
iaIsNew = 1;
}
break;
case SIOCSIFBRDADDR:
case SIOCGIFADDR:
case SIOCGIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCGIFBRDADDR:
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
break;
}
/*
* Most paths in this switch return directly or via out. Only paths
* that remove the address break in order to hit common removal code.
*/
switch (cmd) {
case SIOCGIFADDR:
*((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr;
goto out;
case SIOCGIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
goto out;
}
*((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr;
goto out;
case SIOCGIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
goto out;
}
*((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr;
goto out;
case SIOCGIFNETMASK:
*((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask;
goto out;
case SIOCSIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
goto out;
}
oldaddr = ia->ia_dstaddr;
ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr;
if (ifp->if_ioctl != NULL) {
error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR,
(caddr_t)ia);
if (error) {
ia->ia_dstaddr = oldaddr;
goto out;
}
}
if (ia->ia_flags & IFA_ROUTE) {
ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr;
rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
ia->ia_ifa.ifa_dstaddr =
(struct sockaddr *)&ia->ia_dstaddr;
rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
}
goto out;
case SIOCSIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
goto out;
}
ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr;
goto out;
case SIOCSIFADDR:
error = in_ifinit(ifp, ia,
(struct sockaddr_in *) &ifr->ifr_addr, 1);
if (error != 0 && iaIsNew)
break;
if (error == 0) {
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
if (iaIsFirst &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
error = in_joingroup(ifp, &allhosts_addr,
NULL, &ii->ii_allhosts);
}
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
}
error = 0;
goto out;
case SIOCSIFNETMASK:
ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr;
ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
goto out;
case SIOCAIFADDR:
maskIsNew = 0;
hostIsNew = 1;
error = 0;
if (ia->ia_addr.sin_family == AF_INET) {
if (ifra->ifra_addr.sin_len == 0) {
ifra->ifra_addr = ia->ia_addr;
hostIsNew = 0;
} else if (ifra->ifra_addr.sin_addr.s_addr ==
ia->ia_addr.sin_addr.s_addr)
hostIsNew = 0;
}
if (ifra->ifra_mask.sin_len) {
/*
* QL: XXX
* Need to scrub the prefix here in case
* the issued command is SIOCAIFADDR with
* the same address, but with a different
* prefix length. And if the prefix length
* is the same as before, then the call is
* un-necessarily executed here.
*/
in_ifscrub(ifp, ia);
ia->ia_sockmask = ifra->ifra_mask;
ia->ia_sockmask.sin_family = AF_INET;
ia->ia_subnetmask =
ntohl(ia->ia_sockmask.sin_addr.s_addr);
maskIsNew = 1;
}
if ((ifp->if_flags & IFF_POINTOPOINT) &&
(ifra->ifra_dstaddr.sin_family == AF_INET)) {
in_ifscrub(ifp, ia);
ia->ia_dstaddr = ifra->ifra_dstaddr;
maskIsNew = 1; /* We lie; but the effect's the same */
}
if (ifra->ifra_addr.sin_family == AF_INET &&
(hostIsNew || maskIsNew))
error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
if (error != 0 && iaIsNew)
goto out;
if ((ifp->if_flags & IFF_BROADCAST) &&
(ifra->ifra_broadaddr.sin_family == AF_INET))
ia->ia_broadaddr = ifra->ifra_broadaddr;
if (error == 0) {
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
if (iaIsFirst &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
error = in_joingroup(ifp, &allhosts_addr,
NULL, &ii->ii_allhosts);
}
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
}
goto out;
case SIOCDIFADDR:
/*
* in_ifscrub kills the interface route.
*/
in_ifscrub(ifp, ia);
/*
* in_ifadown gets rid of all the rest of
* the routes. This is not quite the right
* thing to do, but at least if we are running
* a routing process they will come back.
*/
in_ifadown(&ia->ia_ifa, 1);
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
error = 0;
break;
default:
panic("in_control: unsupported ioctl");
}
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
IF_ADDR_UNLOCK(ifp);
ifa_free(&ia->ia_ifa); /* if_addrhead */
IN_IFADDR_WLOCK();
TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
if (ia->ia_addr.sin_family == AF_INET) {
struct in_ifaddr *if_ia;
LIST_REMOVE(ia, ia_hash);
IN_IFADDR_WUNLOCK();
/*
* If this is the last IPv4 address configured on this
* interface, leave the all-hosts group.
* No state-change report need be transmitted.
*/
if_ia = NULL;
IFP_TO_IA(ifp, if_ia);
if (if_ia == NULL) {
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
IN_MULTI_LOCK();
if (ii->ii_allhosts) {
(void)in_leavegroup_locked(ii->ii_allhosts,
NULL);
ii->ii_allhosts = NULL;
}
IN_MULTI_UNLOCK();
} else
ifa_free(&if_ia->ia_ifa);
} else
IN_IFADDR_WUNLOCK();
ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
out:
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (error);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?!?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* EINVAL since we can't deduce hostid part of the address.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL on prefix match failed/specified address not found
* other values may be returned from in_ioctl()
*/
static int
in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
struct ifnet *ifp, struct thread *td)
{
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
/* sanity checks */
if (data == NULL || ifp == NULL) {
panic("invalid argument to in_lifaddr_ioctl");
/*NOTRECHED*/
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/*FALLTHROUGH*/
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
if (iflr->addr.ss_family != AF_INET)
return (EINVAL);
if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
return (EINVAL);
/* XXX need improvement */
if (iflr->dstaddr.ss_family
&& iflr->dstaddr.ss_family != AF_INET)
return (EINVAL);
if (iflr->dstaddr.ss_family
&& iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
return (EINVAL);
break;
default: /*shouldn't happen*/
return (EOPNOTSUPP);
}
if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
return (EINVAL);
switch (cmd) {
case SIOCALIFADDR:
{
struct in_aliasreq ifra;
if (iflr->flags & IFLR_PREFIX)
return (EINVAL);
/* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
if (iflr->dstaddr.ss_family) { /*XXX*/
bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
iflr->dstaddr.ss_len);
}
ifra.ifra_mask.sin_family = AF_INET;
ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td));
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in_ifaddr *ia;
struct in_addr mask, candidate, match;
struct sockaddr_in *sin;
bzero(&mask, sizeof(mask));
bzero(&match, sizeof(match));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in_len2mask(&mask, iflr->prefixlen);
sin = (struct sockaddr_in *)&iflr->addr;
match.s_addr = sin->sin_addr.s_addr;
match.s_addr &= mask.s_addr;
/* if you set extra bits, that's wrong */
if (match.s_addr != sin->sin_addr.s_addr)
return (EINVAL);
} else {
/* on getting an address, take the 1st match */
/* on deleting an address, do exact match */
if (cmd != SIOCGLIFADDR) {
in_len2mask(&mask, 32);
sin = (struct sockaddr_in *)&iflr->addr;
match.s_addr = sin->sin_addr.s_addr;
}
}
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (match.s_addr == 0)
break;
candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr;
candidate.s_addr &= mask.s_addr;
if (candidate.s_addr == match.s_addr)
break;
}
if (ifa == NULL)
return (EADDRNOTAVAIL);
ia = (struct in_ifaddr *)ifa;
if (cmd == SIOCGLIFADDR) {
/* fill in the if_laddrreq structure */
bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
ia->ia_dstaddr.sin_len);
} else
bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
iflr->prefixlen =
in_mask2len(&ia->ia_sockmask.sin_addr);
iflr->flags = 0; /*XXX*/
return (0);
} else {
struct in_aliasreq ifra;
/* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&ia->ia_addr, &ifra.ifra_addr,
ia->ia_addr.sin_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
ia->ia_dstaddr.sin_len);
}
bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr,
ia->ia_sockmask.sin_len);
return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
ifp, td));
}
}
}
return (EOPNOTSUPP); /*just for safety*/
}
/*
* Delete any existing route for an interface.
*/
void
in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
{
in_scrubprefix(ia);
}
/*
* Initialize an interface's internet address
* and routing table entry.
*/
static int
in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
int scrub)
{
register u_long i = ntohl(sin->sin_addr.s_addr);
struct sockaddr_in oldaddr;
int s = splimp(), flags = RTF_UP, error = 0;
oldaddr = ia->ia_addr;
if (oldaddr.sin_family == AF_INET)
LIST_REMOVE(ia, ia_hash);
ia->ia_addr = *sin;
if (ia->ia_addr.sin_family == AF_INET) {
IN_IFADDR_WLOCK();
LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
ia, ia_hash);
IN_IFADDR_WUNLOCK();
}
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
if (ifp->if_ioctl != NULL) {
error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
if (error) {
splx(s);
/* LIST_REMOVE(ia, ia_hash) is done in in_control */
ia->ia_addr = oldaddr;
IN_IFADDR_WLOCK();
if (ia->ia_addr.sin_family == AF_INET)
LIST_INSERT_HEAD(INADDR_HASH(
ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
else
/*
* If oldaddr family is not AF_INET (e.g.
* interface has been just created) in_control
* does not call LIST_REMOVE, and we end up
* with bogus ia entries in hash
*/
LIST_REMOVE(ia, ia_hash);
IN_IFADDR_WUNLOCK();
return (error);
}
}
splx(s);
if (scrub) {
ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
in_ifscrub(ifp, ia);
ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
}
if (IN_CLASSA(i))
ia->ia_netmask = IN_CLASSA_NET;
else if (IN_CLASSB(i))
ia->ia_netmask = IN_CLASSB_NET;
else
ia->ia_netmask = IN_CLASSC_NET;
/*
* The subnet mask usually includes at least the standard network part,
* but may may be smaller in the case of supernetting.
* If it is set, we believe it.
*/
if (ia->ia_subnetmask == 0) {
ia->ia_subnetmask = ia->ia_netmask;
ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
} else
ia->ia_netmask &= ia->ia_subnetmask;
ia->ia_net = i & ia->ia_netmask;
ia->ia_subnet = i & ia->ia_subnetmask;
in_socktrim(&ia->ia_sockmask);
#ifdef DEV_CARP
/*
* XXX: carp(4) does not have interface route
*/
if (ifp->if_type == IFT_CARP)
return (0);
#endif
/*
* Add route for the network.
*/
ia->ia_ifa.ifa_metric = ifp->if_metric;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_addr.s_addr =
htonl(ia->ia_subnet | ~ia->ia_subnetmask);
ia->ia_netbroadcast.s_addr =
htonl(ia->ia_net | ~ ia->ia_netmask);
} else if (ifp->if_flags & IFF_LOOPBACK) {
ia->ia_dstaddr = ia->ia_addr;
flags |= RTF_HOST;
} else if (ifp->if_flags & IFF_POINTOPOINT) {
if (ia->ia_dstaddr.sin_family != AF_INET)
return (0);
flags |= RTF_HOST;
}
if ((error = in_addprefix(ia, flags)) != 0)
return (error);
if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
return (0);
if (ifp->if_flags & IFF_POINTOPOINT) {
if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
return (0);
}
/*
* add a loopback route to self
*/
if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) {
struct route ia_ro;
bzero(&ia_ro, sizeof(ia_ro));
*((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr;
rtalloc_ign_fib(&ia_ro, 0, 0);
if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
(ia_ro.ro_rt->rt_ifp == V_loif)) {
RT_LOCK(ia_ro.ro_rt);
RT_ADDREF(ia_ro.ro_rt);
RTFREE_LOCKED(ia_ro.ro_rt);
} else
error = ifa_add_loopback_route((struct ifaddr *)ia,
(struct sockaddr *)&ia->ia_addr);
if (error == 0)
ia->ia_flags |= IFA_RTSELF;
if (ia_ro.ro_rt != NULL)
RTFREE(ia_ro.ro_rt);
}
return (error);
}
#define rtinitflags(x) \
((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
? RTF_HOST : 0)
/*
* Generate a routing message when inserting or deleting
* an interface address alias.
*/
static void in_addralias_rtmsg(int cmd, struct in_addr *prefix,
struct in_ifaddr *target)
{
struct route pfx_ro;
struct sockaddr_in *pfx_addr;
struct rtentry msg_rt;
/* QL: XXX
* This is a bit questionable because there is no
* additional route entry added/deleted for an address
* alias. Therefore this route report is inaccurate.
*/
bzero(&pfx_ro, sizeof(pfx_ro));
pfx_addr = (struct sockaddr_in *)(&pfx_ro.ro_dst);
pfx_addr->sin_len = sizeof(*pfx_addr);
pfx_addr->sin_family = AF_INET;
pfx_addr->sin_addr = *prefix;
rtalloc_ign_fib(&pfx_ro, 0, 0);
if (pfx_ro.ro_rt != NULL) {
msg_rt = *pfx_ro.ro_rt;
/* QL: XXX
* Point the gateway to the new interface
* address as if a new prefix route entry has
* been added through the new address alias.
* All other parts of the rtentry is accurate,
* e.g., rt_key, rt_mask, rt_ifp etc.
*/
msg_rt.rt_gateway =
(struct sockaddr *)&target->ia_addr;
rt_newaddrmsg(cmd,
(struct ifaddr *)target,
0, &msg_rt);
RTFREE(pfx_ro.ro_rt);
}
return;
}
/*
* Check if we have a route for the given prefix already or add one accordingly.
*/
static int
in_addprefix(struct in_ifaddr *target, int flags)
{
struct in_ifaddr *ia;
struct in_addr prefix, mask, p, m;
int error;
if ((flags & RTF_HOST) != 0) {
prefix = target->ia_dstaddr.sin_addr;
mask.s_addr = 0;
} else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
}
IN_IFADDR_RLOCK();
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia)) {
p = ia->ia_addr.sin_addr;
if (prefix.s_addr != p.s_addr)
continue;
} else {
p = ia->ia_addr.sin_addr;
m = ia->ia_sockmask.sin_addr;
p.s_addr &= m.s_addr;
if (prefix.s_addr != p.s_addr ||
mask.s_addr != m.s_addr)
continue;
}
/*
* If we got a matching prefix route inserted by other
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
#ifdef RADIX_MPATH
if (ia->ia_addr.sin_addr.s_addr ==
target->ia_addr.sin_addr.s_addr)
return (EEXIST);
else
break;
#endif
if (V_sameprefixcarponly &&
target->ia_ifp->if_type != IFT_CARP &&
ia->ia_ifp->if_type != IFT_CARP) {
IN_IFADDR_RUNLOCK();
return (EEXIST);
} else {
in_addralias_rtmsg(RTM_ADD, &prefix, target);
IN_IFADDR_RUNLOCK();
return (0);
}
}
}
IN_IFADDR_RUNLOCK();
/*
* No-one seem to have this prefix route, so we try to insert it.
*/
error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags);
if (!error)
target->ia_flags |= IFA_ROUTE;
return (error);
}
extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
/*
* If there is no other address in the system that can serve a route to the
* same prefix, remove the route. Hand over the route to the new address
* otherwise.
*/
static int
in_scrubprefix(struct in_ifaddr *target)
{
struct in_ifaddr *ia;
struct in_addr prefix, mask, p;
int error = 0;
struct sockaddr_in prefix0, mask0;
/*
* Remove the loopback route to the interface address.
* The "useloopback" setting is not consulted because if the
* user configures an interface address, turns off this
* setting, and then tries to delete that interface address,
* checking the current setting of "useloopback" would leave
* that interface address loopback route untouched, which
* would be wrong. Therefore the interface address loopback route
* deletion is unconditional.
*/
if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) &&
!(target->ia_ifp->if_flags & IFF_LOOPBACK) &&
(target->ia_flags & IFA_RTSELF)) {
struct route ia_ro;
int freeit = 0;
bzero(&ia_ro, sizeof(ia_ro));
*((struct sockaddr_in *)(&ia_ro.ro_dst)) = target->ia_addr;
rtalloc_ign_fib(&ia_ro, 0, 0);
if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
(ia_ro.ro_rt->rt_ifp == V_loif)) {
RT_LOCK(ia_ro.ro_rt);
if (ia_ro.ro_rt->rt_refcnt <= 1)
freeit = 1;
else
RT_REMREF(ia_ro.ro_rt);
RTFREE_LOCKED(ia_ro.ro_rt);
}
if (freeit)
error = ifa_del_loopback_route((struct ifaddr *)target,
(struct sockaddr *)&target->ia_addr);
if (error == 0)
target->ia_flags &= ~IFA_RTSELF;
/* remove arp cache */
arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr);
}
if (rtinitflags(target))
prefix = target->ia_dstaddr.sin_addr;
else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
}
if ((target->ia_flags & IFA_ROUTE) == 0) {
in_addralias_rtmsg(RTM_DELETE, &prefix, target);
return (0);
}
IN_IFADDR_RLOCK();
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia))
p = ia->ia_dstaddr.sin_addr;
else {
p = ia->ia_addr.sin_addr;
p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
}
if (prefix.s_addr != p.s_addr)
continue;
/*
* If we got a matching prefix address, move IFA_ROUTE and
* the route itself to it. Make sure that routing daemons
* get a heads-up.
*
* XXX: a special case for carp(4) interface
*/
if ((ia->ia_flags & IFA_ROUTE) == 0
#ifdef DEV_CARP
&& (ia->ia_ifp->if_type != IFT_CARP)
#endif
) {
IN_IFADDR_RUNLOCK();
rtinit(&(target->ia_ifa), (int)RTM_DELETE,
rtinitflags(target));
target->ia_flags &= ~IFA_ROUTE;
error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
rtinitflags(ia) | RTF_UP);
if (error == 0)
ia->ia_flags |= IFA_ROUTE;
return (error);
}
}
IN_IFADDR_RUNLOCK();
/*
* remove all L2 entries on the given prefix
*/
bzero(&prefix0, sizeof(prefix0));
prefix0.sin_len = sizeof(prefix0);
prefix0.sin_family = AF_INET;
prefix0.sin_addr.s_addr = target->ia_subnet;
bzero(&mask0, sizeof(mask0));
mask0.sin_len = sizeof(mask0);
mask0.sin_family = AF_INET;
mask0.sin_addr.s_addr = target->ia_subnetmask;
lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
(struct sockaddr *)&mask0);
/*
* As no-one seem to have this prefix, we can remove the route.
*/
rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target));
target->ia_flags &= ~IFA_ROUTE;
return (0);
}
#undef rtinitflags
/*
* Return 1 if the address might be a local broadcast address.
*/
int
in_broadcast(struct in_addr in, struct ifnet *ifp)
{
register struct ifaddr *ifa;
u_long t;
if (in.s_addr == INADDR_BROADCAST ||
in.s_addr == INADDR_ANY)
return (1);
if ((ifp->if_flags & IFF_BROADCAST) == 0)
return (0);
t = ntohl(in.s_addr);
/*
* Look through the list of addresses for a match
* with a broadcast address.
*/
#define ia ((struct in_ifaddr *)ifa)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
(in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
in.s_addr == ia->ia_netbroadcast.s_addr ||
/*
* Check for old-style (host 0) broadcast.
*/
t == ia->ia_subnet || t == ia->ia_net) &&
/*
* Check for an all one subnetmask. These
* only exist when an interface gets a secondary
* address.
*/
ia->ia_subnetmask != (u_long)0xffffffff)
return (1);
return (0);
#undef ia
}
/*
* On interface removal, clean up IPv4 data structures hung off of the ifnet.
*/
void
in_ifdetach(struct ifnet *ifp)
{
in_pcbpurgeif0(&V_ripcbinfo, ifp);
in_pcbpurgeif0(&V_udbinfo, ifp);
in_purgemaddrs(ifp);
}
/*
* Delete all IPv4 multicast address records, and associated link-layer
* multicast address records, associated with ifp.
* XXX It looks like domifdetach runs AFTER the link layer cleanup.
* XXX This should not race with ifma_protospec being set during
* a new allocation, if it does, we have bigger problems.
*/
static void
in_purgemaddrs(struct ifnet *ifp)
{
LIST_HEAD(,in_multi) purgeinms;
struct in_multi *inm, *tinm;
struct ifmultiaddr *ifma;
LIST_INIT(&purgeinms);
IN_MULTI_LOCK();
/*
* Extract list of in_multi associated with the detaching ifp
* which the PF_INET layer is about to release.
* We need to do this as IF_ADDR_LOCK() may be re-acquired
* by code further down.
*/
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
#if 0
KASSERT(ifma->ifma_protospec != NULL,
("%s: ifma_protospec is NULL", __func__));
#endif
inm = (struct in_multi *)ifma->ifma_protospec;
LIST_INSERT_HEAD(&purgeinms, inm, inm_link);
}
IF_ADDR_UNLOCK(ifp);
LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) {
LIST_REMOVE(inm, inm_link);
inm_release_locked(inm);
}
igmp_ifdetach(ifp);
IN_MULTI_UNLOCK();
}
#include <net/if_dl.h>
#include <netinet/if_ether.h>
struct in_llentry {
struct llentry base;
struct sockaddr_in l3_addr4;
};
static struct llentry *
in_lltable_new(const struct sockaddr *l3addr, u_int flags)
{
struct in_llentry *lle;
lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_DONTWAIT | M_ZERO);
if (lle == NULL) /* NB: caller generates msg */
return NULL;
callout_init(&lle->base.la_timer, CALLOUT_MPSAFE);
/*
* For IPv4 this will trigger "arpresolve" to generate
* an ARP request.
*/
lle->base.la_expire = time_second; /* mark expired */
lle->l3_addr4 = *(const struct sockaddr_in *)l3addr;
lle->base.lle_refcnt = 1;
LLE_LOCK_INIT(&lle->base);
return &lle->base;
}
/*
* Deletes an address from the address table.
* This function is called by the timer functions
* such as arptimer() and nd6_llinfo_timer(), and
* the caller does the locking.
*/
static void
in_lltable_free(struct lltable *llt, struct llentry *lle)
{
LLE_WUNLOCK(lle);
LLE_LOCK_DESTROY(lle);
free(lle, M_LLTABLE);
}
#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
(((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 )
static void
in_lltable_prefix_free(struct lltable *llt,
const struct sockaddr *prefix,
const struct sockaddr *mask)
{
const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix;
const struct sockaddr_in *msk = (const struct sockaddr_in *)mask;
struct llentry *lle, *next;
register int i;
for (i=0; i < LLTBL_HASHTBL_SIZE; i++) {
LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
if (IN_ARE_MASKED_ADDR_EQUAL((struct sockaddr_in *)L3_ADDR(lle),
pfx, msk)) {
int canceled;
canceled = callout_drain(&lle->la_timer);
LLE_WLOCK(lle);
if (canceled)
LLE_REMREF(lle);
llentry_free(lle);
}
}
}
}
static int
in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
{
struct rtentry *rt;
KASSERT(l3addr->sa_family == AF_INET,
("sin_family %d", l3addr->sa_family));
/* XXX rtalloc1 should take a const param */
rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0);
- if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) ||
- ((rt->rt_ifp != ifp) && !(flags & LLE_PUB))) {
+ if (rt == NULL || (!(flags & LLE_PUB) &&
+ ((rt->rt_flags & RTF_GATEWAY) ||
+ (rt->rt_ifp != ifp)))) {
#ifdef DIAGNOSTIC
log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
#endif
if (rt != NULL)
RTFREE_LOCKED(rt);
return (EINVAL);
}
RTFREE_LOCKED(rt);
return 0;
}
/*
* Return NULL if not found or marked for deletion.
* If found return lle read locked.
*/
static struct llentry *
in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
{
const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
struct llentries *lleh;
u_int hashkey;
IF_AFDATA_LOCK_ASSERT(ifp);
KASSERT(l3addr->sa_family == AF_INET,
("sin_family %d", l3addr->sa_family));
hashkey = sin->sin_addr.s_addr;
lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)];
LIST_FOREACH(lle, lleh, lle_next) {
struct sockaddr_in *sa2 = (struct sockaddr_in *)L3_ADDR(lle);
if (lle->la_flags & LLE_DELETED)
continue;
if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr)
break;
}
if (lle == NULL) {
#ifdef DIAGNOSTIC
if (flags & LLE_DELETE)
log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle);
#endif
if (!(flags & LLE_CREATE))
return (NULL);
/*
* A route that covers the given address must have
* been installed 1st because we are doing a resolution,
* verify this.
*/
if (!(flags & LLE_IFADDR) &&
in_lltable_rtcheck(ifp, flags, l3addr) != 0)
goto done;
lle = in_lltable_new(l3addr, flags);
if (lle == NULL) {
log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
goto done;
}
lle->la_flags = flags & ~LLE_CREATE;
if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) {
bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen);
lle->la_flags |= (LLE_VALID | LLE_STATIC);
}
lle->lle_tbl = llt;
lle->lle_head = lleh;
LIST_INSERT_HEAD(lleh, lle, lle_next);
} else if (flags & LLE_DELETE) {
if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
LLE_WLOCK(lle);
lle->la_flags = LLE_DELETED;
LLE_WUNLOCK(lle);
#ifdef DIAGNOSTIC
log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
#endif
}
lle = (void *)-1;
}
if (LLE_IS_VALID(lle)) {
if (flags & LLE_EXCLUSIVE)
LLE_WLOCK(lle);
else
LLE_RLOCK(lle);
}
done:
return (lle);
}
static int
in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
{
#define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle))
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
/* XXX stack use */
struct {
struct rt_msghdr rtm;
struct sockaddr_inarp sin;
struct sockaddr_dl sdl;
} arpc;
int error, i;
LLTABLE_LOCK_ASSERT();
error = 0;
for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
struct sockaddr_dl *sdl;
/* skip deleted entries */
if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
continue;
/* Skip if jailed and not a valid IP of the prison. */
if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0)
continue;
/*
* produce a msg made of:
* struct rt_msghdr;
* struct sockaddr_inarp; (IPv4)
* struct sockaddr_dl;
*/
bzero(&arpc, sizeof(arpc));
arpc.rtm.rtm_msglen = sizeof(arpc);
arpc.rtm.rtm_version = RTM_VERSION;
arpc.rtm.rtm_type = RTM_GET;
arpc.rtm.rtm_flags = RTF_UP;
arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
arpc.sin.sin_family = AF_INET;
arpc.sin.sin_len = sizeof(arpc.sin);
arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr;
/* publish */
if (lle->la_flags & LLE_PUB) {
arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
/* proxy only */
if (lle->la_flags & LLE_PROXY)
arpc.sin.sin_other = SIN_PROXY;
}
sdl = &arpc.sdl;
sdl->sdl_family = AF_LINK;
sdl->sdl_len = sizeof(*sdl);
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = ifp->if_type;
if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
sdl->sdl_alen = ifp->if_addrlen;
bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
} else {
sdl->sdl_alen = 0;
bzero(LLADDR(sdl), ifp->if_addrlen);
}
arpc.rtm.rtm_rmx.rmx_expire =
lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
if (lle->la_flags & LLE_STATIC)
arpc.rtm.rtm_flags |= RTF_STATIC;
arpc.rtm.rtm_index = ifp->if_index;
error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
if (error)
break;
}
}
return error;
#undef SIN
}
void *
in_domifattach(struct ifnet *ifp)
{
struct in_ifinfo *ii;
struct lltable *llt;
ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO);
llt = lltable_init(ifp, AF_INET);
if (llt != NULL) {
llt->llt_new = in_lltable_new;
llt->llt_free = in_lltable_free;
llt->llt_prefix_free = in_lltable_prefix_free;
llt->llt_rtcheck = in_lltable_rtcheck;
llt->llt_lookup = in_lltable_lookup;
llt->llt_dump = in_lltable_dump;
}
ii->ii_llt = llt;
ii->ii_igmp = igmp_domifattach(ifp);
return ii;
}
void
in_domifdetach(struct ifnet *ifp, void *aux)
{
struct in_ifinfo *ii = (struct in_ifinfo *)aux;
igmp_domifdetach(ifp);
lltable_free(ii->ii_llt);
free(ii, M_IFADDR);
}
Index: stable/8/sys/netinet/in_pcb.c
===================================================================
--- stable/8/sys/netinet/in_pcb.c (revision 209276)
+++ stable/8/sys/netinet/in_pcb.c (revision 209277)
@@ -1,1957 +1,1957 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007-2009 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_ipsec.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <security/mac/mac_framework.h>
/*
* These configure the range of local port addresses assigned to
* "unspecified" outgoing connections/packets/whatever.
*/
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
/*
* Reserved ports accessible only to root. There are significant
* security considerations that must be accounted for when changing these,
* but the security benefits can be great. Please be careful.
*/
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_reservedlow);
/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */
VNET_DEFINE(int, ipport_tcpallocs);
static VNET_DEFINE(int, ipport_tcplastcount);
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
#define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
else if ((var) > (max)) { (var) = (max); }
static void in_pcbremlists(struct inpcb *inp);
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
int error;
#ifdef VIMAGE
error = vnet_sysctl_handle_int(oidp, arg1, arg2, req);
#else
error = sysctl_handle_int(oidp, arg1, arg2, req);
#endif
if (error == 0) {
RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
}
return (error);
}
#undef RANGECHK
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
"allocations before switching to a sequental one");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
* NOTE: It is assumed that most of these functions will be called with
* the pcbinfo lock held, and often, the inpcb lock held, as these utility
* functions often modify hash chains or addresses in pcbs.
*/
/*
* Allocate a PCB and associate it with the socket.
* On success return with the PCB locked.
*/
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
struct inpcb *inp;
int error;
INP_INFO_WLOCK_ASSERT(pcbinfo);
error = 0;
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
return (ENOBUFS);
bzero(inp, inp_zero_size);
inp->inp_pcbinfo = pcbinfo;
inp->inp_socket = so;
inp->inp_cred = crhold(so->so_cred);
inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
error = mac_inpcb_init(inp, M_NOWAIT);
if (error != 0)
goto out;
mac_inpcb_create(so, inp);
#endif
#ifdef IPSEC
error = ipsec_init_policy(so, &inp->inp_sp);
if (error != 0) {
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
goto out;
}
#endif /*IPSEC*/
#ifdef INET6
if (INP_SOCKAF(so) == AF_INET6) {
inp->inp_vflag |= INP_IPV6PROTO;
if (V_ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
#ifdef INET6
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
inp->inp_refcount = 1; /* Reference from the inpcbinfo */
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
crfree(inp->inp_cred);
uma_zfree(pcbinfo->ipi_zone, inp);
}
#endif
return (error);
}
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
int anonport, error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
return (EINVAL);
anonport = inp->inp_lport == 0 && (nam == NULL ||
((struct sockaddr_in *)nam)->sin_port == 0);
error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
&inp->inp_lport, cred);
if (error)
return (error);
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
return (EAGAIN);
}
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Set up a bind operation on a PCB, performing port allocation
* as required, but do not actually modify the PCB. Callers can
* either complete the bind by setting inp_laddr/inp_lport and
* calling in_pcbinshash(), or they can just use the resulting
* port and address to authorise the sending of a once-off packet.
*
* On error, the values of *laddrp and *lportp are not changed.
*/
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
u_short *lportp, struct ucred *cred)
{
struct socket *so = inp->inp_socket;
unsigned short *lastport;
struct sockaddr_in *sin;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct in_addr laddr;
u_short lport = 0;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
int dorandom;
/*
* Because no actual state changes occur here, a global write lock on
* the pcbinfo isn't required.
*/
INP_INFO_LOCK_ASSERT(pcbinfo);
INP_LOCK_ASSERT(inp);
if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
return (error);
} else {
sin = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sin))
return (EINVAL);
#ifdef notdef
/*
* We should check the family, but old programs
* incorrectly fail to initialize it.
*/
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
#endif
error = prison_local_ip4(cred, &sin->sin_addr);
if (error)
return (error);
if (sin->sin_port != *lportp) {
/* Don't allow the port to change. */
if (*lportp != 0)
return (EINVAL);
lport = sin->sin_port;
}
/* NB: lport is left as 0 if the port isn't being changed. */
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & SO_REUSEADDR)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
* Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
if (lport) {
struct inpcb *t;
struct tcptw *tw;
/* GROSS */
if (ntohs(lport) <= V_ipport_reservedhigh &&
ntohs(lport) >= V_ipport_reservedlow &&
priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
0))
return (EACCES);
if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
priv_check_cred(inp->inp_cred,
PRIV_NETINET_REUSEPORT, 0) != 0) {
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, INPLOOKUP_WILDCARD, cred);
/*
* XXX
* This entire block sorely needs a rewrite.
*/
if (t &&
((t->inp_flags & INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
(t->inp_socket->so_options &
SO_REUSEPORT) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
}
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, wild, cred);
if (t && (t->inp_flags & INP_TIMEWAIT)) {
/*
* XXXRW: If an incpb has had its timewait
* state recycled, we treat the address as
* being in use (for now). This is better
* than a panic, but not desirable.
*/
tw = intotw(inp);
if (tw == NULL ||
(reuseport & tw->tw_so_options) == 0)
return (EADDRINUSE);
} else if (t &&
(reuseport & t->inp_socket->so_options) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY ||
INP_SOCKAF(so) ==
INP_SOCKAF(t->inp_socket))
#endif
return (EADDRINUSE);
}
}
}
if (*lportp != 0)
lport = *lportp;
if (lport == 0) {
u_short first, last, aux;
int count;
if (inp->inp_flags & INP_HIGHPORT) {
first = V_ipport_hifirstauto; /* sysctl */
last = V_ipport_hilastauto;
lastport = &pcbinfo->ipi_lasthi;
} else if (inp->inp_flags & INP_LOWPORT) {
error = priv_check_cred(cred,
PRIV_NETINET_RESERVEDPORT, 0);
if (error)
return error;
first = V_ipport_lowfirstauto; /* 1023 */
last = V_ipport_lowlastauto; /* 600 */
lastport = &pcbinfo->ipi_lastlow;
} else {
first = V_ipport_firstauto; /* sysctl */
last = V_ipport_lastauto;
lastport = &pcbinfo->ipi_lastport;
}
/*
* For UDP, use random port allocation as long as the user
* allows it. For TCP (and as of yet unknown) connections,
* use random port allocation only if the user allows it AND
* ipport_tick() allows it.
*/
if (V_ipport_randomized &&
(!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
dorandom = 1;
else
dorandom = 0;
/*
* It makes no sense to do random port allocation if
* we have the only port available.
*/
if (first == last)
dorandom = 0;
/* Make sure to not include UDP packets in the count. */
if (pcbinfo != &V_udbinfo)
V_ipport_tcpallocs++;
/*
* Instead of having two loops further down counting up or down
* make sure that first is always <= last and go with only one
* code path implementing all logic.
*/
if (first > last) {
aux = first;
first = last;
last = aux;
}
if (dorandom)
*lastport = first +
(arc4random() % (last - first));
count = last - first;
do {
if (count-- < 0) /* completely used? */
return (EADDRNOTAVAIL);
++*lastport;
if (*lastport < first || *lastport > last)
*lastport = first;
lport = htons(*lastport);
} while (in_pcblookup_local(pcbinfo, laddr,
lport, wild, cred));
}
*laddrp = laddr.s_addr;
*lportp = lport;
return (0);
}
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin.
* If don't have a local address for this socket yet,
* then pick one.
*/
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
u_short lport, fport;
in_addr_t laddr, faddr;
int anonport, error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
lport = inp->inp_lport;
laddr = inp->inp_laddr.s_addr;
anonport = (lport == 0);
error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
NULL, cred);
if (error)
return (error);
/* Do the initial binding of the local address if required. */
if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
inp->inp_lport = lport;
inp->inp_laddr.s_addr = laddr;
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
return (EAGAIN);
}
}
/* Commit the remaining changes. */
inp->inp_lport = lport;
inp->inp_laddr.s_addr = laddr;
inp->inp_faddr.s_addr = faddr;
inp->inp_fport = fport;
in_pcbrehash(inp);
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Do proper source address selection on an unbound socket in case
* of connect. Take jails into account as well.
*/
static int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
struct ucred *cred)
{
struct ifaddr *ifa;
struct sockaddr *sa;
struct sockaddr_in *sin;
struct route sro;
int error;
KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
/*
* Bypass source address selection and use the primary jail IP
* if requested.
*/
if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
return (0);
error = 0;
bzero(&sro, sizeof(sro));
sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_addr.s_addr = faddr->s_addr;
/*
* If route is known our src addr is taken from the i/f,
* else punt.
*
* Find out route to destination.
*/
if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
/*
* If we found a route, use the address corresponding to
* the outgoing interface.
*
* Otherwise assume faddr is reachable on a directly connected
* network and try to find a corresponding interface to take
* the source address from.
*/
if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
struct in_ifaddr *ia;
struct ifnet *ifp;
ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
if (ia == NULL)
- ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin));
+ ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0));
if (ia == NULL) {
error = ENETUNREACH;
goto done;
}
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
ifa_free(&ia->ia_ifa);
goto done;
}
ifp = ia->ia_ifp;
ifa_free(&ia->ia_ifa);
ia = NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
sin = (struct sockaddr_in *)sa;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)ifa;
break;
}
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
IF_ADDR_UNLOCK(ifp);
goto done;
}
IF_ADDR_UNLOCK(ifp);
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
goto done;
}
/*
* If the outgoing interface on the route found is not
* a loopback interface, use the address from that interface.
* In case of jails do those three steps:
* 1. check if the interface address belongs to the jail. If so use it.
* 2. check if we have any address on the outgoing interface
* belonging to this jail. If so use it.
* 3. as a last resort return the 'default' jail address.
*/
if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
struct in_ifaddr *ia;
struct ifnet *ifp;
/* If not jailed, use the default returned. */
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
goto done;
}
/* Jailed. */
/* 1. Check if the iface address belongs to the jail. */
sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
goto done;
}
/*
* 2. Check if we have any address on the outgoing interface
* belonging to this jail.
*/
ia = NULL;
ifp = sro.ro_rt->rt_ifp;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
sin = (struct sockaddr_in *)sa;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)ifa;
break;
}
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
IF_ADDR_UNLOCK(ifp);
goto done;
}
IF_ADDR_UNLOCK(ifp);
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
goto done;
}
/*
* The outgoing interface is marked with 'loopback net', so a route
* to ourselves is here.
* Try to find the interface of the destination address and then
* take the address from there. That interface is not necessarily
* a loopback interface.
* In case of jails, check that it is an address of the jail
* and if we cannot find, fall back to the 'default' jail address.
*/
if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
struct sockaddr_in sain;
struct in_ifaddr *ia;
bzero(&sain, sizeof(struct sockaddr_in));
sain.sin_family = AF_INET;
sain.sin_len = sizeof(struct sockaddr_in);
sain.sin_addr.s_addr = faddr->s_addr;
ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
if (ia == NULL)
- ia = ifatoia(ifa_ifwithnet(sintosa(&sain)));
+ ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0));
if (ia == NULL)
ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
if (ia == NULL) {
error = ENETUNREACH;
goto done;
}
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
ifa_free(&ia->ia_ifa);
goto done;
}
/* Jailed. */
if (ia != NULL) {
struct ifnet *ifp;
ifp = ia->ia_ifp;
ifa_free(&ia->ia_ifa);
ia = NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
sin = (struct sockaddr_in *)sa;
if (prison_check_ip4(cred,
&sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)ifa;
break;
}
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
IF_ADDR_UNLOCK(ifp);
goto done;
}
IF_ADDR_UNLOCK(ifp);
}
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
goto done;
}
done:
if (sro.ro_rt != NULL)
RTFREE(sro.ro_rt);
return (error);
}
/*
* Set up for a connect from a socket to the specified address.
* On entry, *laddrp and *lportp should contain the current local
* address and port for the PCB; these are updated to the values
* that should be placed in inp_laddr and inp_lport to complete
* the connect.
*
* On success, *faddrp and *fportp will be set to the remote address
* and port. These are not updated in the error case.
*
* If the operation fails because the connection already exists,
* *oinpp will be set to the PCB of that connection so that the
* caller can decide to override it. In all other cases, *oinpp
* is set to NULL.
*/
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
struct inpcb **oinpp, struct ucred *cred)
{
struct sockaddr_in *sin = (struct sockaddr_in *)nam;
struct in_ifaddr *ia;
struct inpcb *oinp;
struct in_addr laddr, faddr;
u_short lport, fport;
int error;
/*
* Because a global state change doesn't actually occur here, a read
* lock is sufficient.
*/
INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
INP_LOCK_ASSERT(inp);
if (oinpp != NULL)
*oinpp = NULL;
if (nam->sa_len != sizeof (*sin))
return (EINVAL);
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
if (sin->sin_port == 0)
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
lport = *lportp;
faddr = sin->sin_addr;
fport = sin->sin_port;
if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
* use the primary local address.
* If the supplied address is INADDR_BROADCAST,
* and the primary interface supports broadcast,
* choose the broadcast address for that interface.
*/
if (faddr.s_addr == INADDR_ANY) {
IN_IFADDR_RLOCK();
faddr =
IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
IN_IFADDR_RUNLOCK();
if (cred != NULL &&
(error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
IN_IFADDR_RLOCK();
if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST)
faddr = satosin(&TAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
IN_IFADDR_RUNLOCK();
}
}
if (laddr.s_addr == INADDR_ANY) {
error = in_pcbladdr(inp, &faddr, &laddr, cred);
if (error)
return (error);
/*
* If the destination address is multicast and an outgoing
* interface has been set as a multicast option, use the
* address of that interface as our source address.
*/
if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
inp->inp_moptions != NULL) {
struct ip_moptions *imo;
struct ifnet *ifp;
imo = inp->inp_moptions;
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
IN_IFADDR_RLOCK();
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
if (ia->ia_ifp == ifp)
break;
if (ia == NULL) {
IN_IFADDR_RUNLOCK();
return (EADDRNOTAVAIL);
}
laddr = ia->ia_addr.sin_addr;
IN_IFADDR_RUNLOCK();
}
}
}
oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
0, NULL);
if (oinp != NULL) {
if (oinpp != NULL)
*oinpp = oinp;
return (EADDRINUSE);
}
if (lport == 0) {
error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
cred);
if (error)
return (error);
}
*laddrp = laddr.s_addr;
*lportp = lport;
*faddrp = faddr.s_addr;
*fportp = fport;
return (0);
}
void
in_pcbdisconnect(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
inp->inp_fport = 0;
in_pcbrehash(inp);
}
/*
* in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
* For most protocols, this will be invoked immediately prior to calling
* in_pcbfree(). However, with TCP the inpcb may significantly outlive the
* socket, in which case in_pcbfree() is deferred.
*/
void
in_pcbdetach(struct inpcb *inp)
{
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
/*
* in_pcbfree_internal() frees an inpcb that has been detached from its
* socket, and whose reference count has reached 0. It will also remove the
* inpcb from any global lists it might remain on.
*/
static void
in_pcbfree_internal(struct inpcb *inp)
{
struct inpcbinfo *ipi = inp->inp_pcbinfo;
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
#ifdef IPSEC
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
#endif /* IPSEC */
inp->inp_gencnt = ++ipi->ipi_gencnt;
in_pcbremlists(inp);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
if (inp->in6p_moptions != NULL)
ip6_freemoptions(inp->in6p_moptions);
}
#endif
if (inp->inp_options)
(void)m_free(inp->inp_options);
if (inp->inp_moptions != NULL)
inp_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
crfree(inp->inp_cred);
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
INP_WUNLOCK(inp);
uma_zfree(ipi->ipi_zone, inp);
}
/*
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
* but where the inpcb lock is already held.
*
* While the inpcb will not be freed, releasing the inpcb lock means that the
* connection's state may change, so the caller should be careful to
* revalidate any cached state on reacquiring the lock. Drop the reference
* using in_pcbrele().
*/
void
in_pcbref(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
inp->inp_refcount++;
}
/*
* Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
* return a flag indicating whether or not the inpcb remains valid. If it is
* valid, we return with the inpcb lock held.
*/
int
in_pcbrele(struct inpcb *inp)
{
#ifdef INVARIANTS
struct inpcbinfo *ipi = inp->inp_pcbinfo;
#endif
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
inp->inp_refcount--;
if (inp->inp_refcount > 0)
return (0);
in_pcbfree_internal(inp);
return (1);
}
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
* released using in_pcbrele(), but the inpcb is still unlocked.
*/
void
in_pcbfree(struct inpcb *inp)
{
#ifdef INVARIANTS
struct inpcbinfo *ipi = inp->inp_pcbinfo;
#endif
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
__func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
if (!in_pcbrele(inp))
INP_WUNLOCK(inp);
}
/*
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
* port reservation, and preventing it from being returned by inpcb lookups.
*
* It is used by TCP to mark an inpcb as unused and avoid future packet
* delivery or event notification when a socket remains open but TCP has
* closed. This might occur as a result of a shutdown()-initiated TCP close
* or a RST on the wire, and allows the port binding to be reused while still
* maintaining the invariant that so_pcb always points to a valid inpcb until
* in_pcbdetach().
*
* XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
* lists, but can lead to confusing netstat output, as open sockets with
* closed TCP connections will no longer appear to have their bound port
* number. An explicit flag would be better, as it would allow us to leave
* the port number intact after the connection is dropped.
*
* XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
* in_pcbnotifyall() and in_pcbpurgeif0()?
*/
void
in_pcbdrop(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_flags |= INP_DROPPED;
if (inp->inp_flags & INP_INHASHLIST) {
struct inpcbport *phd = inp->inp_phd;
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
inp->inp_flags &= ~INP_INHASHLIST;
}
}
/*
* Common routines to return the socket addresses associated with inpcbs.
*/
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr_p)
{
struct sockaddr_in *sin;
sin = malloc(sizeof *sin, M_SONAME,
M_WAITOK | M_ZERO);
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = *addr_p;
sin->sin_port = port;
return (struct sockaddr *)sin;
}
int
in_getsockaddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_lport;
addr = inp->inp_laddr;
INP_RUNLOCK(inp);
*nam = in_sockaddr(port, &addr);
return 0;
}
int
in_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_fport;
addr = inp->inp_faddr;
INP_RUNLOCK(inp);
*nam = in_sockaddr(port, &addr);
return 0;
}
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
struct inpcb *(*notify)(struct inpcb *, int))
{
struct inpcb *inp, *inp_temp;
INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0) {
INP_WUNLOCK(inp);
continue;
}
#endif
if (inp->inp_faddr.s_addr != faddr.s_addr ||
inp->inp_socket == NULL) {
INP_WUNLOCK(inp);
continue;
}
if ((*notify)(inp, errno))
INP_WUNLOCK(inp);
}
INP_INFO_WUNLOCK(pcbinfo);
}
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
struct inpcb *inp;
struct ip_moptions *imo;
int i, gap;
INP_INFO_RLOCK(pcbinfo);
LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
if ((inp->inp_vflag & INP_IPV4) &&
imo != NULL) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (imo->imo_multicast_ifp == ifp)
imo->imo_multicast_ifp = NULL;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships;
i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
in_delmulti(imo->imo_membership[i]);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] =
imo->imo_membership[i];
}
imo->imo_num_memberships -= gap;
}
INP_WUNLOCK(inp);
}
INP_INFO_RUNLOCK(pcbinfo);
}
/*
* Lookup a PCB based on the local address and port.
*/
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
u_short lport, int wild_okay, struct ucred *cred)
{
struct inpcb *inp;
#ifdef INET6
int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
int matchwild = 3;
#endif
int wildcard;
INP_INFO_LOCK_ASSERT(pcbinfo);
if (!wild_okay) {
struct inpcbhead *head;
/*
* Look for an unconnected (wildcard foreign addr) PCB that
* matches the local address and port we're looking for.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == INADDR_ANY &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_lport == lport) {
/*
* Found?
*/
if (cred == NULL ||
prison_equal_ip4(cred->cr_prison,
inp->inp_cred->cr_prison))
return (inp);
}
}
/*
* Not found.
*/
return (NULL);
} else {
struct inpcbporthead *porthash;
struct inpcbport *phd;
struct inpcb *match = NULL;
/*
* Best fit PCB lookup.
*
* First see if this local port is in use by looking on the
* port hash list.
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
LIST_FOREACH(phd, porthash, phd_hash) {
if (phd->phd_port == lport)
break;
}
if (phd != NULL) {
/*
* Port is in use by one or more PCBs. Look for best
* fit.
*/
LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
wildcard = 0;
if (cred != NULL &&
!prison_equal_ip4(inp->inp_cred->cr_prison,
cred->cr_prison))
continue;
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
/*
* We never select the PCB that has
* INP_IPV6 flag and is bound to :: if
* we have another PCB which is bound
* to 0.0.0.0. If a PCB has the
* INP_IPV6 flag, then we set its cost
* higher than IPv4 only PCBs.
*
* Note that the case only happens
* when a socket is bound to ::, under
* the condition that the use of the
* mapped address is allowed.
*/
if ((inp->inp_vflag & INP_IPV6) != 0)
wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY)
wildcard++;
if (inp->inp_laddr.s_addr != INADDR_ANY) {
if (laddr.s_addr == INADDR_ANY)
wildcard++;
else if (inp->inp_laddr.s_addr != laddr.s_addr)
continue;
} else {
if (laddr.s_addr != INADDR_ANY)
wildcard++;
}
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
}
return (match);
}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
/*
* Lookup PCB in hash list.
*/
struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
struct ifnet *ifp)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
u_short fport = fport_arg, lport = lport_arg;
INP_INFO_LOCK_ASSERT(pcbinfo);
/*
* First look for an exact match.
*/
tmpinp = NULL;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == faddr.s_addr &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_fport == fport &&
inp->inp_lport == lport) {
/*
* XXX We should be able to directly return
* the inp here, without any checks.
* Well unless both bound with SO_REUSEPORT?
*/
if (prison_flag(inp->inp_cred, PR_IP4))
return (inp);
if (tmpinp == NULL)
tmpinp = inp;
}
}
if (tmpinp != NULL)
return (tmpinp);
/*
* Then look for a wildcard match, if requested.
*/
if (wildcard == INPLOOKUP_WILDCARD) {
struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
struct inpcb *local_wild_mapped = NULL;
#endif
struct inpcb *jail_wild = NULL;
int injail;
/*
* Order of socket selection - we always prefer jails.
* 1. jailed, non-wild.
* 2. jailed, wild.
* 3. non-jailed, non-wild.
* 4. non-jailed, wild.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY ||
inp->inp_lport != lport)
continue;
/* XXX inp locking */
if (ifp && ifp->if_type == IFT_FAITH &&
(inp->inp_flags & INP_FAITH) == 0)
continue;
injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
if (prison_check_ip4(inp->inp_cred,
&laddr) != 0)
continue;
} else {
if (local_exact != NULL)
continue;
}
if (inp->inp_laddr.s_addr == laddr.s_addr) {
if (injail)
return (inp);
else
local_exact = inp;
} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
/* XXX inp locking, NULL check */
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
#endif /* INET6 */
if (injail)
jail_wild = inp;
else
local_wild = inp;
}
} /* LIST_FOREACH */
if (jail_wild != NULL)
return (jail_wild);
if (local_exact != NULL)
return (local_exact);
if (local_wild != NULL)
return (local_wild);
#ifdef INET6
if (local_wild_mapped != NULL)
return (local_wild_mapped);
#endif /* defined(INET6) */
} /* if (wildcard == INPLOOKUP_WILDCARD) */
return (NULL);
}
/*
* Insert PCB onto various hash lists.
*/
int
in_pcbinshash(struct inpcb *inp)
{
struct inpcbhead *pcbhash;
struct inpcbporthead *pcbporthash;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
("in_pcbinshash: INP_INHASHLIST"));
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
else
#endif /* INET6 */
hashkey_faddr = inp->inp_faddr.s_addr;
pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
/*
* Go through port list and look for a head for this lport.
*/
LIST_FOREACH(phd, pcbporthash, phd_hash) {
if (phd->phd_port == inp->inp_lport)
break;
}
/*
* If none exists, malloc one and tack it on.
*/
if (phd == NULL) {
phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
if (phd == NULL) {
return (ENOBUFS); /* XXX */
}
phd->phd_port = inp->inp_lport;
LIST_INIT(&phd->phd_pcblist);
LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
}
inp->inp_phd = phd;
LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
inp->inp_flags |= INP_INHASHLIST;
return (0);
}
/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
* hashed port list would have to be updated as well), so the lport must
* not change after in_pcbinshash() has been called.
*/
void
in_pcbrehash(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbhead *head;
u_int32_t hashkey_faddr;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_flags & INP_INHASHLIST,
("in_pcbrehash: !INP_INHASHLIST"));
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
else
#endif /* INET6 */
hashkey_faddr = inp->inp_faddr.s_addr;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
/*
* Remove PCB from various lists.
*/
static void
in_pcbremlists(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
if (inp->inp_flags & INP_INHASHLIST) {
struct inpcbport *phd = inp->inp_phd;
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
inp->inp_flags &= ~INP_INHASHLIST;
}
LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
}
/*
* A set label operation has occurred at the socket layer, propagate the
* label change into the in_pcb for the socket.
*/
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
INP_WLOCK(inp);
SOCK_LOCK(so);
mac_inpcb_sosetlabel(so, inp);
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
#endif
}
/*
* ipport_tick runs once per second, determining if random port allocation
* should be continued. If more than ipport_randomcps ports have been
* allocated in the last second, then we return to sequential port
* allocation. We return to random allocation only once we drop below
* ipport_randomcps for at least ipport_randomtime seconds.
*/
void
ipport_tick(void *xtp)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
if (V_ipport_tcpallocs <=
V_ipport_tcplastcount + V_ipport_randomcps) {
if (V_ipport_stoprandom > 0)
V_ipport_stoprandom--;
} else
V_ipport_stoprandom = V_ipport_randomtime;
V_ipport_tcplastcount = V_ipport_tcpallocs;
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
}
void
inp_wlock(struct inpcb *inp)
{
INP_WLOCK(inp);
}
void
inp_wunlock(struct inpcb *inp)
{
INP_WUNLOCK(inp);
}
void
inp_rlock(struct inpcb *inp)
{
INP_RLOCK(inp);
}
void
inp_runlock(struct inpcb *inp)
{
INP_RUNLOCK(inp);
}
#ifdef INVARIANTS
void
inp_lock_assert(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
}
void
inp_unlock_assert(struct inpcb *inp)
{
INP_UNLOCK_ASSERT(inp);
}
#endif
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
struct inpcb *inp;
INP_INFO_RLOCK(&V_tcbinfo);
LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_tcbinfo);
}
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
return (inp->inp_socket);
}
struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
return ((struct tcpcb *)inp->inp_ppcb);
}
int
inp_ip_tos_get(const struct inpcb *inp)
{
return (inp->inp_ip_tos);
}
void
inp_ip_tos_set(struct inpcb *inp, int val)
{
inp->inp_ip_tos = val;
}
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp)
{
INP_LOCK_ASSERT(inp);
*laddr = inp->inp_laddr.s_addr;
*faddr = inp->inp_faddr.s_addr;
*lp = inp->inp_lport;
*fp = inp->inp_fport;
}
struct inpcb *
so_sotoinpcb(struct socket *so)
{
return (sotoinpcb(so));
}
struct tcpcb *
so_sototcpcb(struct socket *so)
{
return (sototcpcb(so));
}
#ifdef DDB
static void
db_print_indent(int indent)
{
int i;
for (i = 0; i < indent; i++)
db_printf(" ");
}
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
char faddr_str[48], laddr_str[48];
db_print_indent(indent);
db_printf("%s at %p\n", name, inc);
indent += 2;
#ifdef INET6
if (inc->inc_flags & INC_ISIPV6) {
/* IPv6. */
ip6_sprintf(laddr_str, &inc->inc6_laddr);
ip6_sprintf(faddr_str, &inc->inc6_faddr);
} else {
#endif
/* IPv4. */
inet_ntoa_r(inc->inc_laddr, laddr_str);
inet_ntoa_r(inc->inc_faddr, faddr_str);
#ifdef INET6
}
#endif
db_print_indent(indent);
db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
ntohs(inc->inc_lport));
db_print_indent(indent);
db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
ntohs(inc->inc_fport));
}
static void
db_print_inpflags(int inp_flags)
{
int comma;
comma = 0;
if (inp_flags & INP_RECVOPTS) {
db_printf("%sINP_RECVOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVRETOPTS) {
db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVDSTADDR) {
db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HDRINCL) {
db_printf("%sINP_HDRINCL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HIGHPORT) {
db_printf("%sINP_HIGHPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_LOWPORT) {
db_printf("%sINP_LOWPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ANONPORT) {
db_printf("%sINP_ANONPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVIF) {
db_printf("%sINP_RECVIF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_MTUDISC) {
db_printf("%sINP_MTUDISC", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_FAITH) {
db_printf("%sINP_FAITH", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVTTL) {
db_printf("%sINP_RECVTTL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DONTFRAG) {
db_printf("%sINP_DONTFRAG", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_IPV6_V6ONLY) {
db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_PKTINFO) {
db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPLIMIT) {
db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPOPTS) {
db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_DSTOPTS) {
db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDR) {
db_printf("%sIN6P_RTHDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDRDSTOPTS) {
db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_TCLASS) {
db_printf("%sIN6P_TCLASS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_AUTOFLOWLABEL) {
db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_TIMEWAIT) {
db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ONESBCAST) {
db_printf("%sINP_ONESBCAST", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DROPPED) {
db_printf("%sINP_DROPPED", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_SOCKREF) {
db_printf("%sINP_SOCKREF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RFC2292) {
db_printf("%sIN6P_RFC2292", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_MTU) {
db_printf("IN6P_MTU%s", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_inpvflag(u_char inp_vflag)
{
int comma;
comma = 0;
if (inp_vflag & INP_IPV4) {
db_printf("%sINP_IPV4", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_IPV6) {
db_printf("%sINP_IPV6", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_IPV6PROTO) {
db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{
db_print_indent(indent);
db_printf("%s at %p\n", name, inp);
indent += 2;
db_print_indent(indent);
db_printf("inp_flow: 0x%x\n", inp->inp_flow);
db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
db_print_indent(indent);
db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
db_print_indent(indent);
db_printf("inp_label: %p inp_flags: 0x%x (",
inp->inp_label, inp->inp_flags);
db_print_inpflags(inp->inp_flags);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
inp->inp_vflag);
db_print_inpvflag(inp->inp_vflag);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
db_print_indent(indent);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
db_printf("in6p_options: %p in6p_outputopts: %p "
"in6p_moptions: %p\n", inp->in6p_options,
inp->in6p_outputopts, inp->in6p_moptions);
db_printf("in6p_icmp6filt: %p in6p_cksum %d "
"in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
inp->in6p_hops);
} else
#endif
{
db_printf("inp_ip_tos: %d inp_ip_options: %p "
"inp_ip_moptions: %p\n", inp->inp_ip_tos,
inp->inp_options, inp->inp_moptions);
}
db_print_indent(indent);
db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
(uintmax_t)inp->inp_gencnt);
}
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
struct inpcb *inp;
if (!have_addr) {
db_printf("usage: show inpcb <addr>\n");
return;
}
inp = (struct inpcb *)addr;
db_print_inpcb(inp, "inpcb", 0);
}
#endif
Index: stable/8/sys/netinet/ip_options.c
===================================================================
--- stable/8/sys/netinet/ip_options.c (revision 209276)
+++ stable/8/sys/netinet/ip_options.c (revision 209277)
@@ -1,745 +1,745 @@
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipstealth.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip_icmp.h>
#include <machine/in_cksum.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
&ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
CTLFLAG_RW, &ip_acceptsourceroute, 0,
"Enable accepting source routed IP packets");
int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */
SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW,
&ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)");
static void save_rte(struct mbuf *m, u_char *, struct in_addr);
/*
* Do option processing on a datagram, possibly discarding it if bad options
* are encountered, or forwarding it if source-routed.
*
* The pass argument is used when operating in the IPSTEALTH mode to tell
* what options to process: [LS]SRR (pass 0) or the others (pass 1). The
* reason for as many as two passes is that when doing IPSTEALTH, non-routing
* options should be processed only if the packet is for us.
*
* Returns 1 if packet has been forwarded/freed, 0 if the packet should be
* processed further.
*/
int
ip_dooptions(struct mbuf *m, int pass)
{
struct ip *ip = mtod(m, struct ip *);
u_char *cp;
struct in_ifaddr *ia;
int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
struct in_addr *sin, dst;
uint32_t ntime;
struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* Ignore or reject packets with IP options. */
if (ip_doopts == 0)
return 0;
else if (ip_doopts == 2) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_FILTER_PROHIB;
goto bad;
}
dst = ip->ip_dst;
cp = (u_char *)(ip + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
}
switch (opt) {
default:
break;
/*
* Source routing with record. Find interface with current
* destination address. If none on this machine then drop if
* strictly routed, or do nothing if loosely routed. Record
* interface address and bring up next address component. If
* strictly routed make sure next address is on directly
* accessible net.
*/
case IPOPT_LSRR:
case IPOPT_SSRR:
#ifdef IPSTEALTH
if (V_ipstealth && pass > 0)
break;
#endif
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
ipaddr.sin_addr = ip->ip_dst;
if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr)
== 0) {
if (opt == IPOPT_SSRR) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
if (!ip_dosourceroute)
goto nosourcerouting;
/*
* Loose routing, and not at next destination
* yet; nothing to do except forward.
*/
break;
}
off--; /* 0 origin */
if (off > optlen - (int)sizeof(struct in_addr)) {
/*
* End of source route. Should be for us.
*/
if (!ip_acceptsourceroute)
goto nosourcerouting;
save_rte(m, cp, ip->ip_src);
break;
}
#ifdef IPSTEALTH
if (V_ipstealth)
goto dropit;
#endif
if (!ip_dosourceroute) {
if (V_ipforwarding) {
char buf[16]; /* aaa.bbb.ccc.ddd\0 */
/*
* Acting as a router, so generate
* ICMP
*/
nosourcerouting:
strcpy(buf, inet_ntoa(ip->ip_dst));
log(LOG_WARNING,
"attempted source route from %s to %s\n",
inet_ntoa(ip->ip_src), buf);
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
} else {
/*
* Not acting as a router, so
* silently drop.
*/
#ifdef IPSTEALTH
dropit:
#endif
IPSTAT_INC(ips_cantforward);
m_freem(m);
return (1);
}
}
/*
* locate outgoing interface
*/
(void)memcpy(&ipaddr.sin_addr, cp + off,
sizeof(ipaddr.sin_addr));
if (opt == IPOPT_SSRR) {
#define INA struct in_ifaddr *
#define SA struct sockaddr *
if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL)
- ia = (INA)ifa_ifwithnet((SA)&ipaddr);
+ ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0);
} else
/* XXX MRT 0 for routing */
ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m));
if (ia == NULL) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
ip->ip_dst = ipaddr.sin_addr;
(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
/*
* Let ip_intr's mcast routing check handle mcast pkts
*/
forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
break;
case IPOPT_RR:
#ifdef IPSTEALTH
if (V_ipstealth && pass == 0)
break;
#endif
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
/*
* If no space remains, ignore.
*/
off--; /* 0 origin */
if (off > optlen - (int)sizeof(struct in_addr))
break;
(void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
sizeof(ipaddr.sin_addr));
/*
* Locate outgoing interface; if we're the
* destination, use the incoming interface (should be
* same).
*/
if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
(ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
goto bad;
}
(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
break;
case IPOPT_TS:
#ifdef IPSTEALTH
if (V_ipstealth && pass == 0)
break;
#endif
code = cp - (u_char *)ip;
if (optlen < 4 || optlen > 40) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < 5) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if (off > optlen - (int)sizeof(int32_t)) {
cp[IPOPT_OFFSET + 1] += (1 << 4);
if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
break;
}
off--; /* 0 origin */
sin = (struct in_addr *)(cp + off);
switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
case IPOPT_TS_TSONLY:
break;
case IPOPT_TS_TSANDADDR:
if (off + sizeof(uint32_t) +
sizeof(struct in_addr) > optlen) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
ipaddr.sin_addr = dst;
ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
m->m_pkthdr.rcvif);
if (ia == NULL)
continue;
(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
sizeof(struct in_addr));
ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
case IPOPT_TS_PRESPEC:
if (off + sizeof(uint32_t) +
sizeof(struct in_addr) > optlen) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
(void)memcpy(&ipaddr.sin_addr, sin,
sizeof(struct in_addr));
if (ifa_ifwithaddr((SA)&ipaddr) == NULL)
continue;
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
default:
code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
goto bad;
}
ntime = iptime();
(void)memcpy(cp + off, &ntime, sizeof(uint32_t));
cp[IPOPT_OFFSET] += sizeof(uint32_t);
}
}
if (forward && V_ipforwarding) {
ip_forward(m, 1);
return (1);
}
return (0);
bad:
icmp_error(m, type, code, 0, 0);
IPSTAT_INC(ips_badoptions);
return (1);
}
/*
* Save incoming source route for use in replies, to be picked up later by
* ip_srcroute if the receiver is interested.
*/
static void
save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
{
unsigned olen;
struct ipopt_tag *opts;
opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS,
sizeof(struct ipopt_tag), M_NOWAIT);
if (opts == NULL)
return;
olen = option[IPOPT_OLEN];
if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) {
m_tag_free((struct m_tag *)opts);
return;
}
bcopy(option, opts->ip_srcrt.srcopt, olen);
opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
opts->ip_srcrt.dst = dst;
m_tag_prepend(m, (struct m_tag *)opts);
}
/*
* Retrieve incoming source route for use in replies, in the same form used
* by setsockopt. The first hop is placed before the options, will be
* removed later.
*/
struct mbuf *
ip_srcroute(struct mbuf *m0)
{
struct in_addr *p, *q;
struct mbuf *m;
struct ipopt_tag *opts;
opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL);
if (opts == NULL)
return (NULL);
if (opts->ip_nhops == 0)
return (NULL);
m = m_get(M_DONTWAIT, MT_DATA);
if (m == NULL)
return (NULL);
#define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt))
/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
m->m_len = opts->ip_nhops * sizeof(struct in_addr) +
sizeof(struct in_addr) + OPTSIZ;
/*
* First, save first hop for return route.
*/
p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]);
*(mtod(m, struct in_addr *)) = *p--;
/*
* Copy option fields and padding (nop) to mbuf.
*/
opts->ip_srcrt.nop = IPOPT_NOP;
opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
(void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
&(opts->ip_srcrt.nop), OPTSIZ);
q = (struct in_addr *)(mtod(m, caddr_t) +
sizeof(struct in_addr) + OPTSIZ);
#undef OPTSIZ
/*
* Record return path as an IP source route, reversing the path
* (pointers are now aligned).
*/
while (p >= opts->ip_srcrt.route) {
*q++ = *p--;
}
/*
* Last hop goes to final destination.
*/
*q = opts->ip_srcrt.dst;
m_tag_delete(m0, (struct m_tag *)opts);
return (m);
}
/*
* Strip out IP options, at higher level protocol in the kernel. Second
* argument is buffer to which options will be moved, and return value is
* their length.
*
* XXX should be deleted; last arg currently ignored.
*/
void
ip_stripoptions(struct mbuf *m, struct mbuf *mopt)
{
int i;
struct ip *ip = mtod(m, struct ip *);
caddr_t opts;
int olen;
olen = (ip->ip_hl << 2) - sizeof (struct ip);
opts = (caddr_t)(ip + 1);
i = m->m_len - (sizeof (struct ip) + olen);
bcopy(opts + olen, opts, (unsigned)i);
m->m_len -= olen;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len -= olen;
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(struct ip) >> 2;
}
/*
* Insert IP options into preformed packet. Adjust IP destination as
* required for IP source routing, as indicated by a non-zero in_addr at the
* start of the options.
*
* XXX This routine assumes that the packet has no options in place.
*/
struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
if (optlen + ip->ip_len > IP_MAXPACKET) {
*phlen = 0;
return (m); /* XXX should fail */
}
if (p->ipopt_dst.s_addr)
ip->ip_dst = p->ipopt_dst;
if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n == NULL) {
*phlen = 0;
return (m);
}
M_MOVE_PKTHDR(n, m);
n->m_pkthdr.rcvif = NULL;
n->m_pkthdr.len += optlen;
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
m = n;
m->m_len = optlen + sizeof(struct ip);
m->m_data += max_linkhdr;
bcopy(ip, mtod(m, void *), sizeof(struct ip));
} else {
m->m_data -= optlen;
m->m_len += optlen;
m->m_pkthdr.len += optlen;
bcopy(ip, mtod(m, void *), sizeof(struct ip));
}
ip = mtod(m, struct ip *);
bcopy(p->ipopt_list, ip + 1, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_v = IPVERSION;
ip->ip_hl = *phlen >> 2;
ip->ip_len += optlen;
return (m);
}
/*
* Copy options from ip to jp, omitting those not copied during
* fragmentation.
*/
int
ip_optcopy(struct ip *ip, struct ip *jp)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ip + 1);
dp = (u_char *)(jp + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
("ip_optcopy: malformed ipv4 option"));
optlen = cp[IPOPT_OLEN];
KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
("ip_optcopy: malformed ipv4 option"));
/* Bogus lengths should have been caught by ip_dooptions. */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
bcopy(cp, dp, optlen);
dp += optlen;
}
}
for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
*dp++ = IPOPT_EOL;
return (optlen);
}
/*
* Set up IP options in pcb for insertion in output packets. Store in mbuf
* with pointer in pcbopt, adding pseudo-option with destination address if
* source routed.
*/
int
ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
{
int cnt, optlen;
u_char *cp;
struct mbuf **pcbopt;
u_char opt;
INP_WLOCK_ASSERT(inp);
pcbopt = &inp->inp_options;
/* turn off any old options */
if (*pcbopt)
(void)m_free(*pcbopt);
*pcbopt = 0;
if (m == NULL || m->m_len == 0) {
/*
* Only turning off any previous options.
*/
if (m != NULL)
(void)m_free(m);
return (0);
}
if (m->m_len % sizeof(int32_t))
goto bad;
/*
* IP first-hop destination address will be stored before actual
* options; move other options back and clear it when none present.
*/
if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
goto bad;
cnt = m->m_len;
m->m_len += sizeof(struct in_addr);
cp = mtod(m, u_char *) + sizeof(struct in_addr);
bcopy(mtod(m, void *), cp, (unsigned)cnt);
bzero(mtod(m, void *), sizeof(struct in_addr));
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp))
goto bad;
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
goto bad;
}
switch (opt) {
default:
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/*
* User process specifies route as:
*
* ->A->B->C->D
*
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear
* in actual IP option, but is stored before the
* options.
*/
/* XXX-BZ PRIV_NETINET_SETHDROPTS? */
if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
goto bad;
m->m_len -= sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
optlen -= sizeof(struct in_addr);
cp[IPOPT_OLEN] = optlen;
/*
* Move first hop before start of options.
*/
bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
sizeof(struct in_addr));
/*
* Then copy rest of options back
* to close up the deleted entry.
*/
bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
&cp[IPOPT_OFFSET+1],
(unsigned)cnt - (IPOPT_MINOFF - 1));
break;
}
}
if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
*pcbopt = m;
return (0);
bad:
(void)m_free(m);
return (EINVAL);
}
/*
* Check for the presence of the IP Router Alert option [RFC2113]
* in the header of an IPv4 datagram.
*
* This call is not intended for use from the forwarding path; it is here
* so that protocol domains may check for the presence of the option.
* Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
* option does not have much relevance to the implementation, though this
* may change in future.
* Router alert options SHOULD be passed if running in IPSTEALTH mode and
* we are not the endpoint.
* Length checks on individual options should already have been peformed
* by ip_dooptions() therefore they are folded under INVARIANTS here.
*
* Return zero if not present or options are invalid, non-zero if present.
*/
int
ip_checkrouteralert(struct mbuf *m)
{
struct ip *ip = mtod(m, struct ip *);
u_char *cp;
int opt, optlen, cnt, found_ra;
found_ra = 0;
cp = (u_char *)(ip + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
#ifdef INVARIANTS
if (cnt < IPOPT_OLEN + sizeof(*cp))
break;
#endif
optlen = cp[IPOPT_OLEN];
#ifdef INVARIANTS
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
break;
#endif
}
switch (opt) {
case IPOPT_RA:
#ifdef INVARIANTS
if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
(*((uint16_t *)&cp[IPOPT_OFFSET]) != 0))
break;
else
#endif
found_ra = 1;
break;
default:
break;
}
}
return (found_ra);
}
Index: stable/8/sys/netinet/ip_output.c
===================================================================
--- stable/8/sys/netinet/ip_output.c (revision 209276)
+++ stable/8/sys/netinet/ip_output.c (revision 209277)
@@ -1,1278 +1,1278 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_route.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/netisr.h>
#include <net/pfil.h>
#include <net/route.h>
#include <net/flowtable.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef SCTP
#include <netinet/sctp.h>
#include <netinet/sctp_crc32.h>
#endif
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#include <netipsec/ipsec.h>
#endif /* IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
x, (ntohl(a.s_addr)>>24)&0xFF,\
(ntohl(a.s_addr)>>16)&0xFF,\
(ntohl(a.s_addr)>>8)&0xFF,\
(ntohl(a.s_addr))&0xFF, y);
VNET_DEFINE(u_short, ip_id);
#ifdef MBUF_STRESS_TEST
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
static void ip_mloopback
(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
extern int in_mcast_loop;
extern struct protosw inetsw[];
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
* In the IP forwarding case, the packet will arrive with options already
* inserted, so must have a NULL opt pointer.
*/
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m0;
int hlen = sizeof (struct ip);
int mtu;
int len, error = 0;
int nortfree = 0;
struct sockaddr_in *dst = NULL; /* keep compiler happy */
struct in_ifaddr *ia = NULL;
int isbroadcast, sw_csum;
struct route iproute;
struct rtentry *rte; /* cache for ro->ro_rt */
struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag = NULL;
#endif
#ifdef IPSEC
int no_route_but_check_spd = 0;
#endif
M_ASSERTPKTHDR(m);
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
M_SETFIB(m, inp->inp_inc.inc_fibnum);
if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
m->m_pkthdr.flowid = inp->inp_flowid;
m->m_flags |= M_FLOWID;
}
}
if (ro == NULL) {
ro = &iproute;
bzero(ro, sizeof (*ro));
#ifdef FLOWTABLE
{
struct flentry *fle;
/*
* The flow table returns route entries valid for up to 30
* seconds; we rely on the remainder of ip_output() taking no
* longer than that long for the stability of ro_rt. The
* flow ID assignment must have happened before this point.
*/
if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
flow_to_route(fle, ro);
nortfree = 1;
}
}
#endif
}
if (opt) {
len = 0;
m = ip_insertoptions(m, opt, &len);
if (len != 0)
hlen = len;
}
ip = mtod(m, struct ip *);
/*
* Fill in IP header. If we are not allowing fragmentation,
* then the ip_id field is meaningless, but we don't set it
* to zero. Doing so causes various problems when devices along
* the path (routers, load balancers, firewalls, etc.) illegally
* disable DF on our packet. Note that a 16-bit counter
* will wrap around in less than 10 seconds at 100 Mbit/s on a
* medium with MTU 1500. See Steven M. Bellovin, "A Technique
* for Counting NATted Hosts", Proc. IMW'02, available at
* <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_hl = hlen >> 2;
ip->ip_id = ip_newid();
IPSTAT_INC(ips_localout);
} else {
hlen = ip->ip_hl << 2;
}
dst = (struct sockaddr_in *)&ro->ro_dst;
again:
/*
* If there is a cached route,
* check that it is to the same destination
* and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing the
* cache with IPv6.
*/
rte = ro->ro_rt;
if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
if (!nortfree)
RTFREE(rte);
rte = ro->ro_rt = (struct rtentry *)NULL;
ro->ro_lle = (struct llentry *)NULL;
}
#ifdef IPFIREWALL_FORWARD
if (rte == NULL && fwd_tag == NULL) {
#else
if (rte == NULL) {
#endif
bzero(dst, sizeof(*dst));
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = ip->ip_dst;
}
/*
* If routing to interface only, short circuit routing lookup.
* The use of an all-ones broadcast address implies this; an
* interface is specified by the broadcast address of an interface,
* or the destination address of a ptp interface.
*/
if (flags & IP_SENDONES) {
if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
(ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
ip->ip_dst.s_addr = INADDR_BROADCAST;
dst->sin_addr = ip->ip_dst;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = 1;
} else if (flags & IP_ROUTETOIF) {
if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
- (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
+ (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = in_broadcast(dst->sin_addr, ifp);
} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
imo != NULL && imo->imo_multicast_ifp != NULL) {
/*
* Bypass the normal routing lookup for multicast
* packets if the interface is specified.
*/
ifp = imo->imo_multicast_ifp;
IFP_TO_IA(ifp, ia);
isbroadcast = 0; /* fool gcc */
} else {
/*
* We want to do any cloning requested by the link layer,
* as this is probably required in all cases for correct
* operation (as it is for ARP).
*/
if (rte == NULL) {
#ifdef RADIX_MPATH
rtalloc_mpath_fib(ro,
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#else
in_rtalloc_ign(ro, 0,
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#endif
rte = ro->ro_rt;
}
if (rte == NULL ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp)) {
#ifdef IPSEC
/*
* There is no route for this packet, but it is
* possible that a matching SPD entry exists.
*/
no_route_but_check_spd = 1;
mtu = 0; /* Silence GCC warning. */
goto sendit;
#endif
IPSTAT_INC(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
ia = ifatoia(rte->rt_ifa);
ifa_ref(&ia->ia_ifa);
ifp = rte->rt_ifp;
rte->rt_rmx.rmx_pksent++;
if (rte->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)rte->rt_gateway;
if (rte->rt_flags & RTF_HOST)
isbroadcast = (rte->rt_flags & RTF_BROADCAST);
else
isbroadcast = in_broadcast(dst->sin_addr, ifp);
}
/*
* Calculate MTU. If we have a route that is up, use that,
* otherwise use the interface's MTU.
*/
if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
/*
* This case can happen if the user changed the MTU
* of an interface after enabling IP on it. Because
* most netifs don't keep track of routes pointing to
* them, there is no way for one to update all its
* routes when the MTU is changed.
*/
if (rte->rt_rmx.rmx_mtu > ifp->if_mtu)
rte->rt_rmx.rmx_mtu = ifp->if_mtu;
mtu = rte->rt_rmx.rmx_mtu;
} else {
mtu = ifp->if_mtu;
}
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
m->m_flags |= M_MCAST;
/*
* IP destination address is multicast. Make sure "dst"
* still points to the address in "ro". (It may have been
* changed to point to a gateway address, above.)
*/
dst = (struct sockaddr_in *)&ro->ro_dst;
/*
* See if the caller provided any multicast options
*/
if (imo != NULL) {
ip->ip_ttl = imo->imo_multicast_ttl;
if (imo->imo_multicast_vif != -1)
ip->ip_src.s_addr =
ip_mcast_src ?
ip_mcast_src(imo->imo_multicast_vif) :
INADDR_ANY;
} else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* Confirm that the outgoing interface supports multicast.
*/
if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
/* Interface may have no addresses. */
if (ia != NULL)
ip->ip_src = IA_SIN(ia)->sin_addr;
}
if ((imo == NULL && in_mcast_loop) ||
(imo && imo->imo_multicast_loop)) {
/*
* Loop back multicast datagram if not expressly
* forbidden to do so, even if we are not a member
* of the group; ip_input() will filter it later,
* thus deferring a hash lookup and mutex acquisition
* at the expense of a cheap copy using m_copym().
*/
ip_mloopback(ifp, m, dst, hlen);
} else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
/*
* If rsvp daemon is not running, do not
* set ip_moptions. This ensures that the packet
* is multicast and not just sent down one link
* as prescribed by rsvpd.
*/
if (!V_rsvp_on)
imo = NULL;
if (ip_mforward &&
ip_mforward(ip, ifp, m, imo) != 0) {
m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy. ip_input() will drop the copy if
* this host does not belong to the destination group on
* the loopback interface.
*/
if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
m_freem(m);
goto done;
}
goto sendit;
}
/*
* If the source address is not specified yet, use the address
* of the outoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
/* Interface may have no addresses. */
if (ia != NULL) {
ip->ip_src = IA_SIN(ia)->sin_addr;
}
}
/*
* Verify that we have any chance at all of being able to queue the
* packet or packet fragments, unless ALTQ is enabled on the given
* interface in which case packetdrop should be done by queueing.
*/
#ifdef ALTQ
if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
ifp->if_snd.ifq_maxlen))
#else
if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
ifp->if_snd.ifq_maxlen)
#endif /* ALTQ */
{
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
goto bad;
}
/*
* Look for broadcast address and
* verify user is allowed to send
* such a packet.
*/
if (isbroadcast) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
error = EACCES;
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if (ip->ip_len > mtu) {
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else {
m->m_flags &= ~M_BCAST;
}
sendit:
#ifdef IPSEC
switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) {
case 1:
goto bad;
case -1:
goto done;
case 0:
default:
break; /* Continue with packet processing. */
}
/*
* Check if there was a route for this packet; return error if not.
*/
if (no_route_but_check_spd) {
IPSTAT_INC(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
/* Update variables that are affected by ipsec4_output(). */
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
#endif /* IPSEC */
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&V_inet_pfil_hook))
goto passout;
/* Run through list of hooks for output packets. */
odst.s_addr = ip->ip_dst.s_addr;
error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
if (error != 0 || m == NULL)
goto done;
ip = mtod(m, struct ip *);
/* See if destination IP address was changed by packet filter. */
if (odst.s_addr != ip->ip_dst.s_addr) {
m->m_flags |= M_SKIP_FIREWALL;
/* If destination is now ourself drop to ip_input(). */
if (in_localip(ip->ip_dst)) {
m->m_flags |= M_FASTFWD_OURS;
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP)
m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
#endif
error = netisr_queue(NETISR_IP, m);
goto done;
} else
goto again; /* Redo the routing table lookup. */
}
#ifdef IPFIREWALL_FORWARD
/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
if (m->m_flags & M_FASTFWD_OURS) {
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP)
m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
#endif
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
error = netisr_queue(NETISR_IP, m);
goto done;
}
/* Or forward to some other address? */
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (fwd_tag) {
dst = (struct sockaddr_in *)&ro->ro_dst;
bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
m->m_flags |= M_SKIP_FIREWALL;
m_tag_delete(m, fwd_tag);
goto again;
}
#endif /* IPFIREWALL_FORWARD */
passout:
/* 127/8 must not appear on wire - RFC1122. */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
IPSTAT_INC(ips_badaddr);
error = EADDRNOTAVAIL;
goto bad;
}
}
m->m_pkthdr.csum_flags |= CSUM_IP;
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
if (sw_csum & CSUM_DELAY_DATA) {
in_delayed_cksum(m);
sw_csum &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP) {
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
sw_csum &= ~CSUM_SCTP;
}
#endif
m->m_pkthdr.csum_flags &= ifp->if_hwassist;
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
*/
if (ip->ip_len <= mtu ||
(m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m, hlen);
/*
* Record statistics for this interface address.
* With CSUM_TSO the byte/packet count will be slightly
* incorrect because we count the IP+TCP headers only
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
if (m->m_pkthdr.csum_flags & CSUM_TSO)
ia->ia_ifa.if_opackets +=
m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
else
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
#ifdef MBUF_STRESS_TEST
if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
#endif
/*
* Reset layer specific mbuf flags
* to avoid confusing lower layers.
*/
m->m_flags &= ~(M_PROTOFLAGS);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro);
goto done;
}
/* Balk when DF bit is set or the interface didn't support TSO. */
if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
}
/*
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
if (error)
goto bad;
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
/* Record statistics for this interface address. */
if (ia != NULL) {
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
/*
* Reset layer specific mbuf flags
* to avoid confusing upper layers.
*/
m->m_flags &= ~(M_PROTOFLAGS);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro);
} else
m_freem(m);
}
if (error == 0)
IPSTAT_INC(ips_fragmented);
done:
if (ro == &iproute && ro->ro_rt && !nortfree) {
RTFREE(ro->ro_rt);
}
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (error);
bad:
m_freem(m);
goto done;
}
/*
* Create a chain of fragments which fit the given mtu. m_frag points to the
* mbuf to be fragmented; on return it points to the chain with the fragments.
* Return 0 if no error. If error, m_frag may contain a partially built
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
* sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum)
{
int error = 0;
int hlen = ip->ip_hl << 2;
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
int off;
struct mbuf *m0 = *m_frag; /* the original packet */
int firstlen;
struct mbuf **mnext;
int nfrags;
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
IPSTAT_INC(ips_cantfrag);
return EMSGSIZE;
}
/*
* Must be able to put at least 8 bytes per fragment.
*/
if (len < 8)
return EMSGSIZE;
/*
* If the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
#endif
if (len > PAGE_SIZE) {
/*
* Fragment large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
* could have different page sizes, and also mtu could
* be less than the receiver's page size ?
*/
int newlen;
struct mbuf *m;
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
off += m->m_len;
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & mtu;
if ((newlen + sizeof (struct ip)) > mtu) {
/* we failed, go back the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
len = newlen;
} else {
off = hlen + len;
}
firstlen = off - hlen;
mnext = &m0->m_nextpkt; /* pointer to next packet */
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
* Here, m0 is the original packet, m is the fragment being created.
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
* goes into an additional mbuf chain returned by m_copym().
*/
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
/* XXX do we need to add ip->ip_off below ? */
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= ip->ip_len) { /* last fragment */
len = ip->ip_len - off;
m->m_flags |= M_LASTFRAG;
} else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copym(m0, off, len, M_DONTWAIT);
if (m->m_next == NULL) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
IPSTAT_INC(ips_odropped);
goto done;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = NULL;
#ifdef MAC
mac_netinet_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
}
IPSTAT_ADD(ips_ofragments, nfrags);
/* set first marker for fragment chain */
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
m_adj(m0, hlen + firstlen - ip->ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m0, hlen);
done:
*m_frag = m0;
return error;
}
void
in_delayed_cksum(struct mbuf *m)
{
struct ip *ip;
u_short csum, offset;
ip = mtod(m, struct ip *);
offset = ip->ip_hl << 2 ;
csum = in_cksum_skip(m, ip->ip_len, offset);
if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
csum = 0xffff;
offset += m->m_pkthdr.csum_data; /* checksum offset */
if (offset + sizeof(u_short) > m->m_len) {
printf("delayed m_pullup, m->len: %d off: %d p: %d\n",
m->m_len, offset, ip->ip_p);
/*
* XXX
* this shouldn't happen, but if it does, the
* correct behavior may be to insert the checksum
* in the appropriate next mbuf in the chain.
*/
return;
}
*(u_short *)(m->m_data + offset) = csum;
}
/*
* IP socket option processing.
*/
int
ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
int error, optval;
error = optval = 0;
if (sopt->sopt_level != IPPROTO_IP) {
if ((sopt->sopt_level == SOL_SOCKET) &&
(sopt->sopt_name == SO_SETFIB)) {
inp->inp_inc.inc_fibnum = so->so_fibnum;
return (0);
}
return (EINVAL);
}
switch (sopt->sopt_dir) {
case SOPT_SET:
switch (sopt->sopt_name) {
case IP_OPTIONS:
#ifdef notyet
case IP_RETOPTS:
#endif
{
struct mbuf *m;
if (sopt->sopt_valsize > MLEN) {
error = EMSGSIZE;
break;
}
MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
break;
}
m->m_len = sopt->sopt_valsize;
error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
m->m_len);
if (error) {
m_free(m);
break;
}
INP_WLOCK(inp);
error = ip_pcbopts(inp, sopt->sopt_name, m);
INP_WUNLOCK(inp);
return (error);
}
case IP_BINDANY:
if (sopt->sopt_td != NULL) {
error = priv_check(sopt->sopt_td,
PRIV_NETINET_BINDANY);
if (error)
break;
}
/* FALLTHROUGH */
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
switch (sopt->sopt_name) {
case IP_TOS:
inp->inp_ip_tos = optval;
break;
case IP_TTL:
inp->inp_ip_ttl = optval;
break;
case IP_MINTTL:
if (optval >= 0 && optval <= MAXTTL)
inp->inp_ip_minttl = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) do { \
INP_WLOCK(inp); \
if (optval) \
inp->inp_flags |= bit; \
else \
inp->inp_flags &= ~bit; \
INP_WUNLOCK(inp); \
} while (0)
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_FAITH:
OPTSET(INP_FAITH);
break;
case IP_ONESBCAST:
OPTSET(INP_ONESBCAST);
break;
case IP_DONTFRAG:
OPTSET(INP_DONTFRAG);
break;
case IP_BINDANY:
OPTSET(INP_BINDANY);
break;
}
break;
#undef OPTSET
/*
* Multicast socket options are processed by the in_mcast
* module.
*/
case IP_MULTICAST_IF:
case IP_MULTICAST_VIF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
case IP_ADD_SOURCE_MEMBERSHIP:
case IP_DROP_SOURCE_MEMBERSHIP:
case IP_BLOCK_SOURCE:
case IP_UNBLOCK_SOURCE:
case IP_MSFILTER:
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = inp_setmoptions(inp, sopt);
break;
case IP_PORTRANGE:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
INP_WLOCK(inp);
switch (optval) {
case IP_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags &= ~(INP_HIGHPORT);
break;
case IP_PORTRANGE_HIGH:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags |= INP_HIGHPORT;
break;
case IP_PORTRANGE_LOW:
inp->inp_flags &= ~(INP_HIGHPORT);
inp->inp_flags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
INP_WUNLOCK(inp);
break;
#ifdef IPSEC
case IP_IPSEC_POLICY:
{
caddr_t req;
struct mbuf *m;
if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
break;
if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
break;
req = mtod(m, caddr_t);
error = ipsec_set_policy(inp, sopt->sopt_name, req,
m->m_len, (sopt->sopt_td != NULL) ?
sopt->sopt_td->td_ucred : NULL);
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case SOPT_GET:
switch (sopt->sopt_name) {
case IP_OPTIONS:
case IP_RETOPTS:
if (inp->inp_options)
error = sooptcopyout(sopt,
mtod(inp->inp_options,
char *),
inp->inp_options->m_len);
else
sopt->sopt_valsize = 0;
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
case IP_PORTRANGE:
case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
switch (sopt->sopt_name) {
case IP_TOS:
optval = inp->inp_ip_tos;
break;
case IP_TTL:
optval = inp->inp_ip_ttl;
break;
case IP_MINTTL:
optval = inp->inp_ip_minttl;
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_PORTRANGE:
if (inp->inp_flags & INP_HIGHPORT)
optval = IP_PORTRANGE_HIGH;
else if (inp->inp_flags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = 0;
break;
case IP_FAITH:
optval = OPTBIT(INP_FAITH);
break;
case IP_ONESBCAST:
optval = OPTBIT(INP_ONESBCAST);
break;
case IP_DONTFRAG:
optval = OPTBIT(INP_DONTFRAG);
break;
}
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
/*
* Multicast socket options are processed by the in_mcast
* module.
*/
case IP_MULTICAST_IF:
case IP_MULTICAST_VIF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_MSFILTER:
error = inp_getmoptions(inp, sopt);
break;
#ifdef IPSEC
case IP_IPSEC_POLICY:
{
struct mbuf *m = NULL;
caddr_t req = NULL;
size_t len = 0;
if (m != 0) {
req = mtod(m, caddr_t);
len = m->m_len;
}
error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
if (error == 0)
error = soopt_mcopyout(sopt, m); /* XXX */
if (error == 0)
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be a loopback interface -- evil, but easier than
* replicating that code here.
*/
static void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
int hlen)
{
register struct ip *ip;
struct mbuf *copym;
/*
* Make a deep copy of the packet because we're going to
* modify the pack in order to generate checksums.
*/
copym = m_dup(m, M_DONTWAIT);
if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
copym = m_pullup(copym, hlen);
if (copym != NULL) {
/* If needed, compute the checksum and mark it as valid. */
if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(copym);
copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
copym->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
copym->m_pkthdr.csum_data = 0xffff;
}
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, hlen);
#if 1 /* XXX */
if (dst->sin_family != AF_INET) {
printf("ip_mloopback: bad address family %d\n",
dst->sin_family);
dst->sin_family = AF_INET;
}
#endif
if_simloop(ifp, copym, dst->sin_family, 0);
}
}
Index: stable/8/sys
===================================================================
--- stable/8/sys (revision 209276)
+++ stable/8/sys (revision 209277)
Property changes on: stable/8/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys:r208553

File Metadata

Mime Type
text/x-c
Expires
Sun, Mar 29, 12:56 AM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
30450978
Default Alt Text
(350 KB)

Event Timeline