Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F149779421
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
350 KB
Referenced Files
None
Subscribers
None
View Options
This file is larger than 256 KB, so syntax highlighting was skipped.
Index: stable/8/sys/amd64/include/xen
===================================================================
--- stable/8/sys/amd64/include/xen (revision 209276)
+++ stable/8/sys/amd64/include/xen (revision 209277)
Property changes on: stable/8/sys/amd64/include/xen
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/amd64/include/xen:r208553
Index: stable/8/sys/cddl/contrib/opensolaris
===================================================================
--- stable/8/sys/cddl/contrib/opensolaris (revision 209276)
+++ stable/8/sys/cddl/contrib/opensolaris (revision 209277)
Property changes on: stable/8/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/cddl/contrib/opensolaris:r208553
Index: stable/8/sys/contrib/dev/acpica
===================================================================
--- stable/8/sys/contrib/dev/acpica (revision 209276)
+++ stable/8/sys/contrib/dev/acpica (revision 209277)
Property changes on: stable/8/sys/contrib/dev/acpica
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/dev/acpica:r208553
Index: stable/8/sys/contrib/pf
===================================================================
--- stable/8/sys/contrib/pf (revision 209276)
+++ stable/8/sys/contrib/pf (revision 209277)
Property changes on: stable/8/sys/contrib/pf
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/contrib/pf:r208553
Index: stable/8/sys/dev/xen/xenpci
===================================================================
--- stable/8/sys/dev/xen/xenpci (revision 209276)
+++ stable/8/sys/dev/xen/xenpci (revision 209277)
Property changes on: stable/8/sys/dev/xen/xenpci
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/dev/xen/xenpci:r208553
Index: stable/8/sys/geom/sched
===================================================================
--- stable/8/sys/geom/sched (revision 209276)
+++ stable/8/sys/geom/sched (revision 209277)
Property changes on: stable/8/sys/geom/sched
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys/geom/sched:r208553
Index: stable/8/sys/net/if.c
===================================================================
--- stable/8/sys/net/if.c (revision 209276)
+++ stable/8/sys/net/if.c (revision 209277)
@@ -1,3484 +1,3485 @@
/*-
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_carp.h"
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/bus.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/refcount.h>
#include <sys/module.h>
#include <sys/rwlock.h>
#include <sys/sockio.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/domain.h>
#include <sys/jail.h>
#include <machine/stdarg.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/radix.h>
#include <net/route.h>
#include <net/vnet.h>
#if defined(INET) || defined(INET6)
/*XXX*/
#include <netinet/in.h>
#include <netinet/in_var.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#endif
#endif
#ifdef INET
#include <netinet/if_ether.h>
#endif
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#endif
#include <security/mac/mac_framework.h>
struct ifindex_entry {
struct ifnet *ife_ifnet;
};
static int slowtimo_started;
SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
/* Log link state change events */
static int log_link_state_change = 1;
SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
&log_link_state_change, 0,
"log interface link state change events");
/* Interface description */
static unsigned int ifdescr_maxlen = 1024;
SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
&ifdescr_maxlen, 0,
"administrative maximum length for interface description");
MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
/* global sx for non-critical path ifdescr */
static struct sx ifdescr_sx;
SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
void (*bstp_linkstate_p)(struct ifnet *ifp, int state);
void (*ng_ether_link_state_p)(struct ifnet *ifp, int state);
void (*lagg_linkstate_p)(struct ifnet *ifp, int state);
struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
/*
* XXX: Style; these should be sorted alphabetically, and unprototyped
* static functions should be prototyped. Currently they are sorted by
* declaration order.
*/
static void if_attachdomain(void *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, caddr_t);
static void if_freemulti(struct ifmultiaddr *);
static void if_init(void *);
static void if_grow(void);
static void if_check(void *);
static void if_route(struct ifnet *, int flag, int fam);
static int if_setflag(struct ifnet *, int, int, int *, int);
static void if_slowtimo(void *);
static int if_transmit(struct ifnet *ifp, struct mbuf *m);
static void if_unroute(struct ifnet *, int flag, int fam);
static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int if_rtdel(struct radix_node *, void *);
static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void do_link_state_change(void *, int);
static int if_getgroup(struct ifgroupreq *, struct ifnet *);
static int if_getgroupmembers(struct ifgroupreq *);
static void if_delgroups(struct ifnet *);
static void if_attach_internal(struct ifnet *, int);
static void if_detach_internal(struct ifnet *, int);
#ifdef INET6
/*
* XXX: declare here to avoid to include many inet6 related files..
* should be more generalized?
*/
extern void nd6_setmtu(struct ifnet *);
#endif
VNET_DEFINE(int, if_index);
int ifqmaxlen = IFQ_MAXLEN;
VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */
VNET_DEFINE(struct ifgrouphead, ifg_head);
static VNET_DEFINE(int, if_indexlim) = 8;
/* Table of ifnet by index. */
static VNET_DEFINE(struct ifindex_entry *, ifindex_table);
#define V_if_indexlim VNET(if_indexlim)
#define V_ifindex_table VNET(ifindex_table)
/*
* The global network interface list (V_ifnet) and related state (such as
* if_index, if_indexlim, and ifindex_table) are protected by an sxlock and
* an rwlock. Either may be acquired shared to stablize the list, but both
* must be acquired writable to modify the list. This model allows us to
* both stablize the interface list during interrupt thread processing, but
* also to stablize it over long-running ioctls, without introducing priority
* inversions and deadlocks.
*/
struct rwlock ifnet_rwlock;
struct sx ifnet_sxlock;
/*
* The allocation of network interfaces is a rather non-atomic affair; we
* need to select an index before we are ready to expose the interface for
* use, so will use this pointer value to indicate reservation.
*/
#define IFNET_HOLD (void *)(uintptr_t)(-1)
static if_com_alloc_t *if_com_alloc[256];
static if_com_free_t *if_com_free[256];
/*
* System initialization
*/
SYSINIT(interface_check, SI_SUB_PROTO_IF, SI_ORDER_FIRST, if_check, NULL);
MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
struct ifnet *
ifnet_byindex_locked(u_short idx)
{
if (idx > V_if_index)
return (NULL);
if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD)
return (NULL);
return (V_ifindex_table[idx].ife_ifnet);
}
struct ifnet *
ifnet_byindex(u_short idx)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
ifp = ifnet_byindex_locked(idx);
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
struct ifnet *
ifnet_byindex_ref(u_short idx)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
ifp = ifnet_byindex_locked(idx);
if (ifp == NULL || (ifp->if_flags & IFF_DYING)) {
IFNET_RUNLOCK_NOSLEEP();
return (NULL);
}
if_ref(ifp);
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
/*
* Allocate an ifindex array entry; return 0 on success or an error on
* failure.
*/
static int
ifindex_alloc_locked(u_short *idxp)
{
u_short idx;
IFNET_WLOCK_ASSERT();
/*
* Try to find an empty slot below V_if_index. If we fail, take the
* next slot.
*/
for (idx = 1; idx <= V_if_index; idx++) {
if (V_ifindex_table[idx].ife_ifnet == NULL)
break;
}
/* Catch if_index overflow. */
if (idx < 1)
return (ENOSPC);
if (idx > V_if_index)
V_if_index = idx;
if (V_if_index >= V_if_indexlim)
if_grow();
*idxp = idx;
return (0);
}
static void
ifindex_free_locked(u_short idx)
{
IFNET_WLOCK_ASSERT();
V_ifindex_table[idx].ife_ifnet = NULL;
while (V_if_index > 0 &&
V_ifindex_table[V_if_index].ife_ifnet == NULL)
V_if_index--;
}
static void
ifindex_free(u_short idx)
{
IFNET_WLOCK();
ifindex_free_locked(idx);
IFNET_WUNLOCK();
}
static void
ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp)
{
IFNET_WLOCK_ASSERT();
V_ifindex_table[idx].ife_ifnet = ifp;
}
static void
ifnet_setbyindex(u_short idx, struct ifnet *ifp)
{
IFNET_WLOCK();
ifnet_setbyindex_locked(idx, ifp);
IFNET_WUNLOCK();
}
struct ifaddr *
ifaddr_byindex(u_short idx)
{
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
ifa = ifnet_byindex_locked(idx)->if_addr;
if (ifa != NULL)
ifa_ref(ifa);
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
static void
vnet_if_init(const void *unused __unused)
{
TAILQ_INIT(&V_ifnet);
TAILQ_INIT(&V_ifg_head);
if_grow(); /* create initial table */
vnet_if_clone_init();
}
VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_init,
NULL);
/* ARGSUSED*/
static void
if_init(void *dummy __unused)
{
IFNET_LOCK_INIT();
if_clone_init();
}
SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL);
#ifdef VIMAGE
static void
vnet_if_uninit(const void *unused __unused)
{
VNET_ASSERT(TAILQ_EMPTY(&V_ifnet));
VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head));
free((caddr_t)V_ifindex_table, M_IFNET);
}
VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
vnet_if_uninit, NULL);
#endif
static void
if_grow(void)
{
u_int n;
struct ifindex_entry *e;
V_if_indexlim <<= 1;
n = V_if_indexlim * sizeof(*e);
e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
if (V_ifindex_table != NULL) {
memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
free((caddr_t)V_ifindex_table, M_IFNET);
}
V_ifindex_table = e;
}
static void
if_check(void *dummy __unused)
{
/*
* If at least one interface added during boot uses
* if_watchdog then start the timer.
*/
if (slowtimo_started)
if_slowtimo(0);
}
/*
* Allocate a struct ifnet and an index for an interface. A layer 2
* common structure will also be allocated if an allocation routine is
* registered for the passed type.
*/
struct ifnet *
if_alloc(u_char type)
{
struct ifnet *ifp;
u_short idx;
ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
IFNET_WLOCK();
if (ifindex_alloc_locked(&idx) != 0) {
IFNET_WUNLOCK();
free(ifp, M_IFNET);
return (NULL);
}
ifnet_setbyindex_locked(idx, IFNET_HOLD);
IFNET_WUNLOCK();
ifp->if_index = idx;
ifp->if_type = type;
ifp->if_alloctype = type;
if (if_com_alloc[type] != NULL) {
ifp->if_l2com = if_com_alloc[type](type, ifp);
if (ifp->if_l2com == NULL) {
free(ifp, M_IFNET);
ifindex_free(idx);
return (NULL);
}
}
IF_ADDR_LOCK_INIT(ifp);
TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
ifp->if_afdata_initialized = 0;
IF_AFDATA_LOCK_INIT(ifp);
TAILQ_INIT(&ifp->if_addrhead);
TAILQ_INIT(&ifp->if_prefixhead);
TAILQ_INIT(&ifp->if_multiaddrs);
TAILQ_INIT(&ifp->if_groups);
#ifdef MAC
mac_ifnet_init(ifp);
#endif
ifq_init(&ifp->if_snd, ifp);
refcount_init(&ifp->if_refcount, 1); /* Index reference. */
ifnet_setbyindex(ifp->if_index, ifp);
return (ifp);
}
/*
* Do the actual work of freeing a struct ifnet, associated index, and layer
* 2 common structure. This call is made when the last reference to an
* interface is released.
*/
static void
if_free_internal(struct ifnet *ifp)
{
KASSERT((ifp->if_flags & IFF_DYING),
("if_free_internal: interface not dying"));
IFNET_WLOCK();
KASSERT(ifp == ifnet_byindex_locked(ifp->if_index),
("%s: freeing unallocated ifnet", ifp->if_xname));
ifindex_free_locked(ifp->if_index);
IFNET_WUNLOCK();
if (if_com_free[ifp->if_alloctype] != NULL)
if_com_free[ifp->if_alloctype](ifp->if_l2com,
ifp->if_alloctype);
#ifdef MAC
mac_ifnet_destroy(ifp);
#endif /* MAC */
if (ifp->if_description != NULL)
free(ifp->if_description, M_IFDESCR);
IF_AFDATA_DESTROY(ifp);
IF_ADDR_LOCK_DESTROY(ifp);
ifq_delete(&ifp->if_snd);
free(ifp, M_IFNET);
}
/*
* This version should only be called by intefaces that switch their type
* after calling if_alloc(). if_free_type() will go away again now that we
* have if_alloctype to cache the original allocation type. For now, assert
* that they match, since we require that in practice.
*/
void
if_free_type(struct ifnet *ifp, u_char type)
{
KASSERT(ifp->if_alloctype == type,
("if_free_type: type (%d) != alloctype (%d)", type,
ifp->if_alloctype));
ifp->if_flags |= IFF_DYING; /* XXX: Locking */
if (!refcount_release(&ifp->if_refcount))
return;
if_free_internal(ifp);
}
/*
* This is the normal version of if_free(), used by device drivers to free a
* detached network interface. The contents of if_free_type() will move into
* here when if_free_type() goes away.
*/
void
if_free(struct ifnet *ifp)
{
if_free_type(ifp, ifp->if_alloctype);
}
/*
* Interfaces to keep an ifnet type-stable despite the possibility of the
* driver calling if_free(). If there are additional references, we defer
* freeing the underlying data structure.
*/
void
if_ref(struct ifnet *ifp)
{
/* We don't assert the ifnet list lock here, but arguably should. */
refcount_acquire(&ifp->if_refcount);
}
void
if_rele(struct ifnet *ifp)
{
if (!refcount_release(&ifp->if_refcount))
return;
if_free_internal(ifp);
}
void
ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
{
mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
if (ifq->ifq_maxlen == 0)
ifq->ifq_maxlen = ifqmaxlen;
ifq->altq_type = 0;
ifq->altq_disc = NULL;
ifq->altq_flags &= ALTQF_CANTCHANGE;
ifq->altq_tbr = NULL;
ifq->altq_ifp = ifp;
}
void
ifq_delete(struct ifaltq *ifq)
{
mtx_destroy(&ifq->ifq_mtx);
}
/*
* Perform generic interface initalization tasks and attach the interface
* to the list of "active" interfaces. If vmove flag is set on entry
* to if_attach_internal(), perform only a limited subset of initialization
* tasks, given that we are moving from one vnet to another an ifnet which
* has already been fully initialized.
*
* XXX:
* - The decision to return void and thus require this function to
* succeed is questionable.
* - We should probably do more sanity checking. For instance we don't
* do anything to insure if_xname is unique or non-empty.
*/
void
if_attach(struct ifnet *ifp)
{
if_attach_internal(ifp, 0);
}
static void
if_attach_internal(struct ifnet *ifp, int vmove)
{
unsigned socksize, ifasize;
int namelen, masklen;
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
ifp->if_xname);
#ifdef VIMAGE
ifp->if_vnet = curvnet;
if (ifp->if_home_vnet == NULL)
ifp->if_home_vnet = curvnet;
#endif
if_addgroup(ifp, IFG_ALL);
getmicrotime(&ifp->if_lastchange);
ifp->if_data.ifi_epoch = time_uptime;
ifp->if_data.ifi_datalen = sizeof(struct if_data);
KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
(ifp->if_transmit != NULL && ifp->if_qflush != NULL),
("transmit and qflush must both either be set or both be NULL"));
if (ifp->if_transmit == NULL) {
ifp->if_transmit = if_transmit;
ifp->if_qflush = if_qflush;
}
if (!vmove) {
#ifdef MAC
mac_ifnet_create(ifp);
#endif
/*
* Create a Link Level name for this device.
*/
namelen = strlen(ifp->if_xname);
/*
* Always save enough space for any possiable name so we
* can do a rename in place later.
*/
masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
socksize = masklen + ifp->if_addrlen;
if (socksize < sizeof(*sdl))
socksize = sizeof(*sdl);
socksize = roundup2(socksize, sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
ifa_init(ifa);
sdl = (struct sockaddr_dl *)(ifa + 1);
sdl->sdl_len = socksize;
sdl->sdl_family = AF_LINK;
bcopy(ifp->if_xname, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = ifp->if_type;
ifp->if_addr = ifa;
ifa->ifa_ifp = ifp;
ifa->ifa_rtrequest = link_rtrequest;
ifa->ifa_addr = (struct sockaddr *)sdl;
sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
ifa->ifa_netmask = (struct sockaddr *)sdl;
sdl->sdl_len = masklen;
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
/* Reliably crash if used uninitialized. */
ifp->if_broadcastaddr = NULL;
}
#ifdef VIMAGE
else {
/*
* Update the interface index in the link layer address
* of the interface.
*/
for (ifa = ifp->if_addr; ifa != NULL;
ifa = TAILQ_NEXT(ifa, ifa_link)) {
if (ifa->ifa_addr->sa_family == AF_LINK) {
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
sdl->sdl_index = ifp->if_index;
}
}
}
#endif
IFNET_WLOCK();
TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
#ifdef VIMAGE
curvnet->vnet_ifcnt++;
#endif
IFNET_WUNLOCK();
if (domain_init_status >= 2)
if_attachdomain1(ifp);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
if (!vmove && ifp->if_watchdog != NULL) {
if_printf(ifp,
"WARNING: using obsoleted if_watchdog interface\n");
/*
* Note that we need if_slowtimo(). If this happens after
* boot, then call if_slowtimo() directly.
*/
if (atomic_cmpset_int(&slowtimo_started, 0, 1) && !cold)
if_slowtimo(0);
}
}
static void
if_attachdomain(void *dummy)
{
struct ifnet *ifp;
int s;
s = splnet();
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
if_attachdomain1(ifp);
splx(s);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
if_attachdomain, NULL);
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
int s;
s = splnet();
/*
* Since dp->dom_ifattach calls malloc() with M_WAITOK, we
* cannot lock ifp->if_afdata initialization, entirely.
*/
if (IF_AFDATA_TRYLOCK(ifp) == 0) {
splx(s);
return;
}
if (ifp->if_afdata_initialized >= domain_init_status) {
IF_AFDATA_UNLOCK(ifp);
splx(s);
printf("if_attachdomain called more than once on %s\n",
ifp->if_xname);
return;
}
ifp->if_afdata_initialized = domain_init_status;
IF_AFDATA_UNLOCK(ifp);
/* address family dependent data region */
bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_ifattach)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
splx(s);
}
/*
* Remove any unicast or broadcast network addresses from an interface.
*/
void
if_purgeaddrs(struct ifnet *ifp)
{
struct ifaddr *ifa, *next;
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
if (ifa->ifa_addr->sa_family == AF_LINK)
continue;
#ifdef INET
/* XXX: Ugly!! ad hoc just for INET */
if (ifa->ifa_addr->sa_family == AF_INET) {
struct ifaliasreq ifr;
bzero(&ifr, sizeof(ifr));
ifr.ifra_addr = *ifa->ifa_addr;
if (ifa->ifa_dstaddr)
ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
NULL) == 0)
continue;
}
#endif /* INET */
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6) {
in6_purgeaddr(ifa);
/* ifp_addrhead is already updated */
continue;
}
#endif /* INET6 */
TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
ifa_free(ifa);
}
}
/*
* Remove any multicast network addresses from an interface when an ifnet
* is going away.
*/
static void
if_purgemaddrs(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
struct ifmultiaddr *next;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
if_delmulti_locked(ifp, ifma, 1);
IF_ADDR_UNLOCK(ifp);
}
/*
* Detach an interface, removing it from the list of "active" interfaces.
* If vmove flag is set on entry to if_detach_internal(), perform only a
* limited subset of cleanup tasks, given that we are moving an ifnet from
* one vnet to another, where it must be fully operational.
*
* XXXRW: There are some significant questions about event ordering, and
* how to prevent things from starting to use the interface during detach.
*/
void
if_detach(struct ifnet *ifp)
{
if_detach_internal(ifp, 0);
}
static void
if_detach_internal(struct ifnet *ifp, int vmove)
{
struct ifaddr *ifa;
struct radix_node_head *rnh;
int i, j;
struct domain *dp;
struct ifnet *iter;
int found = 0;
IFNET_WLOCK();
TAILQ_FOREACH(iter, &V_ifnet, if_link)
if (iter == ifp) {
TAILQ_REMOVE(&V_ifnet, ifp, if_link);
found = 1;
break;
}
#ifdef VIMAGE
if (found)
curvnet->vnet_ifcnt--;
#endif
IFNET_WUNLOCK();
if (!found) {
if (vmove)
panic("%s: ifp=%p not on the ifnet tailq %p",
__func__, ifp, &V_ifnet);
else
return; /* XXX this should panic as well? */
}
/*
* Remove/wait for pending events.
*/
taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
/*
* Remove routes and flush queues.
*/
if_down(ifp);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
if_purgeaddrs(ifp);
#ifdef INET
in_ifdetach(ifp);
#endif
#ifdef INET6
/*
* Remove all IPv6 kernel structs related to ifp. This should be done
* before removing routing entries below, since IPv6 interface direct
* routes are expected to be removed by the IPv6-specific kernel API.
* Otherwise, the kernel will detect some inconsistency and bark it.
*/
in6_ifdetach(ifp);
#endif
if_purgemaddrs(ifp);
if (!vmove) {
/*
* Prevent further calls into the device driver via ifnet.
*/
if_dead(ifp);
/*
* Remove link ifaddr pointer and maybe decrement if_index.
* Clean up all addresses.
*/
ifp->if_addr = NULL;
/* We can now free link ifaddr. */
if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
ifa = TAILQ_FIRST(&ifp->if_addrhead);
TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
ifa_free(ifa);
}
}
/*
* Delete all remaining routes using this interface
* Unfortuneatly the only way to do this is to slog through
* the entire routing table looking for routes which point
* to this interface...oh well...
*/
for (i = 1; i <= AF_MAX; i++) {
for (j = 0; j < rt_numfibs; j++) {
rnh = rt_tables_get_rnh(j, i);
if (rnh == NULL)
continue;
RADIX_NODE_HEAD_LOCK(rnh);
(void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
}
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
if_delgroups(ifp);
/*
* We cannot hold the lock over dom_ifdetach calls as they might
* sleep, for example trying to drain a callout, thus open up the
* theoretical race with re-attaching.
*/
IF_AFDATA_LOCK(ifp);
i = ifp->if_afdata_initialized;
ifp->if_afdata_initialized = 0;
IF_AFDATA_UNLOCK(ifp);
for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
(*dp->dom_ifdetach)(ifp,
ifp->if_afdata[dp->dom_family]);
}
}
#ifdef VIMAGE
/*
* if_vmove() performs a limited version of if_detach() in current
* vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
* An attempt is made to shrink if_index in current vnet, find an
* unused if_index in target vnet and calls if_grow() if necessary,
* and finally find an unused if_xname for the target vnet.
*/
void
if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
{
u_short idx;
/*
* Detach from current vnet, but preserve LLADDR info, do not
* mark as dead etc. so that the ifnet can be reattached later.
*/
if_detach_internal(ifp, 1);
/*
* Unlink the ifnet from ifindex_table[] in current vnet, and shrink
* the if_index for that vnet if possible.
*
* NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
* or we'd lock on one vnet and unlock on another.
*/
IFNET_WLOCK();
ifindex_free_locked(ifp->if_index);
/*
* Switch to the context of the target vnet.
*/
CURVNET_SET_QUIET(new_vnet);
if (ifindex_alloc_locked(&idx) != 0) {
IFNET_WUNLOCK();
panic("if_index overflow");
}
ifp->if_index = idx;
ifnet_setbyindex_locked(ifp->if_index, ifp);
IFNET_WUNLOCK();
if_attach_internal(ifp, 1);
CURVNET_RESTORE();
}
/*
* Move an ifnet to or from another child prison/vnet, specified by the jail id.
*/
static int
if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
{
struct prison *pr;
struct ifnet *difp;
/* Try to find the prison within our visibility. */
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, jid);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENXIO);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
/* Do not try to move the iface from and to the same prison. */
if (pr->pr_vnet == ifp->if_vnet) {
prison_free(pr);
return (EEXIST);
}
/* Make sure the named iface does not exists in the dst. prison/vnet. */
/* XXX Lock interfaces to avoid races. */
CURVNET_SET_QUIET(pr->pr_vnet);
difp = ifunit(ifname);
CURVNET_RESTORE();
if (difp != NULL) {
prison_free(pr);
return (EEXIST);
}
/* Move the interface into the child jail/vnet. */
if_vmove(ifp, pr->pr_vnet);
/* Report the new if_xname back to the userland. */
sprintf(ifname, "%s", ifp->if_xname);
prison_free(pr);
return (0);
}
static int
if_vmove_reclaim(struct thread *td, char *ifname, int jid)
{
struct prison *pr;
struct vnet *vnet_dst;
struct ifnet *ifp;
/* Try to find the prison within our visibility. */
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, jid);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ENXIO);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
/* Make sure the named iface exists in the source prison/vnet. */
CURVNET_SET(pr->pr_vnet);
ifp = ifunit(ifname); /* XXX Lock to avoid races. */
if (ifp == NULL) {
CURVNET_RESTORE();
prison_free(pr);
return (ENXIO);
}
/* Do not try to move the iface from and to the same prison. */
vnet_dst = TD_TO_VNET(td);
if (vnet_dst == ifp->if_vnet) {
CURVNET_RESTORE();
prison_free(pr);
return (EEXIST);
}
/* Get interface back from child jail/vnet. */
if_vmove(ifp, vnet_dst);
CURVNET_RESTORE();
/* Report the new if_xname back to the userland. */
sprintf(ifname, "%s", ifp->if_xname);
prison_free(pr);
return (0);
}
#endif /* VIMAGE */
/*
* Add a group to an interface
*/
int
if_addgroup(struct ifnet *ifp, const char *groupname)
{
struct ifg_list *ifgl;
struct ifg_group *ifg = NULL;
struct ifg_member *ifgm;
if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
groupname[strlen(groupname) - 1] <= '9')
return (EINVAL);
IFNET_WLOCK();
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
IFNET_WUNLOCK();
return (EEXIST);
}
if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
M_NOWAIT)) == NULL) {
IFNET_WUNLOCK();
return (ENOMEM);
}
if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, groupname))
break;
if (ifg == NULL) {
if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
free(ifgm, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
ifg->ifg_refcnt = 0;
TAILQ_INIT(&ifg->ifg_members);
EVENTHANDLER_INVOKE(group_attach_event, ifg);
TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
}
ifg->ifg_refcnt++;
ifgl->ifgl_group = ifg;
ifgm->ifgm_ifp = ifp;
IF_ADDR_LOCK(ifp);
TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
IFNET_WUNLOCK();
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
/*
* Remove a group from an interface
*/
int
if_delgroup(struct ifnet *ifp, const char *groupname)
{
struct ifg_list *ifgl;
struct ifg_member *ifgm;
IFNET_WLOCK();
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
break;
if (ifgl == NULL) {
IFNET_WUNLOCK();
return (ENOENT);
}
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
if (ifgm->ifgm_ifp == ifp)
break;
if (ifgm != NULL) {
TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
free(ifgm, M_TEMP);
}
if (--ifgl->ifgl_group->ifg_refcnt == 0) {
TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
free(ifgl->ifgl_group, M_TEMP);
}
IFNET_WUNLOCK();
free(ifgl, M_TEMP);
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
/*
* Remove an interface from all groups
*/
static void
if_delgroups(struct ifnet *ifp)
{
struct ifg_list *ifgl;
struct ifg_member *ifgm;
char groupname[IFNAMSIZ];
IFNET_WLOCK();
while (!TAILQ_EMPTY(&ifp->if_groups)) {
ifgl = TAILQ_FIRST(&ifp->if_groups);
strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
if (ifgm->ifgm_ifp == ifp)
break;
if (ifgm != NULL) {
TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
ifgm_next);
free(ifgm, M_TEMP);
}
if (--ifgl->ifgl_group->ifg_refcnt == 0) {
TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
EVENTHANDLER_INVOKE(group_detach_event,
ifgl->ifgl_group);
free(ifgl->ifgl_group, M_TEMP);
}
IFNET_WUNLOCK();
free(ifgl, M_TEMP);
EVENTHANDLER_INVOKE(group_change_event, groupname);
IFNET_WLOCK();
}
IFNET_WUNLOCK();
}
/*
* Stores all groups from an interface in memory pointed
* to by data
*/
static int
if_getgroup(struct ifgroupreq *data, struct ifnet *ifp)
{
int len, error;
struct ifg_list *ifgl;
struct ifg_req ifgrq, *ifgp;
struct ifgroupreq *ifgr = data;
if (ifgr->ifgr_len == 0) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
ifgr->ifgr_len += sizeof(struct ifg_req);
IF_ADDR_UNLOCK(ifp);
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr->ifgr_groups;
/* XXX: wire */
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
if (len < sizeof(ifgrq)) {
IF_ADDR_UNLOCK(ifp);
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
sizeof(ifgrq.ifgrq_group));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IF_ADDR_UNLOCK(ifp);
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IF_ADDR_UNLOCK(ifp);
return (0);
}
/*
* Stores all members of a group in memory pointed to by data
*/
static int
if_getgroupmembers(struct ifgroupreq *data)
{
struct ifgroupreq *ifgr = data;
struct ifg_group *ifg;
struct ifg_member *ifgm;
struct ifg_req ifgrq, *ifgp;
int len, error;
IFNET_RLOCK();
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
break;
if (ifg == NULL) {
IFNET_RUNLOCK();
return (ENOENT);
}
if (ifgr->ifgr_len == 0) {
TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
ifgr->ifgr_len += sizeof(ifgrq);
IFNET_RUNLOCK();
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr->ifgr_groups;
TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
if (len < sizeof(ifgrq)) {
IFNET_RUNLOCK();
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
sizeof(ifgrq.ifgrq_member));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IFNET_RUNLOCK();
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IFNET_RUNLOCK();
return (0);
}
/*
* Delete Routes for a Network Interface
*
* Called for each routing entry via the rnh->rnh_walktree() call above
* to delete all route entries referencing a detaching network interface.
*
* Arguments:
* rn pointer to node in the routing table
* arg argument passed to rnh->rnh_walktree() - detaching interface
*
* Returns:
* 0 successful
* errno failed - reason indicated
*
*/
static int
if_rtdel(struct radix_node *rn, void *arg)
{
struct rtentry *rt = (struct rtentry *)rn;
struct ifnet *ifp = arg;
int err;
if (rt->rt_ifp == ifp) {
/*
* Protect (sorta) against walktree recursion problems
* with cloned routes
*/
if ((rt->rt_flags & RTF_UP) == 0)
return (0);
err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway,
rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED,
(struct rtentry **) NULL, rt->rt_fibnum);
if (err) {
log(LOG_WARNING, "if_rtdel: error %d\n", err);
}
}
return (0);
}
/*
* Wrapper functions for struct ifnet address list locking macros. These are
* used by kernel modules to avoid encoding programming interface or binary
* interface assumptions that may be violated when kernel-internal locking
* approaches change.
*/
void
if_addr_rlock(struct ifnet *ifp)
{
IF_ADDR_LOCK(ifp);
}
void
if_addr_runlock(struct ifnet *ifp)
{
IF_ADDR_UNLOCK(ifp);
}
void
if_maddr_rlock(struct ifnet *ifp)
{
IF_ADDR_LOCK(ifp);
}
void
if_maddr_runlock(struct ifnet *ifp)
{
IF_ADDR_UNLOCK(ifp);
}
/*
* Reference count functions for ifaddrs.
*/
void
ifa_init(struct ifaddr *ifa)
{
mtx_init(&ifa->ifa_mtx, "ifaddr", NULL, MTX_DEF);
refcount_init(&ifa->ifa_refcnt, 1);
}
void
ifa_ref(struct ifaddr *ifa)
{
refcount_acquire(&ifa->ifa_refcnt);
}
void
ifa_free(struct ifaddr *ifa)
{
if (refcount_release(&ifa->ifa_refcnt)) {
mtx_destroy(&ifa->ifa_mtx);
free(ifa, M_IFADDR);
}
}
int
ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{
int error = 0;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
bzero(&info, sizeof(info));
info.rti_ifp = V_loif;
info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
info.rti_info[RTAX_DST] = ia;
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
error = rtrequest1_fib(RTM_ADD, &info, &rt, 0);
if (error == 0 && rt != NULL) {
RT_LOCK(rt);
((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
ifa->ifa_ifp->if_type;
((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
ifa->ifa_ifp->if_index;
RT_REMREF(rt);
RT_UNLOCK(rt);
} else if (error != 0)
log(LOG_INFO, "ifa_add_loopback_route: insertion failed\n");
return (error);
}
int
ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia)
{
int error = 0;
struct rt_addrinfo info;
struct sockaddr_dl null_sdl;
bzero(&null_sdl, sizeof(null_sdl));
null_sdl.sdl_len = sizeof(null_sdl);
null_sdl.sdl_family = AF_LINK;
null_sdl.sdl_type = ifa->ifa_ifp->if_type;
null_sdl.sdl_index = ifa->ifa_ifp->if_index;
bzero(&info, sizeof(info));
info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC;
info.rti_info[RTAX_DST] = ia;
info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl;
error = rtrequest1_fib(RTM_DELETE, &info, NULL, 0);
if (error != 0)
log(LOG_INFO, "ifa_del_loopback_route: deletion failed\n");
return (error);
}
/*
* XXX: Because sockaddr_dl has deeper structure than the sockaddr
* structs used to represent other address families, it is necessary
* to perform a different comparison.
*/
#define sa_equal(a1, a2) \
(bcmp((a1), (a2), ((a1))->sa_len) == 0)
#define sa_dl_equal(a1, a2) \
((((struct sockaddr_dl *)(a1))->sdl_len == \
((struct sockaddr_dl *)(a2))->sdl_len) && \
(bcmp(LLADDR((struct sockaddr_dl *)(a1)), \
LLADDR((struct sockaddr_dl *)(a2)), \
((struct sockaddr_dl *)(a1))->sdl_alen) == 0))
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
static struct ifaddr *
ifa_ifwithaddr_internal(struct sockaddr *addr, int getref)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (sa_equal(addr, ifa->ifa_addr)) {
if (getref)
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
/* IP6 doesn't have broadcast */
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr)) {
if (getref)
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
struct ifaddr *
ifa_ifwithaddr(struct sockaddr *addr)
{
return (ifa_ifwithaddr_internal(addr, 1));
}
int
ifa_ifwithaddr_check(struct sockaddr *addr)
{
return (ifa_ifwithaddr_internal(addr, 0) != NULL);
}
/*
* Locate an interface based on the broadcast address.
*/
/* ARGSUSED */
struct ifaddr *
ifa_ifwithbroadaddr(struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
return (ifa);
}
/*
* Find an interface on a specific network. If many, choice
* is most specific found.
*/
struct ifaddr *
-ifa_ifwithnet(struct sockaddr *addr)
+ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifaddr *ifa_maybe = NULL;
u_int af = addr->sa_family;
char *addr_data = addr->sa_data, *cplim;
/*
* AF_LINK addresses can be looked up directly by their index number,
* so do that if we can.
*/
if (af == AF_LINK) {
struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
return (ifaddr_byindex(sdl->sdl_index));
}
/*
* Scan though each interface, looking for ones that have addresses
* in this address family. Maintain a reference on ifa_maybe once
* we find one, as we release the IF_ADDR_LOCK() that kept it stable
* when we move onto the next interface.
*/
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af)
next: continue;
- if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
+ if (af == AF_INET &&
+ ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
/*
* This is a bit broken as it doesn't
* take into account that the remote end may
* be a single node in the network we are
* looking for.
* The trouble is that we don't know the
* netmask for the remote end.
*/
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
} else {
/*
* if we have a special address handler,
* then use it instead of the generic one.
*/
if (ifa->ifa_claim_addr) {
if ((*ifa->ifa_claim_addr)(ifa, addr)) {
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
goto done;
}
continue;
}
/*
* Scan all the bits in the ifa's address.
* If a bit dissagrees with what we are
* looking for, mask it with the netmask
* to see if it really matters.
* (A byte at a time)
*/
if (ifa->ifa_netmask == 0)
continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len
+ (char *)ifa->ifa_netmask;
while (cp3 < cplim)
if ((*cp++ ^ *cp2++) & *cp3++)
goto next; /* next address! */
/*
* If the netmask of what we just found
* is more specific than what we had before
* (if we had one) then remember the new one
* before continuing to search
* for an even better one.
*/
if (ifa_maybe == NULL ||
rn_refines((caddr_t)ifa->ifa_netmask,
(caddr_t)ifa_maybe->ifa_netmask)) {
if (ifa_maybe != NULL)
ifa_free(ifa_maybe);
ifa_maybe = ifa;
ifa_ref(ifa_maybe);
}
}
}
IF_ADDR_UNLOCK(ifp);
}
ifa = ifa_maybe;
ifa_maybe = NULL;
done:
IFNET_RUNLOCK_NOSLEEP();
if (ifa_maybe != NULL)
ifa_free(ifa_maybe);
return (ifa);
}
/*
* Find an interface address specific to an interface best matching
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
char *cp, *cp2, *cp3;
char *cplim;
struct ifaddr *ifa_maybe = NULL;
u_int af = addr->sa_family;
if (af >= AF_MAX)
return (0);
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != af)
continue;
if (ifa_maybe == NULL)
ifa_maybe = ifa;
if (ifa->ifa_netmask == 0) {
if (sa_equal(addr, ifa->ifa_addr) ||
(ifa->ifa_dstaddr &&
sa_equal(addr, ifa->ifa_dstaddr)))
goto done;
continue;
}
if (ifp->if_flags & IFF_POINTOPOINT) {
if (sa_equal(addr, ifa->ifa_dstaddr))
goto done;
} else {
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++)
if ((*cp++ ^ *cp2++) & *cp3)
break;
if (cp3 == cplim)
goto done;
}
}
ifa = ifa_maybe;
done:
if (ifa != NULL)
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
return (ifa);
}
#include <net/if_llatbl.h>
/*
* Default action when installing a route with a Link Level gateway.
* Lookup an appropriate real ifa to point to.
* This should be moved to /sys/net/link.c eventually.
*/
static void
link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
{
struct ifaddr *ifa, *oifa;
struct sockaddr *dst;
struct ifnet *ifp;
RT_LOCK_ASSERT(rt);
if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
return;
ifa = ifaof_ifpforaddr(dst, ifp);
if (ifa) {
oifa = rt->rt_ifa;
rt->rt_ifa = ifa;
ifa_free(oifa);
if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
ifa->ifa_rtrequest(cmd, rt, info);
}
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
static void
if_unroute(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
ifp->if_flags &= ~flag;
getmicrotime(&ifp->if_lastchange);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
ifp->if_qflush(ifp);
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
#endif
rt_ifmsg(ifp);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
static void
if_route(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
ifp->if_flags |= flag;
getmicrotime(&ifp->if_lastchange);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFUP, ifa->ifa_addr);
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
#endif
rt_ifmsg(ifp);
#ifdef INET6
in6_if_up(ifp);
#endif
}
void (*vlan_link_state_p)(struct ifnet *, int); /* XXX: private from if_vlan */
void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */
/*
* Handle a change in the interface link state. To avoid LORs
* between driver lock and upper layer locks, as well as possible
* recursions, we post event to taskqueue, and all job
* is done in static do_link_state_change().
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
/* Return if state hasn't changed. */
if (ifp->if_link_state == link_state)
return;
ifp->if_link_state = link_state;
taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
}
static void
do_link_state_change(void *arg, int pending)
{
struct ifnet *ifp = (struct ifnet *)arg;
int link_state = ifp->if_link_state;
CURVNET_SET(ifp->if_vnet);
/* Notify that the link state has changed. */
rt_ifmsg(ifp);
if (ifp->if_vlantrunk != NULL)
(*vlan_link_state_p)(ifp, 0);
if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
IFP2AC(ifp)->ac_netgraph != NULL)
(*ng_ether_link_state_p)(ifp, link_state);
#if defined(INET) || defined(INET6)
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
#endif
if (ifp->if_bridge) {
KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!"));
(*bstp_linkstate_p)(ifp, link_state);
}
if (ifp->if_lagg) {
KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!"));
(*lagg_linkstate_p)(ifp, link_state);
}
if (IS_DEFAULT_VNET(curvnet))
devctl_notify("IFNET", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
NULL);
if (pending > 1)
if_printf(ifp, "%d link states coalesced\n", pending);
if (log_link_state_change)
log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
CURVNET_RESTORE();
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
void
if_down(struct ifnet *ifp)
{
if_unroute(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splnet or eqivalent.
*/
void
if_up(struct ifnet *ifp)
{
if_route(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Flush an interface queue.
*/
void
if_qflush(struct ifnet *ifp)
{
struct mbuf *m, *n;
struct ifaltq *ifq;
ifq = &ifp->if_snd;
IFQ_LOCK(ifq);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(ifq))
ALTQ_PURGE(ifq);
#endif
n = ifq->ifq_head;
while ((m = n) != 0) {
n = m->m_act;
m_freem(m);
}
ifq->ifq_head = 0;
ifq->ifq_tail = 0;
ifq->ifq_len = 0;
IFQ_UNLOCK(ifq);
}
/*
* Handle interface watchdog timer routines. Called
* from softclock, we decrement timers (if set) and
* call the appropriate interface routine on expiration.
*
* XXXRW: Note that because timeouts run with Giant, if_watchdog() is called
* holding Giant.
*/
static void
if_slowtimo(void *arg)
{
VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
int s = splimp();
VNET_LIST_RLOCK_NOSLEEP();
IFNET_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifp->if_timer == 0 || --ifp->if_timer)
continue;
if (ifp->if_watchdog)
(*ifp->if_watchdog)(ifp);
}
CURVNET_RESTORE();
}
IFNET_RUNLOCK_NOSLEEP();
VNET_LIST_RUNLOCK_NOSLEEP();
splx(s);
timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ);
}
/*
* Map interface name to interface structure pointer, with or without
* returning a reference.
*/
struct ifnet *
ifunit_ref(const char *name)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
!(ifp->if_flags & IFF_DYING))
break;
}
if (ifp != NULL)
if_ref(ifp);
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
struct ifnet *
ifunit(const char *name)
{
struct ifnet *ifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
break;
}
IFNET_RUNLOCK_NOSLEEP();
return (ifp);
}
/*
* Hardware specific interface ioctls.
*/
static int
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
{
struct ifreq *ifr;
struct ifstat *ifs;
int error = 0;
int new_flags, temp_flags;
size_t namelen, onamelen;
size_t descrlen;
char *descrbuf, *odescrbuf;
char new_name[IFNAMSIZ];
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifr = (struct ifreq *)data;
switch (cmd) {
case SIOCGIFINDEX:
ifr->ifr_index = ifp->if_index;
break;
case SIOCGIFFLAGS:
temp_flags = ifp->if_flags | ifp->if_drv_flags;
ifr->ifr_flags = temp_flags & 0xffff;
ifr->ifr_flagshigh = temp_flags >> 16;
break;
case SIOCGIFCAP:
ifr->ifr_reqcap = ifp->if_capabilities;
ifr->ifr_curcap = ifp->if_capenable;
break;
#ifdef MAC
case SIOCGIFMAC:
error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCGIFMETRIC:
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFPHYS:
ifr->ifr_phys = ifp->if_physical;
break;
case SIOCGIFDESCR:
error = 0;
sx_slock(&ifdescr_sx);
if (ifp->if_description == NULL)
error = ENOMSG;
else {
/* space for terminating nul */
descrlen = strlen(ifp->if_description) + 1;
if (ifr->ifr_buffer.length < descrlen)
ifr->ifr_buffer.buffer = NULL;
else
error = copyout(ifp->if_description,
ifr->ifr_buffer.buffer, descrlen);
ifr->ifr_buffer.length = descrlen;
}
sx_sunlock(&ifdescr_sx);
break;
case SIOCSIFDESCR:
error = priv_check(td, PRIV_NET_SETIFDESCR);
if (error)
return (error);
/*
* Copy only (length-1) bytes to make sure that
* if_description is always nul terminated. The
* length parameter is supposed to count the
* terminating nul in.
*/
if (ifr->ifr_buffer.length > ifdescr_maxlen)
return (ENAMETOOLONG);
else if (ifr->ifr_buffer.length == 0)
descrbuf = NULL;
else {
descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR,
M_WAITOK | M_ZERO);
error = copyin(ifr->ifr_buffer.buffer, descrbuf,
ifr->ifr_buffer.length - 1);
if (error) {
free(descrbuf, M_IFDESCR);
break;
}
}
sx_xlock(&ifdescr_sx);
odescrbuf = ifp->if_description;
ifp->if_description = descrbuf;
sx_xunlock(&ifdescr_sx);
getmicrotime(&ifp->if_lastchange);
free(odescrbuf, M_IFDESCR);
break;
case SIOCSIFFLAGS:
error = priv_check(td, PRIV_NET_SETIFFLAGS);
if (error)
return (error);
/*
* Currently, no driver owned flags pass the IFF_CANTCHANGE
* check, so we don't need special handling here yet.
*/
new_flags = (ifr->ifr_flags & 0xffff) |
(ifr->ifr_flagshigh << 16);
if (ifp->if_flags & IFF_SMART) {
/* Smart drivers twiddle their own routes */
} else if (ifp->if_flags & IFF_UP &&
(new_flags & IFF_UP) == 0) {
int s = splimp();
if_down(ifp);
splx(s);
} else if (new_flags & IFF_UP &&
(ifp->if_flags & IFF_UP) == 0) {
int s = splimp();
if_up(ifp);
splx(s);
}
/* See if permanently promiscuous mode bit is about to flip */
if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
if (new_flags & IFF_PPROMISC)
ifp->if_flags |= IFF_PROMISC;
else if (ifp->if_pcount == 0)
ifp->if_flags &= ~IFF_PROMISC;
log(LOG_INFO, "%s: permanently promiscuous mode %s\n",
ifp->if_xname,
(new_flags & IFF_PPROMISC) ? "enabled" : "disabled");
}
ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
(new_flags &~ IFF_CANTCHANGE);
if (ifp->if_ioctl) {
(void) (*ifp->if_ioctl)(ifp, cmd, data);
}
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFCAP:
error = priv_check(td, PRIV_NET_SETIFCAP);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
if (ifr->ifr_reqcap & ~ifp->if_capabilities)
return (EINVAL);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
#ifdef MAC
case SIOCSIFMAC:
error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCSIFNAME:
error = priv_check(td, PRIV_NET_SETIFNAME);
if (error)
return (error);
error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
if (error != 0)
return (error);
if (new_name[0] == '\0')
return (EINVAL);
if (ifunit(new_name) != NULL)
return (EEXIST);
/*
* XXX: Locking. Nothing else seems to lock if_flags,
* and there are numerous other races with the
* ifunit() checks not being atomic with namespace
* changes (renames, vmoves, if_attach, etc).
*/
ifp->if_flags |= IFF_RENAMING;
/* Announce the departure of the interface. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
log(LOG_INFO, "%s: changing name to '%s'\n",
ifp->if_xname, new_name);
strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
ifa = ifp->if_addr;
IFA_LOCK(ifa);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
namelen = strlen(new_name);
onamelen = sdl->sdl_nlen;
/*
* Move the address if needed. This is safe because we
* allocate space for a name of length IFNAMSIZ when we
* create this in if_attach().
*/
if (namelen != onamelen) {
bcopy(sdl->sdl_data + onamelen,
sdl->sdl_data + namelen, sdl->sdl_alen);
}
bcopy(new_name, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
bzero(sdl->sdl_data, onamelen);
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
IFA_UNLOCK(ifa);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
/* Announce the return of the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
ifp->if_flags &= ~IFF_RENAMING;
break;
#ifdef VIMAGE
case SIOCSIFVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
if (error)
return (error);
error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
break;
#endif
case SIOCSIFMETRIC:
error = priv_check(td, PRIV_NET_SETIFMETRIC);
if (error)
return (error);
ifp->if_metric = ifr->ifr_metric;
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYS:
error = priv_check(td, PRIV_NET_SETIFPHYS);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
{
u_long oldmtu = ifp->if_mtu;
error = priv_check(td, PRIV_NET_SETIFMTU);
if (error)
return (error);
if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
return (EINVAL);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0) {
getmicrotime(&ifp->if_lastchange);
rt_ifmsg(ifp);
}
/*
* If the link MTU changed, do network layer specific procedure.
*/
if (ifp->if_mtu != oldmtu) {
#ifdef INET6
nd6_setmtu(ifp);
#endif
}
break;
}
case SIOCADDMULTI:
case SIOCDELMULTI:
if (cmd == SIOCADDMULTI)
error = priv_check(td, PRIV_NET_ADDMULTI);
else
error = priv_check(td, PRIV_NET_DELMULTI);
if (error)
return (error);
/* Don't allow group membership on non-multicast interfaces. */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EOPNOTSUPP);
/* Don't let users screw up protocols' entries. */
if (ifr->ifr_addr.sa_family != AF_LINK)
return (EINVAL);
if (cmd == SIOCADDMULTI) {
struct ifmultiaddr *ifma;
/*
* Userland is only permitted to join groups once
* via the if_addmulti() KPI, because it cannot hold
* struct ifmultiaddr * between calls. It may also
* lose a race while we check if the membership
* already exists.
*/
IF_ADDR_LOCK(ifp);
ifma = if_findmulti(ifp, &ifr->ifr_addr);
IF_ADDR_UNLOCK(ifp);
if (ifma != NULL)
error = EADDRINUSE;
else
error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
} else {
error = if_delmulti(ifp, &ifr->ifr_addr);
}
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSLIFPHYADDR:
case SIOCSIFMEDIA:
case SIOCSIFGENERIC:
error = priv_check(td, PRIV_NET_HWIOCTL);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCGIFSTATUS:
ifs = (struct ifstat *)data;
ifs->ascii[0] = '\0';
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
case SIOCGLIFPHYADDR:
case SIOCGIFMEDIA:
case SIOCGIFGENERIC:
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
error = (*ifp->if_ioctl)(ifp, cmd, data);
break;
case SIOCSIFLLADDR:
error = priv_check(td, PRIV_NET_SETLLADDR);
if (error)
return (error);
error = if_setlladdr(ifp,
ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
EVENTHANDLER_INVOKE(iflladdr_event, ifp);
break;
case SIOCAIFGROUP:
{
struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
error = priv_check(td, PRIV_NET_ADDIFGROUP);
if (error)
return (error);
if ((error = if_addgroup(ifp, ifgr->ifgr_group)))
return (error);
break;
}
case SIOCGIFGROUP:
if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp)))
return (error);
break;
case SIOCDIFGROUP:
{
struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
error = priv_check(td, PRIV_NET_DELIFGROUP);
if (error)
return (error);
if ((error = if_delgroup(ifp, ifgr->ifgr_group)))
return (error);
break;
}
default:
error = ENOIOCTL;
break;
}
return (error);
}
/*
* Interface ioctls.
*/
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
{
struct ifnet *ifp;
struct ifreq *ifr;
int error;
int oif_flags;
switch (cmd) {
case SIOCGIFCONF:
case OSIOCGIFCONF:
#ifdef __amd64__
case SIOCGIFCONF32:
#endif
return (ifconf(cmd, data));
}
ifr = (struct ifreq *)data;
switch (cmd) {
#ifdef VIMAGE
case SIOCSIFRVNET:
error = priv_check(td, PRIV_NET_SETIFVNET);
if (error)
return (error);
return (if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid));
#endif
case SIOCIFCREATE:
case SIOCIFCREATE2:
error = priv_check(td, PRIV_NET_IFCREATE);
if (error)
return (error);
return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
case SIOCIFDESTROY:
error = priv_check(td, PRIV_NET_IFDESTROY);
if (error)
return (error);
return if_clone_destroy(ifr->ifr_name);
case SIOCIFGCLONERS:
return (if_clone_list((struct if_clonereq *)data));
case SIOCGIFGMEMB:
return (if_getgroupmembers((struct ifgroupreq *)data));
}
ifp = ifunit_ref(ifr->ifr_name);
if (ifp == NULL)
return (ENXIO);
error = ifhwioctl(cmd, ifp, data, td);
if (error != ENOIOCTL) {
if_rele(ifp);
return (error);
}
oif_flags = ifp->if_flags;
if (so->so_proto == NULL) {
if_rele(ifp);
return (EOPNOTSUPP);
}
#ifndef COMPAT_43
error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd,
data,
ifp, td));
if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL)
error = (*ifp->if_ioctl)(ifp, cmd, data);
#else
{
u_long ocmd = cmd;
switch (cmd) {
case SIOCSIFDSTADDR:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
#if BYTE_ORDER != BIG_ENDIAN
if (ifr->ifr_addr.sa_family == 0 &&
ifr->ifr_addr.sa_len < 16) {
ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
ifr->ifr_addr.sa_len = 16;
}
#else
if (ifr->ifr_addr.sa_len == 0)
ifr->ifr_addr.sa_len = 16;
#endif
break;
case OSIOCGIFADDR:
cmd = SIOCGIFADDR;
break;
case OSIOCGIFDSTADDR:
cmd = SIOCGIFDSTADDR;
break;
case OSIOCGIFBRDADDR:
cmd = SIOCGIFBRDADDR;
break;
case OSIOCGIFNETMASK:
cmd = SIOCGIFNETMASK;
}
error = ((*so->so_proto->pr_usrreqs->pru_control)(so,
cmd,
data,
ifp, td));
if (error == EOPNOTSUPP && ifp != NULL &&
ifp->if_ioctl != NULL)
error = (*ifp->if_ioctl)(ifp, cmd, data);
switch (ocmd) {
case OSIOCGIFADDR:
case OSIOCGIFDSTADDR:
case OSIOCGIFBRDADDR:
case OSIOCGIFNETMASK:
*(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
}
}
#endif /* COMPAT_43 */
if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
if (ifp->if_flags & IFF_UP) {
int s = splimp();
in6_if_up(ifp);
splx(s);
}
#endif
}
if_rele(ifp);
return (error);
}
/*
* The code common to handling reference counted flags,
* e.g., in ifpromisc() and if_allmulti().
* The "pflag" argument can specify a permanent mode flag to check,
* such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
*
* Only to be used on stack-owned flags, not driver-owned flags.
*/
static int
if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
{
struct ifreq ifr;
int error;
int oldflags, oldcount;
/* Sanity checks to catch programming errors */
KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
("%s: setting driver-owned flag %d", __func__, flag));
if (onswitch)
KASSERT(*refcount >= 0,
("%s: increment negative refcount %d for flag %d",
__func__, *refcount, flag));
else
KASSERT(*refcount > 0,
("%s: decrement non-positive refcount %d for flag %d",
__func__, *refcount, flag));
/* In case this mode is permanent, just touch refcount */
if (ifp->if_flags & pflag) {
*refcount += onswitch ? 1 : -1;
return (0);
}
/* Save ifnet parameters for if_ioctl() may fail */
oldcount = *refcount;
oldflags = ifp->if_flags;
/*
* See if we aren't the only and touching refcount is enough.
* Actually toggle interface flag if we are the first or last.
*/
if (onswitch) {
if ((*refcount)++)
return (0);
ifp->if_flags |= flag;
} else {
if (--(*refcount))
return (0);
ifp->if_flags &= ~flag;
}
/* Call down the driver since we've changed interface flags */
if (ifp->if_ioctl == NULL) {
error = EOPNOTSUPP;
goto recover;
}
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
if (error)
goto recover;
/* Notify userland that interface flags have changed */
rt_ifmsg(ifp);
return (0);
recover:
/* Recover after driver error */
*refcount = oldcount;
ifp->if_flags = oldflags;
return (error);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int error;
int oldflags = ifp->if_flags;
error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
&ifp->if_pcount, pswitch);
/* If promiscuous mode status has changed, log a message */
if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC))
log(LOG_INFO, "%s: promiscuous mode %s\n",
ifp->if_xname,
(ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
return (error);
}
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, caddr_t data)
{
struct ifconf *ifc = (struct ifconf *)data;
#ifdef __amd64__
struct ifconf32 *ifc32 = (struct ifconf32 *)data;
struct ifconf ifc_swab;
#endif
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr;
struct sbuf *sb;
int error, full = 0, valid_len, max_len;
#ifdef __amd64__
if (cmd == SIOCGIFCONF32) {
ifc_swab.ifc_len = ifc32->ifc_len;
ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf;
ifc = &ifc_swab;
}
#endif
/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
max_len = MAXPHYS - 1;
/* Prevent hostile input from being able to crash the system */
if (ifc->ifc_len <= 0)
return (EINVAL);
again:
if (ifc->ifc_len <= max_len) {
max_len = ifc->ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs;
/*
* Zero the ifr_name buffer to make sure we don't
* disclose the contents of the stack.
*/
memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name));
if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
>= sizeof(ifr.ifr_name)) {
sbuf_delete(sb);
IFNET_RUNLOCK();
return (ENAMETOOLONG);
}
addrs = 0;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (prison_if(curthread->td_ucred, sa) != 0)
continue;
addrs++;
#ifdef COMPAT_43
if (cmd == OSIOCGIFCONF) {
struct osockaddr *osa =
(struct osockaddr *)&ifr.ifr_addr;
ifr.ifr_addr = *sa;
osa->sa_family = sa->sa_family;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else
#endif
if (sa->sa_len <= sizeof(*sa)) {
ifr.ifr_addr = *sa;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else {
sbuf_bcat(sb, &ifr,
offsetof(struct ifreq, ifr_addr));
max_len += offsetof(struct ifreq, ifr_addr);
sbuf_bcat(sb, sa, sa->sa_len);
max_len += sa->sa_len;
}
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
IF_ADDR_UNLOCK(ifp);
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
/*
* If we didn't allocate enough space (uncommon), try again. If
* we have already allocated as much space as we are allowed,
* return what we've got.
*/
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc->ifc_len = valid_len;
#ifdef __amd64__
if (cmd == SIOCGIFCONF32)
ifc32->ifc_len = valid_len;
#endif
sbuf_finish(sb);
error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
sbuf_delete(sb);
return (error);
}
/*
* Just like ifpromisc(), but for all-multicast-reception mode.
*/
int
if_allmulti(struct ifnet *ifp, int onswitch)
{
return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
}
struct ifmultiaddr *
if_findmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
IF_ADDR_LOCK_ASSERT(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (sa->sa_family == AF_LINK) {
if (sa_dl_equal(ifma->ifma_addr, sa))
break;
} else {
if (sa_equal(ifma->ifma_addr, sa))
break;
}
}
return ifma;
}
/*
* Allocate a new ifmultiaddr and initialize based on passed arguments. We
* make copies of passed sockaddrs. The ifmultiaddr will not be added to
* the ifnet multicast address list here, so the caller must do that and
* other setup work (such as notifying the device driver). The reference
* count is initialized to 1.
*/
static struct ifmultiaddr *
if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
int mflags)
{
struct ifmultiaddr *ifma;
struct sockaddr *dupsa;
ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
M_ZERO);
if (ifma == NULL)
return (NULL);
dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
free(ifma, M_IFMADDR);
return (NULL);
}
bcopy(sa, dupsa, sa->sa_len);
ifma->ifma_addr = dupsa;
ifma->ifma_ifp = ifp;
ifma->ifma_refcount = 1;
ifma->ifma_protospec = NULL;
if (llsa == NULL) {
ifma->ifma_lladdr = NULL;
return (ifma);
}
dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
free(ifma->ifma_addr, M_IFMADDR);
free(ifma, M_IFMADDR);
return (NULL);
}
bcopy(llsa, dupsa, llsa->sa_len);
ifma->ifma_lladdr = dupsa;
return (ifma);
}
/*
* if_freemulti: free ifmultiaddr structure and possibly attached related
* addresses. The caller is responsible for implementing reference
* counting, notifying the driver, handling routing messages, and releasing
* any dependent link layer state.
*/
static void
if_freemulti(struct ifmultiaddr *ifma)
{
KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
ifma->ifma_refcount));
KASSERT(ifma->ifma_protospec == NULL,
("if_freemulti: protospec not NULL"));
if (ifma->ifma_lladdr != NULL)
free(ifma->ifma_lladdr, M_IFMADDR);
free(ifma->ifma_addr, M_IFMADDR);
free(ifma, M_IFMADDR);
}
/*
* Register an additional multicast address with a network interface.
*
* - If the address is already present, bump the reference count on the
* address and return.
* - If the address is not link-layer, look up a link layer address.
* - Allocate address structures for one or both addresses, and attach to the
* multicast address list on the interface. If automatically adding a link
* layer address, the protocol address will own a reference to the link
* layer address, to be freed when it is freed.
* - Notify the network device driver of an addition to the multicast address
* list.
*
* 'sa' points to caller-owned memory with the desired multicast address.
*
* 'retifma' will be used to return a pointer to the resulting multicast
* address reference, if desired.
*/
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
struct ifmultiaddr **retifma)
{
struct ifmultiaddr *ifma, *ll_ifma;
struct sockaddr *llsa;
int error;
/*
* If the address is already present, return a new reference to it;
* otherwise, allocate storage and set up a new address.
*/
IF_ADDR_LOCK(ifp);
ifma = if_findmulti(ifp, sa);
if (ifma != NULL) {
ifma->ifma_refcount++;
if (retifma != NULL)
*retifma = ifma;
IF_ADDR_UNLOCK(ifp);
return (0);
}
/*
* The address isn't already present; resolve the protocol address
* into a link layer address, and then look that up, bump its
* refcount or allocate an ifma for that also. If 'llsa' was
* returned, we will need to free it later.
*/
llsa = NULL;
ll_ifma = NULL;
if (ifp->if_resolvemulti != NULL) {
error = ifp->if_resolvemulti(ifp, &llsa, sa);
if (error)
goto unlock_out;
}
/*
* Allocate the new address. Don't hook it up yet, as we may also
* need to allocate a link layer multicast address.
*/
ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
if (ifma == NULL) {
error = ENOMEM;
goto free_llsa_out;
}
/*
* If a link layer address is found, we'll need to see if it's
* already present in the address list, or allocate is as well.
* When this block finishes, the link layer address will be on the
* list.
*/
if (llsa != NULL) {
ll_ifma = if_findmulti(ifp, llsa);
if (ll_ifma == NULL) {
ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
if (ll_ifma == NULL) {
--ifma->ifma_refcount;
if_freemulti(ifma);
error = ENOMEM;
goto free_llsa_out;
}
TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
} else
ll_ifma->ifma_refcount++;
ifma->ifma_llifma = ll_ifma;
}
/*
* We now have a new multicast address, ifma, and possibly a new or
* referenced link layer address. Add the primary address to the
* ifnet address list.
*/
TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
if (retifma != NULL)
*retifma = ifma;
/*
* Must generate the message while holding the lock so that 'ifma'
* pointer is still valid.
*/
rt_newmaddrmsg(RTM_NEWMADDR, ifma);
IF_ADDR_UNLOCK(ifp);
/*
* We are certain we have added something, so call down to the
* interface to let them know about it.
*/
if (ifp->if_ioctl != NULL) {
(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
}
if (llsa != NULL)
free(llsa, M_IFMADDR);
return (0);
free_llsa_out:
if (llsa != NULL)
free(llsa, M_IFMADDR);
unlock_out:
IF_ADDR_UNLOCK(ifp);
return (error);
}
/*
* Delete a multicast group membership by network-layer group address.
*
* Returns ENOENT if the entry could not be found. If ifp no longer
* exists, results are undefined. This entry point should only be used
* from subsystems which do appropriate locking to hold ifp for the
* duration of the call.
* Network-layer protocol domains must use if_delmulti_ifma().
*/
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
int lastref;
#ifdef INVARIANTS
struct ifnet *oifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
if (ifp != oifp)
ifp = NULL;
IFNET_RUNLOCK_NOSLEEP();
KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
#endif
if (ifp == NULL)
return (ENOENT);
IF_ADDR_LOCK(ifp);
lastref = 0;
ifma = if_findmulti(ifp, sa);
if (ifma != NULL)
lastref = if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_UNLOCK(ifp);
if (ifma == NULL)
return (ENOENT);
if (lastref && ifp->if_ioctl != NULL) {
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
}
return (0);
}
/*
* Delete all multicast group membership for an interface.
* Should be used to quickly flush all multicast filters.
*/
void
if_delallmulti(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
struct ifmultiaddr *next;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_UNLOCK(ifp);
}
/*
* Delete a multicast group membership by group membership pointer.
* Network-layer protocol domains must use this routine.
*
* It is safe to call this routine if the ifp disappeared.
*/
void
if_delmulti_ifma(struct ifmultiaddr *ifma)
{
struct ifnet *ifp;
int lastref;
ifp = ifma->ifma_ifp;
#ifdef DIAGNOSTIC
if (ifp == NULL) {
printf("%s: ifma_ifp seems to be detached\n", __func__);
} else {
struct ifnet *oifp;
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
if (ifp != oifp) {
printf("%s: ifnet %p disappeared\n", __func__, ifp);
ifp = NULL;
}
IFNET_RUNLOCK_NOSLEEP();
}
#endif
/*
* If and only if the ifnet instance exists: Acquire the address lock.
*/
if (ifp != NULL)
IF_ADDR_LOCK(ifp);
lastref = if_delmulti_locked(ifp, ifma, 0);
if (ifp != NULL) {
/*
* If and only if the ifnet instance exists:
* Release the address lock.
* If the group was left: update the hardware hash filter.
*/
IF_ADDR_UNLOCK(ifp);
if (lastref && ifp->if_ioctl != NULL) {
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
}
}
}
/*
* Perform deletion of network-layer and/or link-layer multicast address.
*
* Return 0 if the reference count was decremented.
* Return 1 if the final reference was released, indicating that the
* hardware hash filter should be reprogrammed.
*/
static int
if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
{
struct ifmultiaddr *ll_ifma;
if (ifp != NULL && ifma->ifma_ifp != NULL) {
KASSERT(ifma->ifma_ifp == ifp,
("%s: inconsistent ifp %p", __func__, ifp));
IF_ADDR_LOCK_ASSERT(ifp);
}
ifp = ifma->ifma_ifp;
/*
* If the ifnet is detaching, null out references to ifnet,
* so that upper protocol layers will notice, and not attempt
* to obtain locks for an ifnet which no longer exists. The
* routing socket announcement must happen before the ifnet
* instance is detached from the system.
*/
if (detaching) {
#ifdef DIAGNOSTIC
printf("%s: detaching ifnet instance %p\n", __func__, ifp);
#endif
/*
* ifp may already be nulled out if we are being reentered
* to delete the ll_ifma.
*/
if (ifp != NULL) {
rt_newmaddrmsg(RTM_DELMADDR, ifma);
ifma->ifma_ifp = NULL;
}
}
if (--ifma->ifma_refcount > 0)
return 0;
/*
* If this ifma is a network-layer ifma, a link-layer ifma may
* have been associated with it. Release it first if so.
*/
ll_ifma = ifma->ifma_llifma;
if (ll_ifma != NULL) {
KASSERT(ifma->ifma_lladdr != NULL,
("%s: llifma w/o lladdr", __func__));
if (detaching)
ll_ifma->ifma_ifp = NULL; /* XXX */
if (--ll_ifma->ifma_refcount == 0) {
if (ifp != NULL) {
TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
}
if_freemulti(ll_ifma);
}
}
if (ifp != NULL)
TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
if_freemulti(ifma);
/*
* The last reference to this instance of struct ifmultiaddr
* was released; the hardware should be notified of this change.
*/
return 1;
}
/*
* Set the link layer address on an interface.
*
* At this time we only support certain types of interfaces,
* and we don't allow the length of the address to change.
*/
int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
struct ifreq ifr;
IF_ADDR_LOCK(ifp);
ifa = ifp->if_addr;
if (ifa == NULL) {
IF_ADDR_UNLOCK(ifp);
return (EINVAL);
}
ifa_ref(ifa);
IF_ADDR_UNLOCK(ifp);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if (sdl == NULL) {
ifa_free(ifa);
return (EINVAL);
}
if (len != sdl->sdl_alen) { /* don't allow length to change */
ifa_free(ifa);
return (EINVAL);
}
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_XETHER:
case IFT_ISO88025:
case IFT_L2VLAN:
case IFT_BRIDGE:
case IFT_ARCNET:
case IFT_IEEE8023ADLAG:
case IFT_IEEE80211:
bcopy(lladdr, LLADDR(sdl), len);
ifa_free(ifa);
break;
default:
ifa_free(ifa);
return (ENODEV);
}
/*
* If the interface is already up, we need
* to re-init it in order to reprogram its
* address filter.
*/
if ((ifp->if_flags & IFF_UP) != 0) {
if (ifp->if_ioctl) {
ifp->if_flags &= ~IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
ifp->if_flags |= IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
}
#ifdef INET
/*
* Also send gratuitous ARPs to notify other nodes about
* the address change.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == AF_INET)
arp_ifinit(ifp, ifa);
}
#endif
}
return (0);
}
/*
* The name argument must be a pointer to storage which will last as
* long as the interface does. For physical devices, the result of
* device_get_name(dev) is a good choice and for pseudo-devices a
* static string works well.
*/
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
ifp->if_dname = name;
ifp->if_dunit = unit;
if (unit != IF_DUNIT_NONE)
snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
else
strlcpy(ifp->if_xname, name, IFNAMSIZ);
}
int
if_printf(struct ifnet *ifp, const char * fmt, ...)
{
va_list ap;
int retval;
retval = printf("%s: ", ifp->if_xname);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
void
if_start(struct ifnet *ifp)
{
(*(ifp)->if_start)(ifp);
}
/*
* Backwards compatibility interface for drivers
* that have not implemented it
*/
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
int error;
IFQ_HANDOFF(ifp, m, error);
return (error);
}
int
if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
{
int active = 0;
IF_LOCK(ifq);
if (_IF_QFULL(ifq)) {
_IF_DROP(ifq);
IF_UNLOCK(ifq);
m_freem(m);
return (0);
}
if (ifp != NULL) {
ifp->if_obytes += m->m_pkthdr.len + adjust;
if (m->m_flags & (M_BCAST|M_MCAST))
ifp->if_omcasts++;
active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
}
_IF_ENQUEUE(ifq, m);
IF_UNLOCK(ifq);
if (ifp != NULL && !active)
(*(ifp)->if_start)(ifp);
return (1);
}
void
if_register_com_alloc(u_char type,
if_com_alloc_t *a, if_com_free_t *f)
{
KASSERT(if_com_alloc[type] == NULL,
("if_register_com_alloc: %d already registered", type));
KASSERT(if_com_free[type] == NULL,
("if_register_com_alloc: %d free already registered", type));
if_com_alloc[type] = a;
if_com_free[type] = f;
}
void
if_deregister_com_alloc(u_char type)
{
KASSERT(if_com_alloc[type] != NULL,
("if_deregister_com_alloc: %d not registered", type));
KASSERT(if_com_free[type] != NULL,
("if_deregister_com_alloc: %d free not registered", type));
if_com_alloc[type] = NULL;
if_com_free[type] = NULL;
}
#ifdef DDB
static void
if_show_ifnet(struct ifnet *ifp)
{
if (ifp == NULL)
return;
db_printf("%s:\n", ifp->if_xname);
#define IF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, ifp->e);
IF_DB_PRINTF("%s", if_dname);
IF_DB_PRINTF("%d", if_dunit);
IF_DB_PRINTF("%s", if_description);
IF_DB_PRINTF("%u", if_index);
IF_DB_PRINTF("%u", if_refcount);
IF_DB_PRINTF("%p", if_softc);
IF_DB_PRINTF("%p", if_l2com);
IF_DB_PRINTF("%p", if_vnet);
IF_DB_PRINTF("%p", if_home_vnet);
IF_DB_PRINTF("%p", if_addr);
IF_DB_PRINTF("%p", if_llsoftc);
IF_DB_PRINTF("%p", if_label);
IF_DB_PRINTF("%u", if_pcount);
IF_DB_PRINTF("0x%08x", if_flags);
IF_DB_PRINTF("0x%08x", if_drv_flags);
IF_DB_PRINTF("0x%08x", if_capabilities);
IF_DB_PRINTF("0x%08x", if_capenable);
IF_DB_PRINTF("%p", if_snd.ifq_head);
IF_DB_PRINTF("%p", if_snd.ifq_tail);
IF_DB_PRINTF("%d", if_snd.ifq_len);
IF_DB_PRINTF("%d", if_snd.ifq_maxlen);
IF_DB_PRINTF("%d", if_snd.ifq_drops);
IF_DB_PRINTF("%p", if_snd.ifq_drv_head);
IF_DB_PRINTF("%p", if_snd.ifq_drv_tail);
IF_DB_PRINTF("%d", if_snd.ifq_drv_len);
IF_DB_PRINTF("%d", if_snd.ifq_drv_maxlen);
IF_DB_PRINTF("%d", if_snd.altq_type);
IF_DB_PRINTF("%x", if_snd.altq_flags);
#undef IF_DB_PRINTF
}
DB_SHOW_COMMAND(ifnet, db_show_ifnet)
{
if (!have_addr) {
db_printf("usage: show ifnet <struct ifnet *>\n");
return;
}
if_show_ifnet((struct ifnet *)addr);
}
DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets)
{
VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
u_short idx;
VNET_FOREACH(vnet_iter) {
CURVNET_SET_QUIET(vnet_iter);
#ifdef VIMAGE
db_printf("vnet=%p\n", curvnet);
#endif
for (idx = 1; idx <= V_if_index; idx++) {
ifp = V_ifindex_table[idx].ife_ifnet;
if (ifp == NULL)
continue;
db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp);
if (db_pager_quit)
break;
}
CURVNET_RESTORE();
}
}
#endif
Index: stable/8/sys/net/if_var.h
===================================================================
--- stable/8/sys/net/if_var.h (revision 209276)
+++ stable/8/sys/net/if_var.h (revision 209277)
@@ -1,904 +1,904 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* From: @(#)if.h 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#ifndef _NET_IF_VAR_H_
#define _NET_IF_VAR_H_
/*
* Structures defining a network interface, providing a packet
* transport mechanism (ala level 0 of the PUP protocols).
*
* Each interface accepts output datagrams of a specified maximum
* length, and provides higher level routines with input datagrams
* received from its medium.
*
* Output occurs when the routine if_output is called, with three parameters:
* (*ifp->if_output)(ifp, m, dst, rt)
* Here m is the mbuf chain to be sent and dst is the destination address.
* The output routine encapsulates the supplied datagram if necessary,
* and then transmits it on its medium.
*
* On input, each interface unwraps the data received by it, and either
* places it on the input queue of an internetwork datagram routine
* and posts the associated software interrupt, or passes the datagram to a raw
* packet input routine.
*
* Routines exist for locating interfaces by their addresses
* or for locating an interface on a certain network, as well as more general
* routing and gateway routines maintaining information used to locate
* interfaces. These routines live in the files if.c and route.c
*/
#ifdef __STDC__
/*
* Forward structure declarations for function prototypes [sic].
*/
struct mbuf;
struct thread;
struct rtentry;
struct rt_addrinfo;
struct socket;
struct ether_header;
struct carp_if;
struct ifvlantrunk;
struct route;
struct vnet;
#endif
#include <sys/queue.h> /* get TAILQ macros */
#ifdef _KERNEL
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <sys/buf_ring.h>
#include <net/vnet.h>
#endif /* _KERNEL */
#include <sys/lock.h> /* XXX */
#include <sys/mutex.h> /* XXX */
#include <sys/rwlock.h> /* XXX */
#include <sys/sx.h> /* XXX */
#include <sys/event.h> /* XXX */
#include <sys/_task.h>
#define IF_DUNIT_NONE -1
#include <altq/if_altq.h>
TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */
TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */
TAILQ_HEAD(ifprefixhead, ifprefix);
TAILQ_HEAD(ifmultihead, ifmultiaddr);
TAILQ_HEAD(ifgrouphead, ifg_group);
/*
* Structure defining a queue for a network interface.
*/
struct ifqueue {
struct mbuf *ifq_head;
struct mbuf *ifq_tail;
int ifq_len;
int ifq_maxlen;
int ifq_drops;
struct mtx ifq_mtx;
};
/*
* Structure defining a network interface.
*
* (Would like to call this struct ``if'', but C isn't PL/1.)
*/
struct ifnet {
void *if_softc; /* pointer to driver state */
void *if_l2com; /* pointer to protocol bits */
struct vnet *if_vnet; /* pointer to network stack instance */
TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */
char if_xname[IFNAMSIZ]; /* external name (name + unit) */
const char *if_dname; /* driver name */
int if_dunit; /* unit or IF_DUNIT_NONE */
u_int if_refcount; /* reference count */
struct ifaddrhead if_addrhead; /* linked list of addresses per if */
/*
* if_addrhead is the list of all addresses associated to
* an interface.
* Some code in the kernel assumes that first element
* of the list has type AF_LINK, and contains sockaddr_dl
* addresses which store the link-level address and the name
* of the interface.
* However, access to the AF_LINK address through this
* field is deprecated. Use if_addr or ifaddr_byindex() instead.
*/
int if_pcount; /* number of promiscuous listeners */
struct carp_if *if_carp; /* carp interface structure */
struct bpf_if *if_bpf; /* packet filter structure */
u_short if_index; /* numeric abbreviation for this if */
short if_timer; /* time 'til if_watchdog called */
struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
int if_flags; /* up/down, broadcast, etc. */
int if_capabilities; /* interface features & capabilities */
int if_capenable; /* enabled features & capabilities */
void *if_linkmib; /* link-type-specific MIB data */
size_t if_linkmiblen; /* length of above data */
struct if_data if_data;
struct ifmultihead if_multiaddrs; /* multicast addresses configured */
int if_amcount; /* number of all-multicast requests */
/* procedure handles */
int (*if_output) /* output routine (enqueue) */
(struct ifnet *, struct mbuf *, struct sockaddr *,
struct route *);
void (*if_input) /* input routine (from h/w driver) */
(struct ifnet *, struct mbuf *);
void (*if_start) /* initiate output routine */
(struct ifnet *);
int (*if_ioctl) /* ioctl routine */
(struct ifnet *, u_long, caddr_t);
void (*if_watchdog) /* timer routine */
(struct ifnet *);
void (*if_init) /* Init routine */
(void *);
int (*if_resolvemulti) /* validate/resolve multicast */
(struct ifnet *, struct sockaddr **, struct sockaddr *);
void (*if_qflush) /* flush any queues */
(struct ifnet *);
int (*if_transmit) /* initiate output routine */
(struct ifnet *, struct mbuf *);
void (*if_reassign) /* reassign to vnet routine */
(struct ifnet *, struct vnet *, char *);
struct vnet *if_home_vnet; /* where this ifnet originates from */
struct ifaddr *if_addr; /* pointer to link-level address */
void *if_llsoftc; /* link layer softc */
int if_drv_flags; /* driver-managed status flags */
struct ifaltq if_snd; /* output queue (includes altq) */
const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
void *if_bridge; /* bridge glue */
struct label *if_label; /* interface MAC label */
/* these are only used by IPv6 */
struct ifprefixhead if_prefixhead; /* list of prefixes per if */
void *if_afdata[AF_MAX];
int if_afdata_initialized;
struct rwlock if_afdata_lock;
struct task if_linktask; /* task for link change events */
struct mtx if_addr_mtx; /* mutex to protect address lists */
LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */
TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */
/* protected by if_addr_mtx */
void *if_pf_kif;
void *if_lagg; /* lagg glue */
u_char if_alloctype; /* if_type at time of allocation */
/*
* Spare fields are added so that we can modify sensitive data
* structures without changing the kernel binary interface, and must
* be used with care where binary compatibility is required.
*/
char if_cspare[3];
char *if_description; /* interface description */
void *if_pspare[7];
int if_ispare[4];
};
typedef void if_init_f_t(void *);
/*
* XXX These aliases are terribly dangerous because they could apply
* to anything.
*/
#define if_mtu if_data.ifi_mtu
#define if_type if_data.ifi_type
#define if_physical if_data.ifi_physical
#define if_addrlen if_data.ifi_addrlen
#define if_hdrlen if_data.ifi_hdrlen
#define if_metric if_data.ifi_metric
#define if_link_state if_data.ifi_link_state
#define if_baudrate if_data.ifi_baudrate
#define if_hwassist if_data.ifi_hwassist
#define if_ipackets if_data.ifi_ipackets
#define if_ierrors if_data.ifi_ierrors
#define if_opackets if_data.ifi_opackets
#define if_oerrors if_data.ifi_oerrors
#define if_collisions if_data.ifi_collisions
#define if_ibytes if_data.ifi_ibytes
#define if_obytes if_data.ifi_obytes
#define if_imcasts if_data.ifi_imcasts
#define if_omcasts if_data.ifi_omcasts
#define if_iqdrops if_data.ifi_iqdrops
#define if_noproto if_data.ifi_noproto
#define if_lastchange if_data.ifi_lastchange
/* for compatibility with other BSDs */
#define if_addrlist if_addrhead
#define if_list if_link
#define if_name(ifp) ((ifp)->if_xname)
/*
* Locks for address lists on the network interface.
*/
#define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_mtx, \
"if_addr_mtx", NULL, MTX_DEF)
#define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_mtx)
#define IF_ADDR_LOCK(if) mtx_lock(&(if)->if_addr_mtx)
#define IF_ADDR_UNLOCK(if) mtx_unlock(&(if)->if_addr_mtx)
#define IF_ADDR_LOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED)
/*
* Function variations on locking macros intended to be used by loadable
* kernel modules in order to divorce them from the internals of address list
* locking.
*/
void if_addr_rlock(struct ifnet *ifp); /* if_addrhead */
void if_addr_runlock(struct ifnet *ifp); /* if_addrhead */
void if_maddr_rlock(struct ifnet *ifp); /* if_multiaddrs */
void if_maddr_runlock(struct ifnet *ifp); /* if_multiaddrs */
/*
* Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq)
* are queues of messages stored on ifqueue structures
* (defined above). Entries are added to and deleted from these structures
* by these macros, which should be called with ipl raised to splimp().
*/
#define IF_LOCK(ifq) mtx_lock(&(ifq)->ifq_mtx)
#define IF_UNLOCK(ifq) mtx_unlock(&(ifq)->ifq_mtx)
#define IF_LOCK_ASSERT(ifq) mtx_assert(&(ifq)->ifq_mtx, MA_OWNED)
#define _IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen)
#define _IF_DROP(ifq) ((ifq)->ifq_drops++)
#define _IF_QLEN(ifq) ((ifq)->ifq_len)
#define _IF_ENQUEUE(ifq, m) do { \
(m)->m_nextpkt = NULL; \
if ((ifq)->ifq_tail == NULL) \
(ifq)->ifq_head = m; \
else \
(ifq)->ifq_tail->m_nextpkt = m; \
(ifq)->ifq_tail = m; \
(ifq)->ifq_len++; \
} while (0)
#define IF_ENQUEUE(ifq, m) do { \
IF_LOCK(ifq); \
_IF_ENQUEUE(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_PREPEND(ifq, m) do { \
(m)->m_nextpkt = (ifq)->ifq_head; \
if ((ifq)->ifq_tail == NULL) \
(ifq)->ifq_tail = (m); \
(ifq)->ifq_head = (m); \
(ifq)->ifq_len++; \
} while (0)
#define IF_PREPEND(ifq, m) do { \
IF_LOCK(ifq); \
_IF_PREPEND(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_DEQUEUE(ifq, m) do { \
(m) = (ifq)->ifq_head; \
if (m) { \
if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \
(ifq)->ifq_tail = NULL; \
(m)->m_nextpkt = NULL; \
(ifq)->ifq_len--; \
} \
} while (0)
#define IF_DEQUEUE(ifq, m) do { \
IF_LOCK(ifq); \
_IF_DEQUEUE(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head)
#define IF_POLL(ifq, m) _IF_POLL(ifq, m)
#define _IF_DRAIN(ifq) do { \
struct mbuf *m; \
for (;;) { \
_IF_DEQUEUE(ifq, m); \
if (m == NULL) \
break; \
m_freem(m); \
} \
} while (0)
#define IF_DRAIN(ifq) do { \
IF_LOCK(ifq); \
_IF_DRAIN(ifq); \
IF_UNLOCK(ifq); \
} while(0)
#ifdef _KERNEL
/* interface link layer address change event */
typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(iflladdr_event, iflladdr_event_handler_t);
/* interface address change event */
typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
/* new interface arrival event */
typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
/* interface departure event */
typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
/*
* interface groups
*/
struct ifg_group {
char ifg_group[IFNAMSIZ];
u_int ifg_refcnt;
void *ifg_pf_kif;
TAILQ_HEAD(, ifg_member) ifg_members;
TAILQ_ENTRY(ifg_group) ifg_next;
};
struct ifg_member {
TAILQ_ENTRY(ifg_member) ifgm_next;
struct ifnet *ifgm_ifp;
};
struct ifg_list {
struct ifg_group *ifgl_group;
TAILQ_ENTRY(ifg_list) ifgl_next;
};
/* group attach event */
typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
/* group detach event */
typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
/* group change event */
typedef void (*group_change_event_handler_t)(void *, const char *);
EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
#define IF_AFDATA_LOCK_INIT(ifp) \
rw_init(&(ifp)->if_afdata_lock, "if_afdata")
#define IF_AFDATA_WLOCK(ifp) rw_wlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_RLOCK(ifp) rw_rlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_WUNLOCK(ifp) rw_wunlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_RUNLOCK(ifp) rw_runlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp)
#define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp)
#define IF_AFDATA_TRYLOCK(ifp) rw_try_wlock(&(ifp)->if_afdata_lock)
#define IF_AFDATA_DESTROY(ifp) rw_destroy(&(ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_LOCKED)
#define IF_AFDATA_UNLOCK_ASSERT(ifp) rw_assert(&(ifp)->if_afdata_lock, RA_UNLOCKED)
int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp,
int adjust);
#define IF_HANDOFF(ifq, m, ifp) \
if_handoff((struct ifqueue *)ifq, m, ifp, 0)
#define IF_HANDOFF_ADJ(ifq, m, ifp, adj) \
if_handoff((struct ifqueue *)ifq, m, ifp, adj)
void if_start(struct ifnet *);
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
IF_LOCK(ifq); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_ENQUEUE(ifq, m, NULL, err); \
else { \
if (_IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
_IF_ENQUEUE(ifq, m); \
(err) = 0; \
} \
} \
if (err) \
(ifq)->ifq_drops++; \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_DEQUEUE_NOLOCK(ifq, m) \
do { \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue_ptr(ifq, ALTDQ_REMOVE); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_DEQUEUE(ifq, m); \
else \
_IF_DEQUEUE(ifq, m); \
} while (0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
IF_LOCK(ifq); \
IFQ_DEQUEUE_NOLOCK(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_POLL_NOLOCK(ifq, m) \
do { \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue_ptr(ifq, ALTDQ_POLL); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_POLL(ifq, m); \
else \
_IF_POLL(ifq, m); \
} while (0)
#define IFQ_POLL(ifq, m) \
do { \
IF_LOCK(ifq); \
IFQ_POLL_NOLOCK(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_PURGE_NOLOCK(ifq) \
do { \
if (ALTQ_IS_ENABLED(ifq)) { \
ALTQ_PURGE(ifq); \
} else \
_IF_DRAIN(ifq); \
} while (0)
#define IFQ_PURGE(ifq) \
do { \
IF_LOCK(ifq); \
IFQ_PURGE_NOLOCK(ifq); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_SET_READY(ifq) \
do { ((ifq)->altq_flags |= ALTQF_READY); } while (0)
#define IFQ_LOCK(ifq) IF_LOCK(ifq)
#define IFQ_UNLOCK(ifq) IF_UNLOCK(ifq)
#define IFQ_LOCK_ASSERT(ifq) IF_LOCK_ASSERT(ifq)
#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0)
#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++)
#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len)
#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++)
#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len))
/*
* The IFF_DRV_OACTIVE test should really occur in the device driver, not in
* the handoff logic, as that flag is locked by the device driver.
*/
#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \
do { \
int len; \
short mflags; \
\
len = (m)->m_pkthdr.len; \
mflags = (m)->m_flags; \
IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \
if ((err) == 0) { \
(ifp)->if_obytes += len + (adj); \
if (mflags & M_MCAST) \
(ifp)->if_omcasts++; \
if (((ifp)->if_drv_flags & IFF_DRV_OACTIVE) == 0) \
if_start(ifp); \
} \
} while (0)
#define IFQ_HANDOFF(ifp, m, err) \
IFQ_HANDOFF_ADJ(ifp, m, 0, err)
#define IFQ_DRV_DEQUEUE(ifq, m) \
do { \
(m) = (ifq)->ifq_drv_head; \
if (m) { \
if (((ifq)->ifq_drv_head = (m)->m_nextpkt) == NULL) \
(ifq)->ifq_drv_tail = NULL; \
(m)->m_nextpkt = NULL; \
(ifq)->ifq_drv_len--; \
} else { \
IFQ_LOCK(ifq); \
IFQ_DEQUEUE_NOLOCK(ifq, m); \
while ((ifq)->ifq_drv_len < (ifq)->ifq_drv_maxlen) { \
struct mbuf *m0; \
IFQ_DEQUEUE_NOLOCK(ifq, m0); \
if (m0 == NULL) \
break; \
m0->m_nextpkt = NULL; \
if ((ifq)->ifq_drv_tail == NULL) \
(ifq)->ifq_drv_head = m0; \
else \
(ifq)->ifq_drv_tail->m_nextpkt = m0; \
(ifq)->ifq_drv_tail = m0; \
(ifq)->ifq_drv_len++; \
} \
IFQ_UNLOCK(ifq); \
} \
} while (0)
#define IFQ_DRV_PREPEND(ifq, m) \
do { \
(m)->m_nextpkt = (ifq)->ifq_drv_head; \
if ((ifq)->ifq_drv_tail == NULL) \
(ifq)->ifq_drv_tail = (m); \
(ifq)->ifq_drv_head = (m); \
(ifq)->ifq_drv_len++; \
} while (0)
#define IFQ_DRV_IS_EMPTY(ifq) \
(((ifq)->ifq_drv_len == 0) && ((ifq)->ifq_len == 0))
#define IFQ_DRV_PURGE(ifq) \
do { \
struct mbuf *m, *n = (ifq)->ifq_drv_head; \
while((m = n) != NULL) { \
n = m->m_nextpkt; \
m_freem(m); \
} \
(ifq)->ifq_drv_head = (ifq)->ifq_drv_tail = NULL; \
(ifq)->ifq_drv_len = 0; \
IFQ_PURGE(ifq); \
} while (0)
#ifdef _KERNEL
static __inline void
drbr_stats_update(struct ifnet *ifp, int len, int mflags)
{
#ifndef NO_SLOW_STATS
ifp->if_obytes += len;
if (mflags & M_MCAST)
ifp->if_omcasts++;
#endif
}
static __inline int
drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
{
int error = 0;
int len = m->m_pkthdr.len;
int mflags = m->m_flags;
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_ENQUEUE(&ifp->if_snd, m, error);
return (error);
}
#endif
if ((error = buf_ring_enqueue_bytes(br, m, len)) == ENOBUFS) {
br->br_drops++;
m_freem(m);
} else
drbr_stats_update(ifp, len, mflags);
return (error);
}
static __inline void
drbr_flush(struct ifnet *ifp, struct buf_ring *br)
{
struct mbuf *m;
#ifdef ALTQ
if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd))
IFQ_PURGE(&ifp->if_snd);
#endif
while ((m = buf_ring_dequeue_sc(br)) != NULL)
m_freem(m);
}
static __inline void
drbr_free(struct buf_ring *br, struct malloc_type *type)
{
drbr_flush(NULL, br);
buf_ring_free(br, type);
}
static __inline struct mbuf *
drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
struct mbuf *m;
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_DEQUEUE(&ifp->if_snd, m);
return (m);
}
#endif
return (buf_ring_dequeue_sc(br));
}
static __inline struct mbuf *
drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
int (*func) (struct mbuf *, void *), void *arg)
{
struct mbuf *m;
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_LOCK(&ifp->if_snd);
IFQ_POLL_NOLOCK(&ifp->if_snd, m);
if (m != NULL && func(m, arg) == 0) {
IFQ_UNLOCK(&ifp->if_snd);
return (NULL);
}
IFQ_DEQUEUE_NOLOCK(&ifp->if_snd, m);
IFQ_UNLOCK(&ifp->if_snd);
return (m);
}
#endif
m = buf_ring_peek(br);
if (m == NULL || func(m, arg) == 0)
return (NULL);
return (buf_ring_dequeue_sc(br));
}
static __inline int
drbr_empty(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
return (IFQ_IS_EMPTY(&ifp->if_snd));
#endif
return (buf_ring_empty(br));
}
static __inline int
drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
return (1);
#endif
return (!buf_ring_empty(br));
}
static __inline int
drbr_inuse(struct ifnet *ifp, struct buf_ring *br)
{
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
return (ifp->if_snd.ifq_len);
#endif
return (buf_ring_count(br));
}
#endif
/*
* 72 was chosen below because it is the size of a TCP/IP
* header (40) + the minimum mss (32).
*/
#define IF_MINMTU 72
#define IF_MAXMTU 65535
#endif /* _KERNEL */
/*
* The ifaddr structure contains information about one address
* of an interface. They are maintained by the different address families,
* are allocated and attached when an address is set, and are linked
* together so all addresses for an interface can be located.
*
* NOTE: a 'struct ifaddr' is always at the beginning of a larger
* chunk of malloc'ed memory, where we store the three addresses
* (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
*/
struct ifaddr {
struct sockaddr *ifa_addr; /* address of interface */
struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */
#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */
struct sockaddr *ifa_netmask; /* used to determine subnet */
struct if_data if_data; /* not all members are meaningful */
struct ifnet *ifa_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
(int, struct rtentry *, struct rt_addrinfo *);
u_short ifa_flags; /* mostly rt_flags for cloning */
u_int ifa_refcnt; /* references to this structure */
int ifa_metric; /* cost of going out this interface */
int (*ifa_claim_addr) /* check if an addr goes to this if */
(struct ifaddr *, struct sockaddr *);
struct mtx ifa_mtx;
};
#define IFA_ROUTE RTF_UP /* route installed */
#define IFA_RTSELF RTF_HOST /* loopback route to self installed */
/* for compatibility with other BSDs */
#define ifa_list ifa_link
#ifdef _KERNEL
#define IFA_LOCK(ifa) mtx_lock(&(ifa)->ifa_mtx)
#define IFA_UNLOCK(ifa) mtx_unlock(&(ifa)->ifa_mtx)
void ifa_free(struct ifaddr *ifa);
void ifa_init(struct ifaddr *ifa);
void ifa_ref(struct ifaddr *ifa);
#endif
/*
* The prefix structure contains information about one prefix
* of an interface. They are maintained by the different address families,
* are allocated and attached when a prefix or an address is set,
* and are linked together so all prefixes for an interface can be located.
*/
struct ifprefix {
struct sockaddr *ifpr_prefix; /* prefix of interface */
struct ifnet *ifpr_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifprefix) ifpr_list; /* queue macro glue */
u_char ifpr_plen; /* prefix length in bits */
u_char ifpr_type; /* protocol dependent prefix type */
};
/*
* Multicast address structure. This is analogous to the ifaddr
* structure except that it keeps track of multicast addresses.
*/
struct ifmultiaddr {
TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
struct sockaddr *ifma_addr; /* address this membership is for */
struct sockaddr *ifma_lladdr; /* link-layer translation, if any */
struct ifnet *ifma_ifp; /* back-pointer to interface */
u_int ifma_refcount; /* reference count */
void *ifma_protospec; /* protocol-specific state, if any */
struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
};
#ifdef _KERNEL
extern struct rwlock ifnet_rwlock;
extern struct sx ifnet_sxlock;
#define IFNET_LOCK_INIT() do { \
rw_init_flags(&ifnet_rwlock, "ifnet_rw", RW_RECURSE); \
sx_init_flags(&ifnet_sxlock, "ifnet_sx", SX_RECURSE); \
} while(0)
#define IFNET_WLOCK() do { \
sx_xlock(&ifnet_sxlock); \
rw_wlock(&ifnet_rwlock); \
} while (0)
#define IFNET_WUNLOCK() do { \
rw_wunlock(&ifnet_rwlock); \
sx_xunlock(&ifnet_sxlock); \
} while (0)
/*
* To assert the ifnet lock, you must know not only whether it's for read or
* write, but also whether it was acquired with sleep support or not.
*/
#define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED)
#define IFNET_RLOCK_NOSLEEP_ASSERT() rw_assert(&ifnet_rwlock, RA_RLOCKED)
#define IFNET_WLOCK_ASSERT() do { \
sx_assert(&ifnet_sxlock, SA_XLOCKED); \
rw_assert(&ifnet_rwlock, RA_WLOCKED); \
} while (0)
#define IFNET_RLOCK() sx_slock(&ifnet_sxlock)
#define IFNET_RLOCK_NOSLEEP() rw_rlock(&ifnet_rwlock)
#define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock)
#define IFNET_RUNLOCK_NOSLEEP() rw_runlock(&ifnet_rwlock)
/*
* Look up an ifnet given its index; the _ref variant also acquires a
* reference that must be freed using if_rele(). It is almost always a bug
* to call ifnet_byindex() instead if ifnet_byindex_ref().
*/
struct ifnet *ifnet_byindex(u_short idx);
struct ifnet *ifnet_byindex_locked(u_short idx);
struct ifnet *ifnet_byindex_ref(u_short idx);
/*
* Given the index, ifaddr_byindex() returns the one and only
* link-level ifaddr for the interface. You are not supposed to use
* it to traverse the list of addresses associated to the interface.
*/
struct ifaddr *ifaddr_byindex(u_short idx);
VNET_DECLARE(struct ifnethead, ifnet);
VNET_DECLARE(struct ifgrouphead, ifg_head);
VNET_DECLARE(int, if_index);
VNET_DECLARE(struct ifnet *, loif); /* first loopback interface */
VNET_DECLARE(int, useloopback);
#define V_ifnet VNET(ifnet)
#define V_ifg_head VNET(ifg_head)
#define V_if_index VNET(if_index)
#define V_loif VNET(loif)
#define V_useloopback VNET(useloopback)
extern int ifqmaxlen;
int if_addgroup(struct ifnet *, const char *);
int if_delgroup(struct ifnet *, const char *);
int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
int if_allmulti(struct ifnet *, int);
struct ifnet* if_alloc(u_char);
void if_attach(struct ifnet *);
void if_dead(struct ifnet *);
int if_delmulti(struct ifnet *, struct sockaddr *);
void if_delmulti_ifma(struct ifmultiaddr *);
void if_detach(struct ifnet *);
void if_vmove(struct ifnet *, struct vnet *);
void if_purgeaddrs(struct ifnet *);
void if_delallmulti(struct ifnet *);
void if_down(struct ifnet *);
struct ifmultiaddr *
if_findmulti(struct ifnet *, struct sockaddr *);
void if_free(struct ifnet *);
void if_free_type(struct ifnet *, u_char);
void if_initname(struct ifnet *, const char *, int);
void if_link_state_change(struct ifnet *, int);
int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
void if_qflush(struct ifnet *);
void if_ref(struct ifnet *);
void if_rele(struct ifnet *);
int if_setlladdr(struct ifnet *, const u_char *, int);
void if_up(struct ifnet *);
int ifioctl(struct socket *, u_long, caddr_t, struct thread *);
int ifpromisc(struct ifnet *, int);
struct ifnet *ifunit(const char *);
struct ifnet *ifunit_ref(const char *);
void ifq_init(struct ifaltq *, struct ifnet *ifp);
void ifq_delete(struct ifaltq *);
int ifa_add_loopback_route(struct ifaddr *, struct sockaddr *);
int ifa_del_loopback_route(struct ifaddr *, struct sockaddr *);
struct ifaddr *ifa_ifwithaddr(struct sockaddr *);
int ifa_ifwithaddr_check(struct sockaddr *);
struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *);
struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *);
-struct ifaddr *ifa_ifwithnet(struct sockaddr *);
+struct ifaddr *ifa_ifwithnet(struct sockaddr *, int);
struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *);
struct ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int);
struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *);
int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp);
typedef void if_com_free_t(void *com, u_char type);
void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
void if_deregister_com_alloc(u_char type);
#define IF_LLADDR(ifp) \
LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
#ifdef DEVICE_POLLING
enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
typedef int poll_handler_t(struct ifnet *ifp, enum poll_cmd cmd, int count);
int ether_poll_register(poll_handler_t *h, struct ifnet *ifp);
int ether_poll_deregister(struct ifnet *ifp);
#endif /* DEVICE_POLLING */
#endif /* _KERNEL */
#endif /* !_NET_IF_VAR_H_ */
Index: stable/8/sys/net/route.c
===================================================================
--- stable/8/sys/net/route.c (revision 209276)
+++ stable/8/sys/net/route.c (revision 209277)
@@ -1,1599 +1,1599 @@
/*-
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3.1.1 (Berkeley) 2/23/95
* $FreeBSD$
*/
/************************************************************************
* Note: In this file a 'fib' is a "forwarding information base" *
* Which is the new name for an in kernel routing (next hop) table. *
***********************************************************************/
#include "opt_inet.h"
#include "opt_route.h"
#include "opt_mrouting.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <net/flowtable.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <vm/uma.h>
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
* Allow the boot code to allow LESS than RT_MAXFIBS to be used.
* We can't do more because storage is statically allocated for now.
* (for compatibility reasons.. this will change).
*/
TUNABLE_INT("net.fibs", &rt_numfibs);
/*
* By default add routes to all fibs for new interfaces.
* Once this is set to 0 then only allocate routes on interface
* changes for the FIB of the caller when adding a new set of addresses
* to an interface. XXX this is a shotgun aproach to a problem that needs
* a more fine grained solution.. that will come.
*/
u_int rt_add_addr_allfibs = 1;
SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
&rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
VNET_DEFINE(struct rtstat, rtstat);
#define V_rtstat VNET(rtstat)
VNET_DEFINE(struct radix_node_head *, rt_tables);
#define V_rt_tables VNET(rt_tables)
VNET_DEFINE(int, rttrash); /* routes not in table but not freed */
#define V_rttrash VNET(rttrash)
/* compare two sockaddr structures */
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
/*
* Convert a 'struct radix_node *' to a 'struct rtentry *'.
* The operation can be done safely (in this code) because a
* 'struct rtentry' starts with two 'struct radix_node''s, the first
* one representing leaf nodes in the routing tree, which is
* what the code in radix.c passes us as a 'struct radix_node'.
*
* But because there are a lot of assumptions in this conversion,
* do not cast explicitly, but always use the macro below.
*/
#define RNTORT(p) ((struct rtentry *)(p))
static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */
#define V_rtzone VNET(rtzone)
#if 0
/* default fib for tunnels to use */
u_int tunnel_fib = 0;
SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, "");
#endif
/*
* handler for net.my_fibnum
*/
static int
sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
{
int fibnum;
int error;
fibnum = curthread->td_proc->p_fibnum;
error = sysctl_handle_int(oidp, &fibnum, 0, req);
return (error);
}
SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
static __inline struct radix_node_head **
rt_tables_get_rnh_ptr(int table, int fam)
{
struct radix_node_head **rnh;
KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
__func__));
KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
__func__));
/* rnh is [fib=0][af=0]. */
rnh = (struct radix_node_head **)V_rt_tables;
/* Get the offset to the requested table and fam. */
rnh += table * (AF_MAX+1) + fam;
return (rnh);
}
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
return (*rt_tables_get_rnh_ptr(table, fam));
}
/*
* route initialization must occur before ip6_init2(), which happenas at
* SI_ORDER_MIDDLE.
*/
static void
route_init(void)
{
struct domain *dom;
int max_keylen = 0;
/* whack the tunable ints into line. */
if (rt_numfibs > RT_MAXFIBS)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
for (dom = domains; dom; dom = dom->dom_next)
if (dom->dom_maxrtkey > max_keylen)
max_keylen = dom->dom_maxrtkey;
rn_init(max_keylen); /* init all zeroes, all ones, mask table */
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
static void
vnet_route_init(const void *unused __unused)
{
struct domain *dom;
struct radix_node_head **rnh;
int table;
int fam;
V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtattach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* for now only AF_INET has > 1 table */
/* XXX MRT
* rtattach will be also called
* from vfs_export.c but the
* offset will be 0
* (only for AF_INET and AF_INET6
* which don't need it anyhow)
*/
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtattach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
vnet_route_init, 0);
#ifdef VIMAGE
static void
vnet_route_uninit(const void *unused __unused)
{
int table;
int fam;
struct domain *dom;
struct radix_node_head **rnh;
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtdetach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* For now only AF_INET has > 1 tbl. */
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtdetach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
vnet_route_uninit, 0);
#endif
#ifndef _SYS_SYSPROTO_H_
struct setfib_args {
int fibnum;
};
#endif
int
setfib(struct thread *td, struct setfib_args *uap)
{
if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
return EINVAL;
td->td_proc->p_fibnum = uap->fibnum;
return (0);
}
/*
* Packet routing routines.
*/
void
rtalloc(struct route *ro)
{
rtalloc_ign_fib(ro, 0UL, 0);
}
void
rtalloc_fib(struct route *ro, u_int fibnum)
{
rtalloc_ign_fib(ro, 0UL, fibnum);
}
void
rtalloc_ign(struct route *ro, u_long ignore)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
void
rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
/*
* Look up the route that matches the address given
* Or, at least try.. Create a cloned route if needed.
*
* The returned route, if any, is locked.
*/
struct rtentry *
rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
{
return (rtalloc1_fib(dst, report, ignflags, 0));
}
struct rtentry *
rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
u_int fibnum)
{
struct radix_node_head *rnh;
struct rtentry *rt;
struct radix_node *rn;
struct rtentry *newrt;
struct rt_addrinfo info;
int err = 0, msgtype = RTM_MISS;
int needlock;
KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
newrt = NULL;
/*
* Look up the address in the table for that Address Family
*/
if (rnh == NULL) {
V_rtstat.rts_unreach++;
goto miss;
}
needlock = !(ignflags & RTF_RNH_LOCKED);
if (needlock)
RADIX_NODE_HEAD_RLOCK(rnh);
#ifdef INVARIANTS
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#endif
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
newrt = rt = RNTORT(rn);
RT_LOCK(newrt);
RT_ADDREF(newrt);
if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
goto done;
} else if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
/*
* Either we hit the root or couldn't find any match,
* Which basically means
* "caint get there frm here"
*/
V_rtstat.rts_unreach++;
miss:
if (report) {
/*
* If required, report the failure to the supervising
* Authorities.
* For a delete, this is not an error. (report == 0)
*/
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
rt_missmsg(msgtype, &info, 0, err);
}
done:
if (newrt)
RT_LOCK_ASSERT(newrt);
return (newrt);
}
/*
* Remove a reference count from an rtentry.
* If the count gets low enough, take it out of the routing table
*/
void
rtfree(struct rtentry *rt)
{
struct radix_node_head *rnh;
KASSERT(rt != NULL,("%s: NULL rt", __func__));
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
RT_LOCK_ASSERT(rt);
/*
* The callers should use RTFREE_LOCKED() or RTFREE(), so
* we should come here exactly with the last reference.
*/
RT_REMREF(rt);
if (rt->rt_refcnt > 0) {
log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
goto done;
}
/*
* On last reference give the "close method" a chance
* to cleanup private state. This also permits (for
* IPv4 and IPv6) a chance to decide if the routing table
* entry should be purged immediately or at a later time.
* When an immediate purge is to happen the close routine
* typically calls rtexpunge which clears the RTF_UP flag
* on the entry so that the code below reclaims the storage.
*/
if (rt->rt_refcnt == 0 && rnh->rnh_close)
rnh->rnh_close((struct radix_node *)rt, rnh);
/*
* If we are no longer "up" (and ref == 0)
* then we can free the resources associated
* with the route.
*/
if ((rt->rt_flags & RTF_UP) == 0) {
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
/*
* the rtentry must have been removed from the routing table
* so it is represented in rttrash.. remove that now.
*/
V_rttrash--;
#ifdef DIAGNOSTIC
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
goto done;
}
#endif
/*
* release references on items we hold them on..
* e.g other routes and ifaddrs.
*/
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
/*
* The key is separatly alloc'd so free it (see rt_setgate()).
* This also frees the gateway, as they are always malloc'd
* together.
*/
Free(rt_key(rt));
/*
* and the rtentry itself of course
*/
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
return;
}
done:
RT_UNLOCK(rt);
}
/*
* Force a routing table entry to the specified
* destination to go through the given gateway.
* Normally called as a result of a routing redirect
* message from the network layer.
*/
void
rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src)
{
rtredirect_fib(dst, gateway, netmask, flags, src, 0);
}
void
rtredirect_fib(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src,
u_int fibnum)
{
struct rtentry *rt, *rt0 = NULL;
int error = 0;
short *stat = NULL;
struct rt_addrinfo info;
struct ifaddr *ifa;
struct radix_node_head *rnh;
ifa = NULL;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL) {
error = EAFNOSUPPORT;
goto out;
}
/* verify the gateway is directly reachable */
- if ((ifa = ifa_ifwithnet(gateway)) == NULL) {
+ if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
error = ENETUNREACH;
goto out;
}
rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */
/*
* If the redirect isn't from our current router for this dst,
* it's either old or wrong. If it redirects us to ourselves,
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
if (!(flags & RTF_DONE) && rt &&
(!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
error = EINVAL;
else if (ifa_ifwithaddr_check(gateway))
error = EHOSTUNREACH;
if (error)
goto done;
/*
* Create a new entry if we just got back a wildcard entry
* or the the lookup failed. This is necessary for hosts
* which use routing redirects generated by smart gateways
* to dynamically build the routing tables.
*/
if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
goto create;
/*
* Don't listen to the redirect if it's
* for a route to an interface.
*/
if (rt->rt_flags & RTF_GATEWAY) {
if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
/*
* Changing from route to net => route to host.
* Create new route, rather than smashing route to net.
*/
create:
rt0 = rt;
rt = NULL;
flags |= RTF_GATEWAY | RTF_DYNAMIC;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_ifa = ifa;
info.rti_flags = flags;
if (rt0 != NULL)
RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */
error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
if (rt != NULL) {
RT_LOCK(rt);
if (rt0 != NULL)
EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
flags = rt->rt_flags;
}
if (rt0 != NULL)
RTFREE(rt0);
stat = &V_rtstat.rts_dynamic;
} else {
struct rtentry *gwrt;
/*
* Smash the current notion of the gateway to
* this destination. Should check about netmask!!!
*/
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
stat = &V_rtstat.rts_newgateway;
/*
* add the key and gateway (in one malloc'd chunk).
*/
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
rt_setgate(rt, rt_key(rt), gateway);
gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
RADIX_NODE_HEAD_UNLOCK(rnh);
EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
RTFREE_LOCKED(gwrt);
}
} else
error = EHOSTUNREACH;
done:
if (rt)
RTFREE_LOCKED(rt);
out:
if (error)
V_rtstat.rts_badredirect++;
else if (stat != NULL)
(*stat)++;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_info[RTAX_AUTHOR] = src;
rt_missmsg(RTM_REDIRECT, &info, flags, error);
if (ifa != NULL)
ifa_free(ifa);
}
int
rtioctl(u_long req, caddr_t data)
{
return (rtioctl_fib(req, data, 0));
}
/*
* Routing table ioctl interface.
*/
int
rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
{
/*
* If more ioctl commands are added here, make sure the proper
* super-user checks are being performed because it is possible for
* prison-root to make it this far if raw sockets have been enabled
* in jails.
*/
#ifdef INET
/* Multicast goop, grrr... */
return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
#else /* INET */
return ENXIO;
#endif /* INET */
}
/*
* For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
*/
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
{
return (ifa_ifwithroute_fib(flags, dst, gateway, 0));
}
struct ifaddr *
ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
u_int fibnum)
{
register struct ifaddr *ifa;
int not_found = 0;
if ((flags & RTF_GATEWAY) == 0) {
/*
* If we are adding a route to an interface,
* and the interface is a pt to pt link
* we should search for the destination
* as our clue to the interface. Otherwise
* we can use the local address.
*/
ifa = NULL;
if (flags & RTF_HOST)
ifa = ifa_ifwithdstaddr(dst);
if (ifa == NULL)
ifa = ifa_ifwithaddr(gateway);
} else {
/*
* If we are adding a route to a remote net
* or host, the gateway may still be on the
* other end of a pt to pt link.
*/
ifa = ifa_ifwithdstaddr(gateway);
}
if (ifa == NULL)
- ifa = ifa_ifwithnet(gateway);
+ ifa = ifa_ifwithnet(gateway, 0);
if (ifa == NULL) {
struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
if (rt == NULL)
return (NULL);
/*
* dismiss a gateway that is reachable only
* through the default router
*/
switch (gateway->sa_family) {
case AF_INET:
if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
not_found = 1;
break;
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
not_found = 1;
break;
default:
break;
}
if (!not_found && rt->rt_ifa != NULL) {
ifa = rt->rt_ifa;
ifa_ref(ifa);
}
RT_REMREF(rt);
RT_UNLOCK(rt);
if (not_found || ifa == NULL)
return (NULL);
}
if (ifa->ifa_addr->sa_family != dst->sa_family) {
struct ifaddr *oifa = ifa;
ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
if (ifa == NULL)
ifa = oifa;
else
ifa_free(oifa);
}
return (ifa);
}
/*
* Do appropriate manipulations of a routing tree given
* all the bits of info needed
*/
int
rtrequest(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt)
{
return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0));
}
int
rtrequest_fib(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt,
u_int fibnum)
{
struct rt_addrinfo info;
if (dst->sa_len == 0)
return(EINVAL);
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
return rtrequest1_fib(req, &info, ret_nrt, fibnum);
}
/*
* These (questionable) definitions of apparent local variables apply
* to the next two functions. XXXXXX!!!
*/
#define dst info->rti_info[RTAX_DST]
#define gateway info->rti_info[RTAX_GATEWAY]
#define netmask info->rti_info[RTAX_NETMASK]
#define ifaaddr info->rti_info[RTAX_IFA]
#define ifpaddr info->rti_info[RTAX_IFP]
#define flags info->rti_flags
int
rt_getifa(struct rt_addrinfo *info)
{
return (rt_getifa_fib(info, 0));
}
/*
* Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined,
* it will be referenced so the caller must free it.
*/
int
rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
{
struct ifaddr *ifa;
int error = 0;
/*
* ifp may be specified by sockaddr_dl
* when protocol address is ambiguous.
*/
if (info->rti_ifp == NULL && ifpaddr != NULL &&
ifpaddr->sa_family == AF_LINK &&
- (ifa = ifa_ifwithnet(ifpaddr)) != NULL) {
+ (ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
info->rti_ifp = ifa->ifa_ifp;
ifa_free(ifa);
}
if (info->rti_ifa == NULL && ifaaddr != NULL)
info->rti_ifa = ifa_ifwithaddr(ifaaddr);
if (info->rti_ifa == NULL) {
struct sockaddr *sa;
sa = ifaaddr != NULL ? ifaaddr :
(gateway != NULL ? gateway : dst);
if (sa != NULL && info->rti_ifp != NULL)
info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
else if (dst != NULL && gateway != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
fibnum);
else if (sa != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
fibnum);
}
if ((ifa = info->rti_ifa) != NULL) {
if (info->rti_ifp == NULL)
info->rti_ifp = ifa->ifa_ifp;
} else
error = ENETUNREACH;
return (error);
}
/*
* Expunges references to a route that's about to be reclaimed.
* The route must be locked.
*/
int
rtexpunge(struct rtentry *rt)
{
#if !defined(RADIX_MPATH)
struct radix_node *rn;
#else
struct rt_addrinfo info;
int fib;
struct rtentry *rt0;
#endif
struct radix_node_head *rnh;
struct ifaddr *ifa;
int error = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
RT_LOCK_ASSERT(rt);
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#ifdef RADIX_MPATH
fib = rt->rt_fibnum;
bzero(&info, sizeof(info));
info.rti_ifp = rt->rt_ifp;
info.rti_flags = RTF_RNH_LOCKED;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
RT_UNLOCK(rt);
error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
if (error == 0 && rt0 != NULL) {
rt = rt0;
RT_LOCK(rt);
} else if (error != 0) {
RT_LOCK(rt);
return (error);
}
#else
/*
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
*/
rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
if (rn == NULL) {
error = ESRCH;
goto bad;
}
KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
#endif /* RADIX_MPATH */
rt->rt_flags &= ~RTF_UP;
/*
* Give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
struct rt_addrinfo info;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
}
/*
* one more rtentry floating around that is not
* linked to the routing table.
*/
V_rttrash++;
#if !defined(RADIX_MPATH)
bad:
#endif
return (error);
}
#ifdef RADIX_MPATH
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
/*
* if we got multipath routes, we require users to specify
* a matching RTAX_GATEWAY.
*/
struct rtentry *rt, *rto = NULL;
register struct radix_node *rn;
int error = 0;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
return (ESRCH);
rto = rt = RNTORT(rn);
rt = rt_mpath_matchgate(rt, gateway);
if (rt == NULL)
return (ESRCH);
/*
* this is the first entry in the chain
*/
if (rto == rt) {
rn = rn_mpath_next((struct radix_node *)rt);
/*
* there is another entry, now it's active
*/
if (rn) {
rto = RNTORT(rn);
RT_LOCK(rto);
rto->rt_flags |= RTF_UP;
RT_UNLOCK(rto);
} else if (rt->rt_flags & RTF_GATEWAY) {
/*
* For gateway routes, we need to
* make sure that we we are deleting
* the correct gateway.
* rt_mpath_matchgate() does not
* check the case when there is only
* one route in the chain.
*/
if (gateway &&
(rt->rt_gateway->sa_len != gateway->sa_len ||
memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
error = ESRCH;
else {
/*
* remove from tree before returning it
* to the caller
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
goto gwdelete;
}
}
/*
* use the normal delete code to remove
* the first entry
*/
if (req != RTM_DELETE)
goto nondelete;
error = ENOENT;
goto done;
}
/*
* if the entry is 2nd and on up
*/
if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
panic ("rtrequest1: rt_mpath_deldup");
gwdelete:
RT_LOCK(rt);
RT_ADDREF(rt);
if (req == RTM_DELETE) {
rt->rt_flags &= ~RTF_UP;
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
}
nondelete:
if (req != RTM_DELETE)
panic("unrecognized request %d", req);
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
done:
return (error);
}
#endif
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
int error = 0, needlock = 0;
register struct rtentry *rt;
#ifdef FLOWTABLE
register struct rtentry *rt0;
#endif
register struct radix_node *rn;
register struct radix_node_head *rnh;
struct ifaddr *ifa;
struct sockaddr *ndst;
#define senderr(x) { error = x ; goto bad; }
KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
return (EAFNOSUPPORT);
needlock = ((flags & RTF_RNH_LOCKED) == 0);
flags &= ~RTF_RNH_LOCKED;
if (needlock)
RADIX_NODE_HEAD_LOCK(rnh);
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* If we are adding a host route then we don't want to put
* a netmask in the tree, nor do we want to clone it.
*/
if (flags & RTF_HOST)
netmask = NULL;
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
* "bad" holds true for the success case
* as well
*/
if (error != ENOENT)
goto bad;
error = 0;
}
#endif
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
if (rn == NULL)
senderr(ESRCH);
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtrequest delete");
rt = RNTORT(rn);
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
/*
* give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
break;
case RTM_RESOLVE:
/*
* resolve was only used for route cloning
* here for compat
*/
break;
case RTM_ADD:
if ((flags & RTF_GATEWAY) && !gateway)
senderr(EINVAL);
if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
(gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
senderr(EINVAL);
if (info->rti_ifa == NULL) {
error = rt_getifa_fib(info, fibnum);
if (error)
senderr(error);
} else
ifa_ref(info->rti_ifa);
ifa = info->rti_ifa;
rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
if (rt == NULL) {
if (ifa != NULL)
ifa_free(ifa);
senderr(ENOBUFS);
}
RT_LOCK_INIT(rt);
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = fibnum;
/*
* Add the gateway. Possibly re-malloc-ing the storage for it
*
*/
RT_LOCK(rt);
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
RT_LOCK_DESTROY(rt);
if (ifa != NULL)
ifa_free(ifa);
uma_zfree(V_rtzone, rt);
senderr(error);
}
/*
* point to the (possibly newly malloc'd) dest address.
*/
ndst = (struct sockaddr *)rt_key(rt);
/*
* make sure it contains the value we want (masked if needed).
*/
if (netmask) {
rt_maskedcopy(dst, ndst, netmask);
} else
bcopy(dst, ndst, dst->sa_len);
/*
* We use the ifa reference returned by rt_getifa_fib().
* This moved from below so that rnh->rnh_addaddr() can
* examine the ifa and ifa->ifa_ifp if it so desires.
*/
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
if (rn_mpath_capable(rnh) &&
rt_mpath_conflict(rnh, rt, netmask)) {
if (rt->rt_ifa) {
ifa_free(rt->rt_ifa);
}
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
senderr(EEXIST);
}
#endif
#ifdef FLOWTABLE
rt0 = NULL;
/* XXX
* "flow-table" only support IPv4 at the moment.
*/
#ifdef INET
if (dst->sa_family == AF_INET) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
struct sockaddr *mask;
u_char *m, *n;
int len;
/*
* compare mask to see if the new route is
* more specific than the existing one
*/
rt0 = RNTORT(rn);
RT_LOCK(rt0);
RT_ADDREF(rt0);
RT_UNLOCK(rt0);
/*
* A host route is already present, so
* leave the flow-table entries as is.
*/
if (rt0->rt_flags & RTF_HOST) {
RTFREE(rt0);
rt0 = NULL;
} else if (!(flags & RTF_HOST) && netmask) {
mask = rt_mask(rt0);
len = mask->sa_len;
m = (u_char *)mask;
n = (u_char *)netmask;
while (len-- > 0) {
if (*n != *m)
break;
n++;
m++;
}
if (len == 0 || (*n < *m)) {
RTFREE(rt0);
rt0 = NULL;
}
}
}
}
#endif
#endif
/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
/*
* If it still failed to go into the tree,
* then un-make it (this should be a function)
*/
if (rn == NULL) {
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
#ifdef FLOWTABLE
if (rt0 != NULL)
RTFREE(rt0);
#endif
senderr(EEXIST);
}
#ifdef FLOWTABLE
else if (rt0 != NULL) {
#ifdef INET
flowtable_route_flush(V_ip_ft, rt0);
#endif
RTFREE(rt0);
}
#endif
/*
* If this protocol has something to add to this then
* allow it to do that as well.
*/
if (ifa->ifa_rtrequest)
ifa->ifa_rtrequest(req, rt, info);
/*
* actually return a resultant rtentry and
* give the caller a single reference.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_ADDREF(rt);
}
RT_UNLOCK(rt);
break;
default:
error = EOPNOTSUPP;
}
bad:
if (needlock)
RADIX_NODE_HEAD_UNLOCK(rnh);
return (error);
#undef senderr
}
#undef dst
#undef gateway
#undef netmask
#undef ifaaddr
#undef ifpaddr
#undef flags
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
{
/* XXX dst may be overwritten, can we move this to below */
int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
#ifdef INVARIANTS
struct radix_node_head *rnh;
rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
#endif
RT_LOCK_ASSERT(rt);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* Prepare to store the gateway in rt->rt_gateway.
* Both dst and gateway are stored one after the other in the same
* malloc'd chunk. If we have room, we can reuse the old buffer,
* rt_gateway already points to the right place.
* Otherwise, malloc a new block and update the 'dst' address.
*/
if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
caddr_t new;
R_Malloc(new, caddr_t, dlen + glen);
if (new == NULL)
return ENOBUFS;
/*
* XXX note, we copy from *dst and not *rt_key(rt) because
* rt_setgate() can be called to initialize a newly
* allocated route entry, in which case rt_key(rt) == NULL
* (and also rt->rt_gateway == NULL).
* Free()/free() handle a NULL argument just fine.
*/
bcopy(dst, new, dlen);
Free(rt_key(rt)); /* free old block, if any */
rt_key(rt) = (struct sockaddr *)new;
rt->rt_gateway = (struct sockaddr *)(new + dlen);
}
/*
* Copy the new gateway value into the memory chunk.
*/
bcopy(gate, rt->rt_gateway, glen);
return (0);
}
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
register u_char *cp1 = (u_char *)src;
register u_char *cp2 = (u_char *)dst;
register u_char *cp3 = (u_char *)netmask;
u_char *cplim = cp2 + *cp3;
u_char *cplim2 = cp2 + *cp1;
*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
cp3 += 2;
if (cplim > cplim2)
cplim = cplim2;
while (cp2 < cplim)
*cp2++ = *cp1++ & *cp3++;
if (cp2 < cplim2)
bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
static inline int
rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
{
struct sockaddr *dst;
struct sockaddr *netmask;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
int error = 0;
int startfib, endfib;
char tempbuf[_SOCKADDR_TMPSIZE];
int didwork = 0;
int a_failure = 0;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
netmask = NULL;
} else {
dst = ifa->ifa_addr;
netmask = ifa->ifa_netmask;
}
if ( dst->sa_family != AF_INET)
fibnum = 0;
if (fibnum == -1) {
if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
startfib = endfib = curthread->td_proc->p_fibnum;
} else {
startfib = 0;
endfib = rt_numfibs - 1;
}
} else {
KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
startfib = fibnum;
endfib = fibnum;
}
if (dst->sa_len == 0)
return(EINVAL);
/*
* If it's a delete, check that if it exists,
* it's on the correct interface or we might scrub
* a route to another ifa which would
* be confusing at best and possibly worse.
*/
if (cmd == RTM_DELETE) {
/*
* It's a delete, so it should already exist..
* If it's a net, mask off the host bits
* (Assuming we have a mask)
* XXX this is kinda inet specific..
*/
if (netmask != NULL) {
rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
dst = (struct sockaddr *)tempbuf;
}
}
/*
* Now go through all the requested tables (fibs) and do the
* requested action. Realistically, this will either be fib 0
* for protocols that don't do multiple tables or all the
* tables for those that do. XXX For this version only AF_INET.
* When that changes code should be refactored to protocol
* independent parts and protocol dependent parts.
*/
for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
if (cmd == RTM_DELETE) {
struct radix_node_head *rnh;
struct radix_node *rn;
/*
* Look up an rtentry that is in the routing tree and
* contains the correct info.
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
/* this table doesn't exist but others might */
continue;
RADIX_NODE_HEAD_LOCK(rnh);
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
error = ESRCH;
else {
rt = RNTORT(rn);
/*
* for interface route the
* rt->rt_gateway is sockaddr_intf
* for cloning ARP entries, so
* rt_mpath_matchgate must use the
* interface address
*/
rt = rt_mpath_matchgate(rt,
ifa->ifa_addr);
if (!rt)
error = ESRCH;
}
}
else
#endif
rn = rnh->rnh_lookup(dst, netmask, rnh);
error = (rn == NULL ||
(rn->rn_flags & RNF_ROOT) ||
RNTORT(rn)->rt_ifa != ifa ||
!sa_equal((struct sockaddr *)rn->rn_key, dst));
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error) {
/* this is only an error if bad on ALL tables */
continue;
}
}
/*
* Do the actual request
*/
bzero((caddr_t)&info, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags | ifa->ifa_flags;
info.rti_info[RTAX_DST] = dst;
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD)
info.rti_info[RTAX_GATEWAY] =
(struct sockaddr *)&null_sdl;
else
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = netmask;
error = rtrequest1_fib(cmd, &info, &rt, fibnum);
if (error == 0 && rt != NULL) {
/*
* notify any listening routing agents of the change
*/
RT_LOCK(rt);
#ifdef RADIX_MPATH
/*
* in case address alias finds the first address
* e.g. ifconfig bge0 192.103.54.246/24
* e.g. ifconfig bge0 192.103.54.247/24
* the address set in the route is 192.103.54.246
* so we need to replace it with 192.103.54.247
*/
if (memcmp(rt->rt_ifa->ifa_addr,
ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
ifa_free(rt->rt_ifa);
ifa_ref(ifa);
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_ifa = ifa;
}
#endif
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD) {
((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
rt->rt_ifp->if_type;
((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
rt->rt_ifp->if_index;
}
RT_ADDREF(rt);
RT_UNLOCK(rt);
rt_newaddrmsg(cmd, ifa, error, rt);
RT_LOCK(rt);
RT_REMREF(rt);
if (cmd == RTM_DELETE) {
/*
* If we are deleting, and we found an entry,
* then it's been removed from the tree..
* now throw it away.
*/
RTFREE_LOCKED(rt);
} else {
if (cmd == RTM_ADD) {
/*
* We just wanted to add it..
* we don't actually need a reference.
*/
RT_REMREF(rt);
}
RT_UNLOCK(rt);
}
didwork = 1;
}
if (error)
a_failure = error;
}
if (cmd == RTM_DELETE) {
if (didwork) {
error = 0;
} else {
/* we only give an error if it wasn't in any table */
error = ((flags & RTF_HOST) ?
EHOSTUNREACH : ENETUNREACH);
}
} else {
if (a_failure) {
/* return an error if any of them failed */
error = a_failure;
}
}
return (error);
}
/* special one for inet internal use. may not use. */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
return (rtinit1(ifa, cmd, flags, -1));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
int
rtinit(struct ifaddr *ifa, int cmd, int flags)
{
struct sockaddr *dst;
int fib = 0;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
} else {
dst = ifa->ifa_addr;
}
if (dst->sa_family == AF_INET)
fib = -1;
return (rtinit1(ifa, cmd, flags, fib));
}
Index: stable/8/sys/net/rtsock.c
===================================================================
--- stable/8/sys/net/rtsock.c (revision 209276)
+++ stable/8/sys/net/rtsock.c (revision 209277)
@@ -1,1683 +1,1694 @@
/*-
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_sctp.h"
#include "opt_mpath.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/domain.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_llatbl.h>
+#include <net/if_types.h>
#include <net/netisr.h>
#include <net/raw_cb.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#ifdef INET6
#include <netinet6/scope6_var.h>
#endif
#if defined(INET) || defined(INET6)
#ifdef SCTP
extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
#endif /* SCTP */
#endif
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>
struct if_data32 {
uint8_t ifi_type;
uint8_t ifi_physical;
uint8_t ifi_addrlen;
uint8_t ifi_hdrlen;
uint8_t ifi_link_state;
uint8_t ifi_spare_char1;
uint8_t ifi_spare_char2;
uint8_t ifi_datalen;
uint32_t ifi_mtu;
uint32_t ifi_metric;
uint32_t ifi_baudrate;
uint32_t ifi_ipackets;
uint32_t ifi_ierrors;
uint32_t ifi_opackets;
uint32_t ifi_oerrors;
uint32_t ifi_collisions;
uint32_t ifi_ibytes;
uint32_t ifi_obytes;
uint32_t ifi_imcasts;
uint32_t ifi_omcasts;
uint32_t ifi_iqdrops;
uint32_t ifi_noproto;
uint32_t ifi_hwassist;
int32_t ifi_epoch;
struct timeval32 ifi_lastchange;
};
struct if_msghdr32 {
uint16_t ifm_msglen;
uint8_t ifm_version;
uint8_t ifm_type;
int32_t ifm_addrs;
int32_t ifm_flags;
uint16_t ifm_index;
struct if_data32 ifm_data;
};
#endif
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
/* NB: these are not modified */
static struct sockaddr route_src = { 2, PF_ROUTE, };
static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, };
static struct {
int ip_count; /* attached w/ AF_INET */
int ip6_count; /* attached w/ AF_INET6 */
int ipx_count; /* attached w/ AF_IPX */
int any_count; /* total attached */
} route_cb;
struct mtx rtsock_mtx;
MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
#define RTSOCK_LOCK() mtx_lock(&rtsock_mtx)
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
struct walkarg {
int w_tmemsize;
int w_op, w_arg;
caddr_t w_tmem;
struct sysctl_req *w_req;
};
static void rts_input(struct mbuf *m);
static struct mbuf *rt_msg1(int type, struct rt_addrinfo *rtinfo);
static int rt_msg2(int type, struct rt_addrinfo *rtinfo,
caddr_t cp, struct walkarg *w);
static int rt_xaddrs(caddr_t cp, caddr_t cplim,
struct rt_addrinfo *rtinfo);
static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *m, struct socket *so);
static void rt_setmetrics(u_long which, const struct rt_metrics *in,
struct rt_metrics_lite *out);
static void rt_getmetrics(const struct rt_metrics_lite *in,
struct rt_metrics *out);
static void rt_dispatch(struct mbuf *, const struct sockaddr *);
static struct netisr_handler rtsock_nh = {
.nh_name = "rtsock",
.nh_handler = rts_input,
.nh_proto = NETISR_ROUTE,
.nh_policy = NETISR_POLICY_SOURCE,
};
static int
sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
{
int error, qlimit;
netisr_getqlimit(&rtsock_nh, &qlimit);
error = sysctl_handle_int(oidp, &qlimit, 0, req);
if (error || !req->newptr)
return (error);
if (qlimit < 1)
return (EINVAL);
return (netisr_setqlimit(&rtsock_nh, qlimit));
}
SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_route_netisr_maxqlen, "I",
"maximum routing socket dispatch queue length");
static void
rts_init(void)
{
int tmp;
if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
rtsock_nh.nh_qlimit = tmp;
netisr_register(&rtsock_nh);
}
SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
static void
rts_input(struct mbuf *m)
{
struct sockproto route_proto;
unsigned short *family;
struct m_tag *tag;
route_proto.sp_family = PF_ROUTE;
tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
if (tag != NULL) {
family = (unsigned short *)(tag + 1);
route_proto.sp_protocol = *family;
m_tag_delete(m, tag);
} else
route_proto.sp_protocol = 0;
raw_input(m, &route_proto, &route_src);
}
/*
* It really doesn't make any sense at all for this code to share much
* with raw_usrreq.c, since its functionality is so restricted. XXX
*/
static void
rts_abort(struct socket *so)
{
raw_usrreqs.pru_abort(so);
}
static void
rts_close(struct socket *so)
{
raw_usrreqs.pru_close(so);
}
/* pru_accept is EOPNOTSUPP */
static int
rts_attach(struct socket *so, int proto, struct thread *td)
{
struct rawcb *rp;
int s, error;
KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
/* XXX */
rp = malloc(sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
if (rp == NULL)
return ENOBUFS;
/*
* The splnet() is necessary to block protocols from sending
* error notifications (like RTM_REDIRECT or RTM_LOSING) while
* this PCB is extant but incompletely initialized.
* Probably we should try to do more of this work beforehand and
* eliminate the spl.
*/
s = splnet();
so->so_pcb = (caddr_t)rp;
so->so_fibnum = td->td_proc->p_fibnum;
error = raw_attach(so, proto);
rp = sotorawcb(so);
if (error) {
splx(s);
so->so_pcb = NULL;
free(rp, M_PCB);
return error;
}
RTSOCK_LOCK();
switch(rp->rcb_proto.sp_protocol) {
case AF_INET:
route_cb.ip_count++;
break;
case AF_INET6:
route_cb.ip6_count++;
break;
case AF_IPX:
route_cb.ipx_count++;
break;
}
route_cb.any_count++;
RTSOCK_UNLOCK();
soisconnected(so);
so->so_options |= SO_USELOOPBACK;
splx(s);
return 0;
}
static int
rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
}
static int
rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
}
/* pru_connect2 is EOPNOTSUPP */
/* pru_control is EOPNOTSUPP */
static void
rts_detach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
RTSOCK_LOCK();
switch(rp->rcb_proto.sp_protocol) {
case AF_INET:
route_cb.ip_count--;
break;
case AF_INET6:
route_cb.ip6_count--;
break;
case AF_IPX:
route_cb.ipx_count--;
break;
}
route_cb.any_count--;
RTSOCK_UNLOCK();
raw_usrreqs.pru_detach(so);
}
static int
rts_disconnect(struct socket *so)
{
return (raw_usrreqs.pru_disconnect(so));
}
/* pru_listen is EOPNOTSUPP */
static int
rts_peeraddr(struct socket *so, struct sockaddr **nam)
{
return (raw_usrreqs.pru_peeraddr(so, nam));
}
/* pru_rcvd is EOPNOTSUPP */
/* pru_rcvoob is EOPNOTSUPP */
static int
rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
}
/* pru_sense is null */
static int
rts_shutdown(struct socket *so)
{
return (raw_usrreqs.pru_shutdown(so));
}
static int
rts_sockaddr(struct socket *so, struct sockaddr **nam)
{
return (raw_usrreqs.pru_sockaddr(so, nam));
}
static struct pr_usrreqs route_usrreqs = {
.pru_abort = rts_abort,
.pru_attach = rts_attach,
.pru_bind = rts_bind,
.pru_connect = rts_connect,
.pru_detach = rts_detach,
.pru_disconnect = rts_disconnect,
.pru_peeraddr = rts_peeraddr,
.pru_send = rts_send,
.pru_shutdown = rts_shutdown,
.pru_sockaddr = rts_sockaddr,
.pru_close = rts_close,
};
#ifndef _SOCKADDR_UNION_DEFINED
#define _SOCKADDR_UNION_DEFINED
/*
* The union of all possible address formats we handle.
*/
union sockaddr_union {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
#endif /* _SOCKADDR_UNION_DEFINED */
static int
rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
struct rtentry *rt, union sockaddr_union *saun, struct ucred *cred)
{
/* First, see if the returned address is part of the jail. */
if (prison_if(cred, rt->rt_ifa->ifa_addr) == 0) {
info->rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
return (0);
}
switch (info->rti_info[RTAX_DST]->sa_family) {
#ifdef INET
case AF_INET:
{
struct in_addr ia;
struct ifaddr *ifa;
int found;
found = 0;
/*
* Try to find an address on the given outgoing interface
* that belongs to the jail.
*/
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa;
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
ia = ((struct sockaddr_in *)sa)->sin_addr;
if (prison_check_ip4(cred, &ia) == 0) {
found = 1;
break;
}
}
IF_ADDR_UNLOCK(ifp);
if (!found) {
/*
* As a last resort return the 'default' jail address.
*/
ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)->
sin_addr;
if (prison_get_ip4(cred, &ia) != 0)
return (ESRCH);
}
bzero(&saun->sin, sizeof(struct sockaddr_in));
saun->sin.sin_len = sizeof(struct sockaddr_in);
saun->sin.sin_family = AF_INET;
saun->sin.sin_addr.s_addr = ia.s_addr;
info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
break;
}
#endif
#ifdef INET6
case AF_INET6:
{
struct in6_addr ia6;
struct ifaddr *ifa;
int found;
found = 0;
/*
* Try to find an address on the given outgoing interface
* that belongs to the jail.
*/
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa;
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET6)
continue;
bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
&ia6, sizeof(struct in6_addr));
if (prison_check_ip6(cred, &ia6) == 0) {
found = 1;
break;
}
}
IF_ADDR_UNLOCK(ifp);
if (!found) {
/*
* As a last resort return the 'default' jail address.
*/
ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)->
sin6_addr;
if (prison_get_ip6(cred, &ia6) != 0)
return (ESRCH);
}
bzero(&saun->sin6, sizeof(struct sockaddr_in6));
saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
saun->sin6.sin6_family = AF_INET6;
bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
if (sa6_recoverscope(&saun->sin6) != 0)
return (ESRCH);
info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
break;
}
#endif
default:
return (ESRCH);
}
return (0);
}
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so)
{
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
struct rt_msghdr *rtm = NULL;
struct rtentry *rt = NULL;
struct radix_node_head *rnh;
struct rt_addrinfo info;
int len, error = 0;
struct ifnet *ifp = NULL;
union sockaddr_union saun;
#define senderr(e) { error = e; goto flush;}
if (m == NULL || ((m->m_len < sizeof(long)) &&
(m = m_pullup(m, sizeof(long))) == NULL))
return (ENOBUFS);
if ((m->m_flags & M_PKTHDR) == 0)
panic("route_output");
len = m->m_pkthdr.len;
if (len < sizeof(*rtm) ||
len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
info.rti_info[RTAX_DST] = NULL;
senderr(EINVAL);
}
R_Malloc(rtm, struct rt_msghdr *, len);
if (rtm == NULL) {
info.rti_info[RTAX_DST] = NULL;
senderr(ENOBUFS);
}
m_copydata(m, 0, len, (caddr_t)rtm);
if (rtm->rtm_version != RTM_VERSION) {
info.rti_info[RTAX_DST] = NULL;
senderr(EPROTONOSUPPORT);
}
rtm->rtm_pid = curproc->p_pid;
bzero(&info, sizeof(info));
info.rti_addrs = rtm->rtm_addrs;
if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) {
info.rti_info[RTAX_DST] = NULL;
senderr(EINVAL);
}
info.rti_flags = rtm->rtm_flags;
if (info.rti_info[RTAX_DST] == NULL ||
info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
(info.rti_info[RTAX_GATEWAY] != NULL &&
info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
senderr(EINVAL);
/*
* Verify that the caller has the appropriate privilege; RTM_GET
* is the only operation the non-superuser is allowed.
*/
if (rtm->rtm_type != RTM_GET) {
error = priv_check(curthread, PRIV_NET_ROUTE);
if (error)
senderr(error);
}
/*
* The given gateway address may be an interface address.
* For example, issuing a "route change" command on a route
* entry that was created from a tunnel, and the gateway
* address given is the local end point. In this case the
* RTF_GATEWAY flag must be cleared or the destination will
* not be reachable even though there is no error message.
*/
if (info.rti_info[RTAX_GATEWAY] != NULL &&
info.rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
struct route gw_ro;
bzero(&gw_ro, sizeof(gw_ro));
gw_ro.ro_dst = *info.rti_info[RTAX_GATEWAY];
rtalloc_ign_fib(&gw_ro, 0, so->so_fibnum);
/*
* A host route through the loopback interface is
* installed for each interface adddress. In pre 8.0
* releases the interface address of a PPP link type
* is not reachable locally. This behavior is fixed as
* part of the new L2/L3 redesign and rewrite work. The
* signature of this interface address route is the
* AF_LINK sa_family type of the rt_gateway, and the
* rt_ifp has the IFF_LOOPBACK flag set.
*/
if (gw_ro.ro_rt != NULL &&
gw_ro.ro_rt->rt_gateway->sa_family == AF_LINK &&
gw_ro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)
info.rti_flags &= ~RTF_GATEWAY;
if (gw_ro.ro_rt != NULL)
RTFREE(gw_ro.ro_rt);
}
switch (rtm->rtm_type) {
struct rtentry *saved_nrt;
case RTM_ADD:
if (info.rti_info[RTAX_GATEWAY] == NULL)
senderr(EINVAL);
saved_nrt = NULL;
/* support for new ARP code */
if (info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK &&
(rtm->rtm_flags & RTF_LLDATA) != 0) {
error = lla_rt_output(rtm, &info);
break;
}
error = rtrequest1_fib(RTM_ADD, &info, &saved_nrt,
so->so_fibnum);
if (error == 0 && saved_nrt) {
RT_LOCK(saved_nrt);
rt_setmetrics(rtm->rtm_inits,
&rtm->rtm_rmx, &saved_nrt->rt_rmx);
rtm->rtm_index = saved_nrt->rt_ifp->if_index;
RT_REMREF(saved_nrt);
RT_UNLOCK(saved_nrt);
}
break;
case RTM_DELETE:
saved_nrt = NULL;
/* support for new ARP code */
if (info.rti_info[RTAX_GATEWAY] &&
(info.rti_info[RTAX_GATEWAY]->sa_family == AF_LINK) &&
(rtm->rtm_flags & RTF_LLDATA) != 0) {
error = lla_rt_output(rtm, &info);
break;
}
error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt,
so->so_fibnum);
if (error == 0) {
RT_LOCK(saved_nrt);
rt = saved_nrt;
goto report;
}
break;
case RTM_GET:
case RTM_CHANGE:
case RTM_LOCK:
rnh = rt_tables_get_rnh(so->so_fibnum,
info.rti_info[RTAX_DST]->sa_family);
if (rnh == NULL)
senderr(EAFNOSUPPORT);
RADIX_NODE_HEAD_RLOCK(rnh);
rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST],
info.rti_info[RTAX_NETMASK], rnh);
if (rt == NULL) { /* XXX looks bogus */
RADIX_NODE_HEAD_RUNLOCK(rnh);
senderr(ESRCH);
}
#ifdef RADIX_MPATH
/*
* for RTM_CHANGE/LOCK, if we got multipath routes,
* we require users to specify a matching RTAX_GATEWAY.
*
* for RTM_GET, gate is optional even with multipath.
* if gate == NULL the first match is returned.
* (no need to call rt_mpath_matchgate if gate == NULL)
*/
if (rn_mpath_capable(rnh) &&
(rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
if (!rt) {
RADIX_NODE_HEAD_RUNLOCK(rnh);
senderr(ESRCH);
}
}
#endif
/*
* If performing proxied L2 entry insertion, and
* the actual PPP host entry is found, perform
* another search to retrieve the prefix route of
* the local end point of the PPP link.
*/
- if ((rtm->rtm_flags & RTF_ANNOUNCE) &&
- (rt->rt_ifp->if_flags & IFF_POINTOPOINT)) {
+ if (rtm->rtm_flags & RTF_ANNOUNCE) {
struct sockaddr laddr;
- rt_maskedcopy(rt->rt_ifa->ifa_addr,
- &laddr,
- rt->rt_ifa->ifa_netmask);
+
+ if (rt->rt_ifp != NULL &&
+ rt->rt_ifp->if_type == IFT_PROPVIRTUAL) {
+ struct ifaddr *ifa;
+
+ ifa = ifa_ifwithnet(info.rti_info[RTAX_DST], 1);
+ if (ifa != NULL)
+ rt_maskedcopy(ifa->ifa_addr,
+ &laddr,
+ ifa->ifa_netmask);
+ } else
+ rt_maskedcopy(rt->rt_ifa->ifa_addr,
+ &laddr,
+ rt->rt_ifa->ifa_netmask);
/*
* refactor rt and no lock operation necessary
*/
rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh);
if (rt == NULL) {
RADIX_NODE_HEAD_RUNLOCK(rnh);
senderr(ESRCH);
}
}
RT_LOCK(rt);
RT_ADDREF(rt);
RADIX_NODE_HEAD_RUNLOCK(rnh);
/*
* Fix for PR: 82974
*
* RTM_CHANGE/LOCK need a perfect match, rn_lookup()
* returns a perfect match in case a netmask is
* specified. For host routes only a longest prefix
* match is returned so it is necessary to compare the
* existence of the netmask. If both have a netmask
* rnh_lookup() did a perfect match and if none of them
* have a netmask both are host routes which is also a
* perfect match.
*/
if (rtm->rtm_type != RTM_GET &&
(!rt_mask(rt) != !info.rti_info[RTAX_NETMASK])) {
RT_UNLOCK(rt);
senderr(ESRCH);
}
switch(rtm->rtm_type) {
case RTM_GET:
report:
RT_LOCK_ASSERT(rt);
if ((rt->rt_flags & RTF_HOST) == 0
? jailed_without_vnet(curthread->td_ucred)
: prison_if(curthread->td_ucred,
rt_key(rt)) != 0) {
RT_UNLOCK(rt);
senderr(ESRCH);
}
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_GENMASK] = 0;
if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
ifp = rt->rt_ifp;
if (ifp) {
info.rti_info[RTAX_IFP] =
ifp->if_addr->ifa_addr;
error = rtm_get_jailed(&info, ifp, rt,
&saun, curthread->td_ucred);
if (error != 0) {
RT_UNLOCK(rt);
senderr(error);
}
if (ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] =
rt->rt_ifa->ifa_dstaddr;
rtm->rtm_index = ifp->if_index;
} else {
info.rti_info[RTAX_IFP] = NULL;
info.rti_info[RTAX_IFA] = NULL;
}
} else if ((ifp = rt->rt_ifp) != NULL) {
rtm->rtm_index = ifp->if_index;
}
len = rt_msg2(rtm->rtm_type, &info, NULL, NULL);
if (len > rtm->rtm_msglen) {
struct rt_msghdr *new_rtm;
R_Malloc(new_rtm, struct rt_msghdr *, len);
if (new_rtm == NULL) {
RT_UNLOCK(rt);
senderr(ENOBUFS);
}
bcopy(rtm, new_rtm, rtm->rtm_msglen);
Free(rtm); rtm = new_rtm;
}
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL);
rtm->rtm_flags = rt->rt_flags;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
case RTM_CHANGE:
/*
* New gateway could require new ifaddr, ifp;
* flags may also be different; ifp may be specified
* by ll sockaddr when protocol address is ambiguous
*/
if (((rt->rt_flags & RTF_GATEWAY) &&
info.rti_info[RTAX_GATEWAY] != NULL) ||
info.rti_info[RTAX_IFP] != NULL ||
(info.rti_info[RTAX_IFA] != NULL &&
!sa_equal(info.rti_info[RTAX_IFA],
rt->rt_ifa->ifa_addr))) {
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
error = rt_getifa_fib(&info, rt->rt_fibnum);
/*
* XXXRW: Really we should release this
* reference later, but this maintains
* historical behavior.
*/
if (info.rti_ifa != NULL)
ifa_free(info.rti_ifa);
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error != 0)
senderr(error);
RT_LOCK(rt);
}
if (info.rti_ifa != NULL &&
info.rti_ifa != rt->rt_ifa &&
rt->rt_ifa != NULL &&
rt->rt_ifa->ifa_rtrequest != NULL) {
rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt,
&info);
ifa_free(rt->rt_ifa);
}
if (info.rti_info[RTAX_GATEWAY] != NULL) {
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
error = rt_setgate(rt, rt_key(rt),
info.rti_info[RTAX_GATEWAY]);
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error != 0) {
RT_UNLOCK(rt);
senderr(error);
}
rt->rt_flags |= (RTF_GATEWAY & info.rti_flags);
}
if (info.rti_ifa != NULL &&
info.rti_ifa != rt->rt_ifa) {
ifa_ref(info.rti_ifa);
rt->rt_ifa = info.rti_ifa;
rt->rt_ifp = info.rti_ifp;
}
/* Allow some flags to be toggled on change. */
rt->rt_flags = (rt->rt_flags & ~RTF_FMASK) |
(rtm->rtm_flags & RTF_FMASK);
rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
&rt->rt_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, &info);
/* FALLTHROUGH */
case RTM_LOCK:
/* We don't support locks anymore */
break;
}
RT_UNLOCK(rt);
break;
default:
senderr(EOPNOTSUPP);
}
flush:
if (rtm) {
if (error)
rtm->rtm_errno = error;
else
rtm->rtm_flags |= RTF_DONE;
}
if (rt) /* XXX can this be true? */
RTFREE(rt);
{
struct rawcb *rp = NULL;
/*
* Check to see if we don't want our own messages.
*/
if ((so->so_options & SO_USELOOPBACK) == 0) {
if (route_cb.any_count <= 1) {
if (rtm)
Free(rtm);
m_freem(m);
return (error);
}
/* There is another listener, so construct message */
rp = sotorawcb(so);
}
if (rtm) {
m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
if (m->m_pkthdr.len < rtm->rtm_msglen) {
m_freem(m);
m = NULL;
} else if (m->m_pkthdr.len > rtm->rtm_msglen)
m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
Free(rtm);
}
if (m) {
if (rp) {
/*
* XXX insure we don't get a copy by
* invalidating our protocol
*/
unsigned short family = rp->rcb_proto.sp_family;
rp->rcb_proto.sp_family = 0;
rt_dispatch(m, info.rti_info[RTAX_DST]);
rp->rcb_proto.sp_family = family;
} else
rt_dispatch(m, info.rti_info[RTAX_DST]);
}
}
return (error);
#undef sa_equal
}
static void
rt_setmetrics(u_long which, const struct rt_metrics *in,
struct rt_metrics_lite *out)
{
#define metric(f, e) if (which & (f)) out->e = in->e;
/*
* Only these are stored in the routing entry since introduction
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
metric(RTV_WEIGHT, rmx_weight);
/* Userland -> kernel timebase conversion. */
if (which & RTV_EXPIRE)
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_second + time_uptime : 0;
#undef metric
}
static void
rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out)
{
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
metric(rmx_weight);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_uptime + time_second : 0;
#undef metric
}
/*
* Extract the addresses of the passed sockaddrs.
* Do a little sanity checking so as to avoid bad memory references.
* This data is derived straight from userland.
*/
static int
rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
{
struct sockaddr *sa;
int i;
for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
if ((rtinfo->rti_addrs & (1 << i)) == 0)
continue;
sa = (struct sockaddr *)cp;
/*
* It won't fit.
*/
if (cp + sa->sa_len > cplim)
return (EINVAL);
/*
* there are no more.. quit now
* If there are more bits, they are in error.
* I've seen this. route(1) can evidently generate these.
* This causes kernel to core dump.
* for compatibility, If we see this, point to a safe address.
*/
if (sa->sa_len == 0) {
rtinfo->rti_info[i] = &sa_zero;
return (0); /* should be EINVAL but for compat */
}
/* accept it */
rtinfo->rti_info[i] = sa;
cp += SA_SIZE(sa);
}
return (0);
}
static struct mbuf *
rt_msg1(int type, struct rt_addrinfo *rtinfo)
{
struct rt_msghdr *rtm;
struct mbuf *m;
int i;
struct sockaddr *sa;
int len, dlen;
switch (type) {
case RTM_DELADDR:
case RTM_NEWADDR:
len = sizeof(struct ifa_msghdr);
break;
case RTM_DELMADDR:
case RTM_NEWMADDR:
len = sizeof(struct ifma_msghdr);
break;
case RTM_IFINFO:
len = sizeof(struct if_msghdr);
break;
case RTM_IFANNOUNCE:
case RTM_IEEE80211:
len = sizeof(struct if_announcemsghdr);
break;
default:
len = sizeof(struct rt_msghdr);
}
if (len > MCLBYTES)
panic("rt_msg1");
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m && len > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return (m);
m->m_pkthdr.len = m->m_len = len;
m->m_pkthdr.rcvif = NULL;
rtm = mtod(m, struct rt_msghdr *);
bzero((caddr_t)rtm, len);
for (i = 0; i < RTAX_MAX; i++) {
if ((sa = rtinfo->rti_info[i]) == NULL)
continue;
rtinfo->rti_addrs |= (1 << i);
dlen = SA_SIZE(sa);
m_copyback(m, len, dlen, (caddr_t)sa);
len += dlen;
}
if (m->m_pkthdr.len != len) {
m_freem(m);
return (NULL);
}
rtm->rtm_msglen = len;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
return (m);
}
static int
rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w)
{
int i;
int len, dlen, second_time = 0;
caddr_t cp0;
rtinfo->rti_addrs = 0;
again:
switch (type) {
case RTM_DELADDR:
case RTM_NEWADDR:
len = sizeof(struct ifa_msghdr);
break;
case RTM_IFINFO:
#ifdef COMPAT_FREEBSD32
if (w != NULL && w->w_req->flags & SCTL_MASK32) {
len = sizeof(struct if_msghdr32);
break;
}
#endif
len = sizeof(struct if_msghdr);
break;
case RTM_NEWMADDR:
len = sizeof(struct ifma_msghdr);
break;
default:
len = sizeof(struct rt_msghdr);
}
cp0 = cp;
if (cp0)
cp += len;
for (i = 0; i < RTAX_MAX; i++) {
struct sockaddr *sa;
if ((sa = rtinfo->rti_info[i]) == NULL)
continue;
rtinfo->rti_addrs |= (1 << i);
dlen = SA_SIZE(sa);
if (cp) {
bcopy((caddr_t)sa, cp, (unsigned)dlen);
cp += dlen;
}
len += dlen;
}
len = ALIGN(len);
if (cp == NULL && w != NULL && !second_time) {
struct walkarg *rw = w;
if (rw->w_req) {
if (rw->w_tmemsize < len) {
if (rw->w_tmem)
free(rw->w_tmem, M_RTABLE);
rw->w_tmem = (caddr_t)
malloc(len, M_RTABLE, M_NOWAIT);
if (rw->w_tmem)
rw->w_tmemsize = len;
}
if (rw->w_tmem) {
cp = rw->w_tmem;
second_time = 1;
goto again;
}
}
}
if (cp) {
struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
rtm->rtm_msglen = len;
}
return (len);
}
/*
* This routine is called to generate a message from the routing
* socket indicating that a redirect has occured, a routing lookup
* has failed, or that a protocol has detected timeouts to a particular
* destination.
*/
void
rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
{
struct rt_msghdr *rtm;
struct mbuf *m;
struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
if (route_cb.any_count == 0)
return;
m = rt_msg1(type, rtinfo);
if (m == NULL)
return;
rtm = mtod(m, struct rt_msghdr *);
rtm->rtm_flags = RTF_DONE | flags;
rtm->rtm_errno = error;
rtm->rtm_addrs = rtinfo->rti_addrs;
rt_dispatch(m, sa);
}
/*
* This routine is called to generate a message from the routing
* socket indicating that the status of a network interface has changed.
*/
void
rt_ifmsg(struct ifnet *ifp)
{
struct if_msghdr *ifm;
struct mbuf *m;
struct rt_addrinfo info;
if (route_cb.any_count == 0)
return;
bzero((caddr_t)&info, sizeof(info));
m = rt_msg1(RTM_IFINFO, &info);
if (m == NULL)
return;
ifm = mtod(m, struct if_msghdr *);
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
ifm->ifm_data = ifp->if_data;
ifm->ifm_addrs = 0;
rt_dispatch(m, NULL);
}
/*
* This is called to generate messages from the routing socket
* indicating a network interface has had addresses associated with it.
* if we ever reverse the logic and replace messages TO the routing
* socket indicate a request to configure interfaces, then it will
* be unnecessary as the routing socket will automatically generate
* copies of it.
*/
void
rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
{
struct rt_addrinfo info;
struct sockaddr *sa = NULL;
int pass;
struct mbuf *m = NULL;
struct ifnet *ifp = ifa->ifa_ifp;
KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
("unexpected cmd %u", cmd));
#if defined(INET) || defined(INET6)
#ifdef SCTP
/*
* notify the SCTP stack
* this will only get called when an address is added/deleted
* XXX pass the ifaddr struct instead if ifa->ifa_addr...
*/
sctp_addr_change(ifa, cmd);
#endif /* SCTP */
#endif
if (route_cb.any_count == 0)
return;
for (pass = 1; pass < 3; pass++) {
bzero((caddr_t)&info, sizeof(info));
if ((cmd == RTM_ADD && pass == 1) ||
(cmd == RTM_DELETE && pass == 2)) {
struct ifa_msghdr *ifam;
int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
if ((m = rt_msg1(ncmd, &info)) == NULL)
continue;
ifam = mtod(m, struct ifa_msghdr *);
ifam->ifam_index = ifp->if_index;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_addrs = info.rti_addrs;
}
if ((cmd == RTM_ADD && pass == 2) ||
(cmd == RTM_DELETE && pass == 1)) {
struct rt_msghdr *rtm;
if (rt == NULL)
continue;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_DST] = sa = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
if ((m = rt_msg1(cmd, &info)) == NULL)
continue;
rtm = mtod(m, struct rt_msghdr *);
rtm->rtm_index = ifp->if_index;
rtm->rtm_flags |= rt->rt_flags;
rtm->rtm_errno = error;
rtm->rtm_addrs = info.rti_addrs;
}
rt_dispatch(m, sa);
}
}
/*
* This is the analogue to the rt_newaddrmsg which performs the same
* function but for multicast group memberhips. This is easier since
* there is no route state to worry about.
*/
void
rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
{
struct rt_addrinfo info;
struct mbuf *m = NULL;
struct ifnet *ifp = ifma->ifma_ifp;
struct ifma_msghdr *ifmam;
if (route_cb.any_count == 0)
return;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_IFA] = ifma->ifma_addr;
info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
/*
* If a link-layer address is present, present it as a ``gateway''
* (similarly to how ARP entries, e.g., are presented).
*/
info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
m = rt_msg1(cmd, &info);
if (m == NULL)
return;
ifmam = mtod(m, struct ifma_msghdr *);
KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
__func__));
ifmam->ifmam_index = ifp->if_index;
ifmam->ifmam_addrs = info.rti_addrs;
rt_dispatch(m, ifma->ifma_addr);
}
static struct mbuf *
rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
struct rt_addrinfo *info)
{
struct if_announcemsghdr *ifan;
struct mbuf *m;
if (route_cb.any_count == 0)
return NULL;
bzero((caddr_t)info, sizeof(*info));
m = rt_msg1(type, info);
if (m != NULL) {
ifan = mtod(m, struct if_announcemsghdr *);
ifan->ifan_index = ifp->if_index;
strlcpy(ifan->ifan_name, ifp->if_xname,
sizeof(ifan->ifan_name));
ifan->ifan_what = what;
}
return m;
}
/*
* This is called to generate routing socket messages indicating
* IEEE80211 wireless events.
* XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
*/
void
rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
{
struct mbuf *m;
struct rt_addrinfo info;
m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
if (m != NULL) {
/*
* Append the ieee80211 data. Try to stick it in the
* mbuf containing the ifannounce msg; otherwise allocate
* a new mbuf and append.
*
* NB: we assume m is a single mbuf.
*/
if (data_len > M_TRAILINGSPACE(m)) {
struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
if (n == NULL) {
m_freem(m);
return;
}
bcopy(data, mtod(n, void *), data_len);
n->m_len = data_len;
m->m_next = n;
} else if (data_len > 0) {
bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
m->m_len += data_len;
}
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len += data_len;
mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
rt_dispatch(m, NULL);
}
}
/*
* This is called to generate routing socket messages indicating
* network interface arrival and departure.
*/
void
rt_ifannouncemsg(struct ifnet *ifp, int what)
{
struct mbuf *m;
struct rt_addrinfo info;
m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
if (m != NULL)
rt_dispatch(m, NULL);
}
static void
rt_dispatch(struct mbuf *m, const struct sockaddr *sa)
{
struct m_tag *tag;
/*
* Preserve the family from the sockaddr, if any, in an m_tag for
* use when injecting the mbuf into the routing socket buffer from
* the netisr.
*/
if (sa != NULL) {
tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
M_NOWAIT);
if (tag == NULL) {
m_freem(m);
return;
}
*(unsigned short *)(tag + 1) = sa->sa_family;
m_tag_prepend(m, tag);
}
#ifdef VIMAGE
if (V_loif)
m->m_pkthdr.rcvif = V_loif;
else {
m_freem(m);
return;
}
#endif
netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */
}
/*
* This is used in dumping the kernel table via sysctl().
*/
static int
sysctl_dumpentry(struct radix_node *rn, void *vw)
{
struct walkarg *w = vw;
struct rtentry *rt = (struct rtentry *)rn;
int error = 0, size;
struct rt_addrinfo info;
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
if ((rt->rt_flags & RTF_HOST) == 0
? jailed_without_vnet(w->w_req->td->td_ucred)
: prison_if(w->w_req->td->td_ucred, rt_key(rt)) != 0)
return (0);
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_GENMASK] = 0;
if (rt->rt_ifp) {
info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
}
size = rt_msg2(RTM_GET, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
/*
* let's be honest about this being a retarded hack
*/
rtm->rtm_fmask = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
return (error);
}
return (error);
}
#ifdef COMPAT_FREEBSD32
static void
copy_ifdata32(struct if_data *src, struct if_data32 *dst)
{
bzero(dst, sizeof(*dst));
CP(*src, *dst, ifi_type);
CP(*src, *dst, ifi_physical);
CP(*src, *dst, ifi_addrlen);
CP(*src, *dst, ifi_hdrlen);
CP(*src, *dst, ifi_link_state);
CP(*src, *dst, ifi_datalen);
CP(*src, *dst, ifi_mtu);
CP(*src, *dst, ifi_metric);
CP(*src, *dst, ifi_baudrate);
CP(*src, *dst, ifi_ipackets);
CP(*src, *dst, ifi_ierrors);
CP(*src, *dst, ifi_opackets);
CP(*src, *dst, ifi_oerrors);
CP(*src, *dst, ifi_collisions);
CP(*src, *dst, ifi_ibytes);
CP(*src, *dst, ifi_obytes);
CP(*src, *dst, ifi_imcasts);
CP(*src, *dst, ifi_omcasts);
CP(*src, *dst, ifi_iqdrops);
CP(*src, *dst, ifi_noproto);
CP(*src, *dst, ifi_hwassist);
CP(*src, *dst, ifi_epoch);
TV_CP(*src, *dst, ifi_lastchange);
}
#endif
static int
sysctl_iflist(int af, struct walkarg *w)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct rt_addrinfo info;
int len, error = 0;
bzero((caddr_t)&info, sizeof(info));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
ifa = ifp->if_addr;
info.rti_info[RTAX_IFP] = ifa->ifa_addr;
len = rt_msg2(RTM_IFINFO, &info, NULL, w);
info.rti_info[RTAX_IFP] = NULL;
if (w->w_req && w->w_tmem) {
struct if_msghdr *ifm;
#ifdef COMPAT_FREEBSD32
if (w->w_req->flags & SCTL_MASK32) {
struct if_msghdr32 *ifm32;
ifm32 = (struct if_msghdr32 *)w->w_tmem;
ifm32->ifm_index = ifp->if_index;
ifm32->ifm_flags = ifp->if_flags |
ifp->if_drv_flags;
copy_ifdata32(&ifp->if_data, &ifm32->ifm_data);
ifm32->ifm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)ifm32,
len);
goto sysctl_out;
}
#endif
ifm = (struct if_msghdr *)w->w_tmem;
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
ifm->ifm_data = ifp->if_data;
ifm->ifm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)ifm, len);
#ifdef COMPAT_FREEBSD32
sysctl_out:
#endif
if (error)
goto done;
}
while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
if (af && af != ifa->ifa_addr->sa_family)
continue;
if (prison_if(w->w_req->td->td_ucred,
ifa->ifa_addr) != 0)
continue;
info.rti_info[RTAX_IFA] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
len = rt_msg2(RTM_NEWADDR, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct ifa_msghdr *ifam;
ifam = (struct ifa_msghdr *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
if (error)
goto done;
}
}
info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
info.rti_info[RTAX_BRD] = NULL;
}
done:
IFNET_RUNLOCK();
return (error);
}
static int
sysctl_ifmalist(int af, struct walkarg *w)
{
struct ifnet *ifp;
struct ifmultiaddr *ifma;
struct rt_addrinfo info;
int len, error = 0;
struct ifaddr *ifa;
bzero((caddr_t)&info, sizeof(info));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
ifa = ifp->if_addr;
info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (af && af != ifma->ifma_addr->sa_family)
continue;
if (prison_if(w->w_req->td->td_ucred,
ifma->ifma_addr) != 0)
continue;
info.rti_info[RTAX_IFA] = ifma->ifma_addr;
info.rti_info[RTAX_GATEWAY] =
(ifma->ifma_addr->sa_family != AF_LINK) ?
ifma->ifma_lladdr : NULL;
len = rt_msg2(RTM_NEWMADDR, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct ifma_msghdr *ifmam;
ifmam = (struct ifma_msghdr *)w->w_tmem;
ifmam->ifmam_index = ifma->ifma_ifp->if_index;
ifmam->ifmam_flags = 0;
ifmam->ifmam_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
if (error) {
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
}
IF_ADDR_UNLOCK(ifp);
}
done:
IFNET_RUNLOCK();
return (error);
}
static int
sysctl_rtsock(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1;
u_int namelen = arg2;
struct radix_node_head *rnh = NULL; /* silence compiler. */
int i, lim, error = EINVAL;
u_char af;
struct walkarg w;
name ++;
namelen--;
if (req->newptr)
return (EPERM);
if (namelen != 3)
return ((namelen < 3) ? EISDIR : ENOTDIR);
af = name[0];
if (af > AF_MAX)
return (EINVAL);
bzero(&w, sizeof(w));
w.w_op = name[1];
w.w_arg = name[2];
w.w_req = req;
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
switch (w.w_op) {
case NET_RT_DUMP:
case NET_RT_FLAGS:
if (af == 0) { /* dump all tables */
i = 1;
lim = AF_MAX;
} else /* dump only one table */
i = lim = af;
/*
* take care of llinfo entries, the caller must
* specify an AF
*/
if (w.w_op == NET_RT_FLAGS &&
(w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
if (af != 0)
error = lltable_sysctl_dumparp(af, w.w_req);
else
error = EINVAL;
break;
}
/*
* take care of routing entries
*/
for (error = 0; error == 0 && i <= lim; i++) {
rnh = rt_tables_get_rnh(req->td->td_proc->p_fibnum, i);
if (rnh != NULL) {
RADIX_NODE_HEAD_LOCK(rnh);
error = rnh->rnh_walktree(rnh,
sysctl_dumpentry, &w);
RADIX_NODE_HEAD_UNLOCK(rnh);
} else if (af != 0)
error = EAFNOSUPPORT;
}
break;
case NET_RT_IFLIST:
error = sysctl_iflist(af, &w);
break;
case NET_RT_IFMALIST:
error = sysctl_ifmalist(af, &w);
break;
}
if (w.w_tmem)
free(w.w_tmem, M_RTABLE);
return (error);
}
SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
/*
* Definitions of protocols supported in the ROUTE domain.
*/
static struct domain routedomain; /* or at least forward */
static struct protosw routesw[] = {
{
.pr_type = SOCK_RAW,
.pr_domain = &routedomain,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_output = route_output,
.pr_ctlinput = raw_ctlinput,
.pr_init = raw_init,
.pr_usrreqs = &route_usrreqs
}
};
static struct domain routedomain = {
.dom_family = PF_ROUTE,
.dom_name = "route",
.dom_protosw = routesw,
.dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])]
};
VNET_DOMAIN_SET(route);
Index: stable/8/sys/netinet/in.c
===================================================================
--- stable/8/sys/netinet/in.c (revision 209276)
+++ stable/8/sys/netinet/in.c (revision 209277)
@@ -1,1585 +1,1586 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (C) 2001 WIDE Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.4 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_carp.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/socket.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/igmp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
static int in_mask2len(struct in_addr *);
static void in_len2mask(struct in_addr *, int);
static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t,
struct ifnet *, struct thread *);
static int in_addprefix(struct in_ifaddr *, int);
static int in_scrubprefix(struct in_ifaddr *);
static void in_socktrim(struct sockaddr_in *);
static int in_ifinit(struct ifnet *,
struct in_ifaddr *, struct sockaddr_in *, int);
static void in_purgemaddrs(struct ifnet *);
static VNET_DEFINE(int, subnetsarelocal);
#define V_subnetsarelocal VNET(subnetsarelocal)
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
&VNET_NAME(subnetsarelocal), 0,
"Treat all subnets as directly connected");
static VNET_DEFINE(int, sameprefixcarponly);
#define V_sameprefixcarponly VNET(sameprefixcarponly)
SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW,
&VNET_NAME(sameprefixcarponly), 0,
"Refuse to create same prefixes on different interfaces");
VNET_DECLARE(struct inpcbinfo, ripcbinfo);
#define V_ripcbinfo VNET(ripcbinfo)
/*
* Return 1 if an internet address is for a ``local'' host
* (one to which we have a connection). If subnetsarelocal
* is true, this includes other subnets of the local net.
* Otherwise, it includes only the directly-connected (sub)nets.
*/
int
in_localaddr(struct in_addr in)
{
register u_long i = ntohl(in.s_addr);
register struct in_ifaddr *ia;
IN_IFADDR_RLOCK();
if (V_subnetsarelocal) {
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((i & ia->ia_netmask) == ia->ia_net) {
IN_IFADDR_RUNLOCK();
return (1);
}
}
} else {
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
IN_IFADDR_RUNLOCK();
return (1);
}
}
}
IN_IFADDR_RUNLOCK();
return (0);
}
/*
* Return 1 if an internet address is for the local host and configured
* on one of its interfaces.
*/
int
in_localip(struct in_addr in)
{
struct in_ifaddr *ia;
IN_IFADDR_RLOCK();
LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) {
IN_IFADDR_RUNLOCK();
return (1);
}
}
IN_IFADDR_RUNLOCK();
return (0);
}
/*
* Determine whether an IP address is in a reserved set of addresses
* that may not be forwarded, or whether datagrams to that destination
* may be forwarded.
*/
int
in_canforward(struct in_addr in)
{
register u_long i = ntohl(in.s_addr);
register u_long net;
if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i))
return (0);
if (IN_CLASSA(i)) {
net = i & IN_CLASSA_NET;
if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
return (0);
}
return (1);
}
/*
* Trim a mask in a sockaddr
*/
static void
in_socktrim(struct sockaddr_in *ap)
{
register char *cplim = (char *) &ap->sin_addr;
register char *cp = (char *) (&ap->sin_addr + 1);
ap->sin_len = 0;
while (--cp >= cplim)
if (*cp) {
(ap)->sin_len = cp - (char *) (ap) + 1;
break;
}
}
static int
in_mask2len(mask)
struct in_addr *mask;
{
int x, y;
u_char *p;
p = (u_char *)mask;
for (x = 0; x < sizeof(*mask); x++) {
if (p[x] != 0xff)
break;
}
y = 0;
if (x < sizeof(*mask)) {
for (y = 0; y < 8; y++) {
if ((p[x] & (0x80 >> y)) == 0)
break;
}
}
return (x * 8 + y);
}
static void
in_len2mask(struct in_addr *mask, int len)
{
int i;
u_char *p;
p = (u_char *)mask;
bzero(mask, sizeof(*mask));
for (i = 0; i < len / 8; i++)
p[i] = 0xff;
if (len % 8)
p[i] = (0xff00 >> (len % 8)) & 0xff;
}
/*
* Generic internet control operations (ioctl's).
*
* ifp is NULL if not an interface-specific ioctl.
*/
/* ARGSUSED */
int
in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
struct thread *td)
{
register struct ifreq *ifr = (struct ifreq *)data;
register struct in_ifaddr *ia, *iap;
register struct ifaddr *ifa;
struct in_addr allhosts_addr;
struct in_addr dst;
struct in_ifinfo *ii;
struct in_aliasreq *ifra = (struct in_aliasreq *)data;
struct sockaddr_in oldaddr;
int error, hostIsNew, iaIsNew, maskIsNew;
int iaIsFirst;
ia = NULL;
iaIsFirst = 0;
iaIsNew = 0;
allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
/*
* Filter out ioctls we implement directly; forward the rest on to
* in_lifaddr_ioctl() and ifp->if_ioctl().
*/
switch (cmd) {
case SIOCAIFADDR:
case SIOCDIFADDR:
case SIOCGIFADDR:
case SIOCGIFBRDADDR:
case SIOCGIFDSTADDR:
case SIOCGIFNETMASK:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
case SIOCSIFNETMASK:
break;
case SIOCALIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
if (ifp == NULL)
return (EINVAL);
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCDLIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_DELIFADDR);
if (error)
return (error);
}
if (ifp == NULL)
return (EINVAL);
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCGLIFADDR:
if (ifp == NULL)
return (EINVAL);
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
default:
if (ifp == NULL || ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
return ((*ifp->if_ioctl)(ifp, cmd, data));
}
if (ifp == NULL)
return (EADDRNOTAVAIL);
/*
* Security checks before we get involved in any work.
*/
switch (cmd) {
case SIOCAIFADDR:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
case SIOCSIFDSTADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
break;
case SIOCDIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_DELIFADDR);
if (error)
return (error);
}
break;
}
/*
* Find address for this interface, if it exists.
*
* If an alias address was specified, find that one instead of the
* first one on the interface, if possible.
*/
dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr;
IN_IFADDR_RLOCK();
LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) {
if (iap->ia_ifp == ifp &&
iap->ia_addr.sin_addr.s_addr == dst.s_addr) {
if (td == NULL || prison_check_ip4(td->td_ucred,
&dst) == 0)
ia = iap;
break;
}
}
if (ia != NULL)
ifa_ref(&ia->ia_ifa);
IN_IFADDR_RUNLOCK();
if (ia == NULL) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
iap = ifatoia(ifa);
if (iap->ia_addr.sin_family == AF_INET) {
if (td != NULL &&
prison_check_ip4(td->td_ucred,
&iap->ia_addr.sin_addr) != 0)
continue;
ia = iap;
break;
}
}
if (ia != NULL)
ifa_ref(&ia->ia_ifa);
IF_ADDR_UNLOCK(ifp);
}
if (ia == NULL)
iaIsFirst = 1;
error = 0;
switch (cmd) {
case SIOCAIFADDR:
case SIOCDIFADDR:
if (ifra->ifra_addr.sin_family == AF_INET) {
struct in_ifaddr *oia;
IN_IFADDR_RLOCK();
for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) {
if (ia->ia_ifp == ifp &&
ia->ia_addr.sin_addr.s_addr ==
ifra->ifra_addr.sin_addr.s_addr)
break;
}
if (ia != NULL && ia != oia)
ifa_ref(&ia->ia_ifa);
if (oia != NULL && ia != oia)
ifa_free(&oia->ia_ifa);
IN_IFADDR_RUNLOCK();
if ((ifp->if_flags & IFF_POINTOPOINT)
&& (cmd == SIOCAIFADDR)
&& (ifra->ifra_dstaddr.sin_addr.s_addr
== INADDR_ANY)) {
error = EDESTADDRREQ;
goto out;
}
}
if (cmd == SIOCDIFADDR && ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
/* FALLTHROUGH */
case SIOCSIFADDR:
case SIOCSIFNETMASK:
case SIOCSIFDSTADDR:
if (ia == NULL) {
ia = (struct in_ifaddr *)
malloc(sizeof *ia, M_IFADDR, M_NOWAIT |
M_ZERO);
if (ia == NULL) {
error = ENOBUFS;
goto out;
}
ifa = &ia->ia_ifa;
ifa_init(ifa);
ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
ia->ia_sockmask.sin_len = 8;
ia->ia_sockmask.sin_family = AF_INET;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
ia->ia_broadaddr.sin_family = AF_INET;
}
ia->ia_ifp = ifp;
ifa_ref(ifa); /* if_addrhead */
IF_ADDR_LOCK(ifp);
TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
IF_ADDR_UNLOCK(ifp);
ifa_ref(ifa); /* in_ifaddrhead */
IN_IFADDR_WLOCK();
TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
IN_IFADDR_WUNLOCK();
iaIsNew = 1;
}
break;
case SIOCSIFBRDADDR:
case SIOCGIFADDR:
case SIOCGIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCGIFBRDADDR:
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
break;
}
/*
* Most paths in this switch return directly or via out. Only paths
* that remove the address break in order to hit common removal code.
*/
switch (cmd) {
case SIOCGIFADDR:
*((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr;
goto out;
case SIOCGIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
goto out;
}
*((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr;
goto out;
case SIOCGIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
goto out;
}
*((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr;
goto out;
case SIOCGIFNETMASK:
*((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask;
goto out;
case SIOCSIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
goto out;
}
oldaddr = ia->ia_dstaddr;
ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr;
if (ifp->if_ioctl != NULL) {
error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR,
(caddr_t)ia);
if (error) {
ia->ia_dstaddr = oldaddr;
goto out;
}
}
if (ia->ia_flags & IFA_ROUTE) {
ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr;
rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
ia->ia_ifa.ifa_dstaddr =
(struct sockaddr *)&ia->ia_dstaddr;
rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
}
goto out;
case SIOCSIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EINVAL;
goto out;
}
ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr;
goto out;
case SIOCSIFADDR:
error = in_ifinit(ifp, ia,
(struct sockaddr_in *) &ifr->ifr_addr, 1);
if (error != 0 && iaIsNew)
break;
if (error == 0) {
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
if (iaIsFirst &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
error = in_joingroup(ifp, &allhosts_addr,
NULL, &ii->ii_allhosts);
}
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
}
error = 0;
goto out;
case SIOCSIFNETMASK:
ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr;
ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
goto out;
case SIOCAIFADDR:
maskIsNew = 0;
hostIsNew = 1;
error = 0;
if (ia->ia_addr.sin_family == AF_INET) {
if (ifra->ifra_addr.sin_len == 0) {
ifra->ifra_addr = ia->ia_addr;
hostIsNew = 0;
} else if (ifra->ifra_addr.sin_addr.s_addr ==
ia->ia_addr.sin_addr.s_addr)
hostIsNew = 0;
}
if (ifra->ifra_mask.sin_len) {
/*
* QL: XXX
* Need to scrub the prefix here in case
* the issued command is SIOCAIFADDR with
* the same address, but with a different
* prefix length. And if the prefix length
* is the same as before, then the call is
* un-necessarily executed here.
*/
in_ifscrub(ifp, ia);
ia->ia_sockmask = ifra->ifra_mask;
ia->ia_sockmask.sin_family = AF_INET;
ia->ia_subnetmask =
ntohl(ia->ia_sockmask.sin_addr.s_addr);
maskIsNew = 1;
}
if ((ifp->if_flags & IFF_POINTOPOINT) &&
(ifra->ifra_dstaddr.sin_family == AF_INET)) {
in_ifscrub(ifp, ia);
ia->ia_dstaddr = ifra->ifra_dstaddr;
maskIsNew = 1; /* We lie; but the effect's the same */
}
if (ifra->ifra_addr.sin_family == AF_INET &&
(hostIsNew || maskIsNew))
error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
if (error != 0 && iaIsNew)
goto out;
if ((ifp->if_flags & IFF_BROADCAST) &&
(ifra->ifra_broadaddr.sin_family == AF_INET))
ia->ia_broadaddr = ifra->ifra_broadaddr;
if (error == 0) {
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
if (iaIsFirst &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
error = in_joingroup(ifp, &allhosts_addr,
NULL, &ii->ii_allhosts);
}
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
}
goto out;
case SIOCDIFADDR:
/*
* in_ifscrub kills the interface route.
*/
in_ifscrub(ifp, ia);
/*
* in_ifadown gets rid of all the rest of
* the routes. This is not quite the right
* thing to do, but at least if we are running
* a routing process they will come back.
*/
in_ifadown(&ia->ia_ifa, 1);
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
error = 0;
break;
default:
panic("in_control: unsupported ioctl");
}
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
IF_ADDR_UNLOCK(ifp);
ifa_free(&ia->ia_ifa); /* if_addrhead */
IN_IFADDR_WLOCK();
TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
if (ia->ia_addr.sin_family == AF_INET) {
struct in_ifaddr *if_ia;
LIST_REMOVE(ia, ia_hash);
IN_IFADDR_WUNLOCK();
/*
* If this is the last IPv4 address configured on this
* interface, leave the all-hosts group.
* No state-change report need be transmitted.
*/
if_ia = NULL;
IFP_TO_IA(ifp, if_ia);
if (if_ia == NULL) {
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
IN_MULTI_LOCK();
if (ii->ii_allhosts) {
(void)in_leavegroup_locked(ii->ii_allhosts,
NULL);
ii->ii_allhosts = NULL;
}
IN_MULTI_UNLOCK();
} else
ifa_free(&if_ia->ia_ifa);
} else
IN_IFADDR_WUNLOCK();
ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
out:
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (error);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?!?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* EINVAL since we can't deduce hostid part of the address.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL on prefix match failed/specified address not found
* other values may be returned from in_ioctl()
*/
static int
in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
struct ifnet *ifp, struct thread *td)
{
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
/* sanity checks */
if (data == NULL || ifp == NULL) {
panic("invalid argument to in_lifaddr_ioctl");
/*NOTRECHED*/
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/*FALLTHROUGH*/
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
if (iflr->addr.ss_family != AF_INET)
return (EINVAL);
if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
return (EINVAL);
/* XXX need improvement */
if (iflr->dstaddr.ss_family
&& iflr->dstaddr.ss_family != AF_INET)
return (EINVAL);
if (iflr->dstaddr.ss_family
&& iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
return (EINVAL);
break;
default: /*shouldn't happen*/
return (EOPNOTSUPP);
}
if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
return (EINVAL);
switch (cmd) {
case SIOCALIFADDR:
{
struct in_aliasreq ifra;
if (iflr->flags & IFLR_PREFIX)
return (EINVAL);
/* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
if (iflr->dstaddr.ss_family) { /*XXX*/
bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
iflr->dstaddr.ss_len);
}
ifra.ifra_mask.sin_family = AF_INET;
ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td));
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in_ifaddr *ia;
struct in_addr mask, candidate, match;
struct sockaddr_in *sin;
bzero(&mask, sizeof(mask));
bzero(&match, sizeof(match));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in_len2mask(&mask, iflr->prefixlen);
sin = (struct sockaddr_in *)&iflr->addr;
match.s_addr = sin->sin_addr.s_addr;
match.s_addr &= mask.s_addr;
/* if you set extra bits, that's wrong */
if (match.s_addr != sin->sin_addr.s_addr)
return (EINVAL);
} else {
/* on getting an address, take the 1st match */
/* on deleting an address, do exact match */
if (cmd != SIOCGLIFADDR) {
in_len2mask(&mask, 32);
sin = (struct sockaddr_in *)&iflr->addr;
match.s_addr = sin->sin_addr.s_addr;
}
}
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (match.s_addr == 0)
break;
candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr;
candidate.s_addr &= mask.s_addr;
if (candidate.s_addr == match.s_addr)
break;
}
if (ifa == NULL)
return (EADDRNOTAVAIL);
ia = (struct in_ifaddr *)ifa;
if (cmd == SIOCGLIFADDR) {
/* fill in the if_laddrreq structure */
bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
ia->ia_dstaddr.sin_len);
} else
bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
iflr->prefixlen =
in_mask2len(&ia->ia_sockmask.sin_addr);
iflr->flags = 0; /*XXX*/
return (0);
} else {
struct in_aliasreq ifra;
/* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&ia->ia_addr, &ifra.ifra_addr,
ia->ia_addr.sin_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
ia->ia_dstaddr.sin_len);
}
bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr,
ia->ia_sockmask.sin_len);
return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
ifp, td));
}
}
}
return (EOPNOTSUPP); /*just for safety*/
}
/*
* Delete any existing route for an interface.
*/
void
in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
{
in_scrubprefix(ia);
}
/*
* Initialize an interface's internet address
* and routing table entry.
*/
static int
in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
int scrub)
{
register u_long i = ntohl(sin->sin_addr.s_addr);
struct sockaddr_in oldaddr;
int s = splimp(), flags = RTF_UP, error = 0;
oldaddr = ia->ia_addr;
if (oldaddr.sin_family == AF_INET)
LIST_REMOVE(ia, ia_hash);
ia->ia_addr = *sin;
if (ia->ia_addr.sin_family == AF_INET) {
IN_IFADDR_WLOCK();
LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
ia, ia_hash);
IN_IFADDR_WUNLOCK();
}
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
if (ifp->if_ioctl != NULL) {
error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
if (error) {
splx(s);
/* LIST_REMOVE(ia, ia_hash) is done in in_control */
ia->ia_addr = oldaddr;
IN_IFADDR_WLOCK();
if (ia->ia_addr.sin_family == AF_INET)
LIST_INSERT_HEAD(INADDR_HASH(
ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
else
/*
* If oldaddr family is not AF_INET (e.g.
* interface has been just created) in_control
* does not call LIST_REMOVE, and we end up
* with bogus ia entries in hash
*/
LIST_REMOVE(ia, ia_hash);
IN_IFADDR_WUNLOCK();
return (error);
}
}
splx(s);
if (scrub) {
ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
in_ifscrub(ifp, ia);
ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
}
if (IN_CLASSA(i))
ia->ia_netmask = IN_CLASSA_NET;
else if (IN_CLASSB(i))
ia->ia_netmask = IN_CLASSB_NET;
else
ia->ia_netmask = IN_CLASSC_NET;
/*
* The subnet mask usually includes at least the standard network part,
* but may may be smaller in the case of supernetting.
* If it is set, we believe it.
*/
if (ia->ia_subnetmask == 0) {
ia->ia_subnetmask = ia->ia_netmask;
ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
} else
ia->ia_netmask &= ia->ia_subnetmask;
ia->ia_net = i & ia->ia_netmask;
ia->ia_subnet = i & ia->ia_subnetmask;
in_socktrim(&ia->ia_sockmask);
#ifdef DEV_CARP
/*
* XXX: carp(4) does not have interface route
*/
if (ifp->if_type == IFT_CARP)
return (0);
#endif
/*
* Add route for the network.
*/
ia->ia_ifa.ifa_metric = ifp->if_metric;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_addr.s_addr =
htonl(ia->ia_subnet | ~ia->ia_subnetmask);
ia->ia_netbroadcast.s_addr =
htonl(ia->ia_net | ~ ia->ia_netmask);
} else if (ifp->if_flags & IFF_LOOPBACK) {
ia->ia_dstaddr = ia->ia_addr;
flags |= RTF_HOST;
} else if (ifp->if_flags & IFF_POINTOPOINT) {
if (ia->ia_dstaddr.sin_family != AF_INET)
return (0);
flags |= RTF_HOST;
}
if ((error = in_addprefix(ia, flags)) != 0)
return (error);
if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
return (0);
if (ifp->if_flags & IFF_POINTOPOINT) {
if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
return (0);
}
/*
* add a loopback route to self
*/
if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) {
struct route ia_ro;
bzero(&ia_ro, sizeof(ia_ro));
*((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr;
rtalloc_ign_fib(&ia_ro, 0, 0);
if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
(ia_ro.ro_rt->rt_ifp == V_loif)) {
RT_LOCK(ia_ro.ro_rt);
RT_ADDREF(ia_ro.ro_rt);
RTFREE_LOCKED(ia_ro.ro_rt);
} else
error = ifa_add_loopback_route((struct ifaddr *)ia,
(struct sockaddr *)&ia->ia_addr);
if (error == 0)
ia->ia_flags |= IFA_RTSELF;
if (ia_ro.ro_rt != NULL)
RTFREE(ia_ro.ro_rt);
}
return (error);
}
#define rtinitflags(x) \
((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
? RTF_HOST : 0)
/*
* Generate a routing message when inserting or deleting
* an interface address alias.
*/
static void in_addralias_rtmsg(int cmd, struct in_addr *prefix,
struct in_ifaddr *target)
{
struct route pfx_ro;
struct sockaddr_in *pfx_addr;
struct rtentry msg_rt;
/* QL: XXX
* This is a bit questionable because there is no
* additional route entry added/deleted for an address
* alias. Therefore this route report is inaccurate.
*/
bzero(&pfx_ro, sizeof(pfx_ro));
pfx_addr = (struct sockaddr_in *)(&pfx_ro.ro_dst);
pfx_addr->sin_len = sizeof(*pfx_addr);
pfx_addr->sin_family = AF_INET;
pfx_addr->sin_addr = *prefix;
rtalloc_ign_fib(&pfx_ro, 0, 0);
if (pfx_ro.ro_rt != NULL) {
msg_rt = *pfx_ro.ro_rt;
/* QL: XXX
* Point the gateway to the new interface
* address as if a new prefix route entry has
* been added through the new address alias.
* All other parts of the rtentry is accurate,
* e.g., rt_key, rt_mask, rt_ifp etc.
*/
msg_rt.rt_gateway =
(struct sockaddr *)&target->ia_addr;
rt_newaddrmsg(cmd,
(struct ifaddr *)target,
0, &msg_rt);
RTFREE(pfx_ro.ro_rt);
}
return;
}
/*
* Check if we have a route for the given prefix already or add one accordingly.
*/
static int
in_addprefix(struct in_ifaddr *target, int flags)
{
struct in_ifaddr *ia;
struct in_addr prefix, mask, p, m;
int error;
if ((flags & RTF_HOST) != 0) {
prefix = target->ia_dstaddr.sin_addr;
mask.s_addr = 0;
} else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
}
IN_IFADDR_RLOCK();
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia)) {
p = ia->ia_addr.sin_addr;
if (prefix.s_addr != p.s_addr)
continue;
} else {
p = ia->ia_addr.sin_addr;
m = ia->ia_sockmask.sin_addr;
p.s_addr &= m.s_addr;
if (prefix.s_addr != p.s_addr ||
mask.s_addr != m.s_addr)
continue;
}
/*
* If we got a matching prefix route inserted by other
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
#ifdef RADIX_MPATH
if (ia->ia_addr.sin_addr.s_addr ==
target->ia_addr.sin_addr.s_addr)
return (EEXIST);
else
break;
#endif
if (V_sameprefixcarponly &&
target->ia_ifp->if_type != IFT_CARP &&
ia->ia_ifp->if_type != IFT_CARP) {
IN_IFADDR_RUNLOCK();
return (EEXIST);
} else {
in_addralias_rtmsg(RTM_ADD, &prefix, target);
IN_IFADDR_RUNLOCK();
return (0);
}
}
}
IN_IFADDR_RUNLOCK();
/*
* No-one seem to have this prefix route, so we try to insert it.
*/
error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags);
if (!error)
target->ia_flags |= IFA_ROUTE;
return (error);
}
extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
/*
* If there is no other address in the system that can serve a route to the
* same prefix, remove the route. Hand over the route to the new address
* otherwise.
*/
static int
in_scrubprefix(struct in_ifaddr *target)
{
struct in_ifaddr *ia;
struct in_addr prefix, mask, p;
int error = 0;
struct sockaddr_in prefix0, mask0;
/*
* Remove the loopback route to the interface address.
* The "useloopback" setting is not consulted because if the
* user configures an interface address, turns off this
* setting, and then tries to delete that interface address,
* checking the current setting of "useloopback" would leave
* that interface address loopback route untouched, which
* would be wrong. Therefore the interface address loopback route
* deletion is unconditional.
*/
if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) &&
!(target->ia_ifp->if_flags & IFF_LOOPBACK) &&
(target->ia_flags & IFA_RTSELF)) {
struct route ia_ro;
int freeit = 0;
bzero(&ia_ro, sizeof(ia_ro));
*((struct sockaddr_in *)(&ia_ro.ro_dst)) = target->ia_addr;
rtalloc_ign_fib(&ia_ro, 0, 0);
if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
(ia_ro.ro_rt->rt_ifp == V_loif)) {
RT_LOCK(ia_ro.ro_rt);
if (ia_ro.ro_rt->rt_refcnt <= 1)
freeit = 1;
else
RT_REMREF(ia_ro.ro_rt);
RTFREE_LOCKED(ia_ro.ro_rt);
}
if (freeit)
error = ifa_del_loopback_route((struct ifaddr *)target,
(struct sockaddr *)&target->ia_addr);
if (error == 0)
target->ia_flags &= ~IFA_RTSELF;
/* remove arp cache */
arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr);
}
if (rtinitflags(target))
prefix = target->ia_dstaddr.sin_addr;
else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
}
if ((target->ia_flags & IFA_ROUTE) == 0) {
in_addralias_rtmsg(RTM_DELETE, &prefix, target);
return (0);
}
IN_IFADDR_RLOCK();
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia))
p = ia->ia_dstaddr.sin_addr;
else {
p = ia->ia_addr.sin_addr;
p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
}
if (prefix.s_addr != p.s_addr)
continue;
/*
* If we got a matching prefix address, move IFA_ROUTE and
* the route itself to it. Make sure that routing daemons
* get a heads-up.
*
* XXX: a special case for carp(4) interface
*/
if ((ia->ia_flags & IFA_ROUTE) == 0
#ifdef DEV_CARP
&& (ia->ia_ifp->if_type != IFT_CARP)
#endif
) {
IN_IFADDR_RUNLOCK();
rtinit(&(target->ia_ifa), (int)RTM_DELETE,
rtinitflags(target));
target->ia_flags &= ~IFA_ROUTE;
error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
rtinitflags(ia) | RTF_UP);
if (error == 0)
ia->ia_flags |= IFA_ROUTE;
return (error);
}
}
IN_IFADDR_RUNLOCK();
/*
* remove all L2 entries on the given prefix
*/
bzero(&prefix0, sizeof(prefix0));
prefix0.sin_len = sizeof(prefix0);
prefix0.sin_family = AF_INET;
prefix0.sin_addr.s_addr = target->ia_subnet;
bzero(&mask0, sizeof(mask0));
mask0.sin_len = sizeof(mask0);
mask0.sin_family = AF_INET;
mask0.sin_addr.s_addr = target->ia_subnetmask;
lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
(struct sockaddr *)&mask0);
/*
* As no-one seem to have this prefix, we can remove the route.
*/
rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target));
target->ia_flags &= ~IFA_ROUTE;
return (0);
}
#undef rtinitflags
/*
* Return 1 if the address might be a local broadcast address.
*/
int
in_broadcast(struct in_addr in, struct ifnet *ifp)
{
register struct ifaddr *ifa;
u_long t;
if (in.s_addr == INADDR_BROADCAST ||
in.s_addr == INADDR_ANY)
return (1);
if ((ifp->if_flags & IFF_BROADCAST) == 0)
return (0);
t = ntohl(in.s_addr);
/*
* Look through the list of addresses for a match
* with a broadcast address.
*/
#define ia ((struct in_ifaddr *)ifa)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
(in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
in.s_addr == ia->ia_netbroadcast.s_addr ||
/*
* Check for old-style (host 0) broadcast.
*/
t == ia->ia_subnet || t == ia->ia_net) &&
/*
* Check for an all one subnetmask. These
* only exist when an interface gets a secondary
* address.
*/
ia->ia_subnetmask != (u_long)0xffffffff)
return (1);
return (0);
#undef ia
}
/*
* On interface removal, clean up IPv4 data structures hung off of the ifnet.
*/
void
in_ifdetach(struct ifnet *ifp)
{
in_pcbpurgeif0(&V_ripcbinfo, ifp);
in_pcbpurgeif0(&V_udbinfo, ifp);
in_purgemaddrs(ifp);
}
/*
* Delete all IPv4 multicast address records, and associated link-layer
* multicast address records, associated with ifp.
* XXX It looks like domifdetach runs AFTER the link layer cleanup.
* XXX This should not race with ifma_protospec being set during
* a new allocation, if it does, we have bigger problems.
*/
static void
in_purgemaddrs(struct ifnet *ifp)
{
LIST_HEAD(,in_multi) purgeinms;
struct in_multi *inm, *tinm;
struct ifmultiaddr *ifma;
LIST_INIT(&purgeinms);
IN_MULTI_LOCK();
/*
* Extract list of in_multi associated with the detaching ifp
* which the PF_INET layer is about to release.
* We need to do this as IF_ADDR_LOCK() may be re-acquired
* by code further down.
*/
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
#if 0
KASSERT(ifma->ifma_protospec != NULL,
("%s: ifma_protospec is NULL", __func__));
#endif
inm = (struct in_multi *)ifma->ifma_protospec;
LIST_INSERT_HEAD(&purgeinms, inm, inm_link);
}
IF_ADDR_UNLOCK(ifp);
LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) {
LIST_REMOVE(inm, inm_link);
inm_release_locked(inm);
}
igmp_ifdetach(ifp);
IN_MULTI_UNLOCK();
}
#include <net/if_dl.h>
#include <netinet/if_ether.h>
struct in_llentry {
struct llentry base;
struct sockaddr_in l3_addr4;
};
static struct llentry *
in_lltable_new(const struct sockaddr *l3addr, u_int flags)
{
struct in_llentry *lle;
lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_DONTWAIT | M_ZERO);
if (lle == NULL) /* NB: caller generates msg */
return NULL;
callout_init(&lle->base.la_timer, CALLOUT_MPSAFE);
/*
* For IPv4 this will trigger "arpresolve" to generate
* an ARP request.
*/
lle->base.la_expire = time_second; /* mark expired */
lle->l3_addr4 = *(const struct sockaddr_in *)l3addr;
lle->base.lle_refcnt = 1;
LLE_LOCK_INIT(&lle->base);
return &lle->base;
}
/*
* Deletes an address from the address table.
* This function is called by the timer functions
* such as arptimer() and nd6_llinfo_timer(), and
* the caller does the locking.
*/
static void
in_lltable_free(struct lltable *llt, struct llentry *lle)
{
LLE_WUNLOCK(lle);
LLE_LOCK_DESTROY(lle);
free(lle, M_LLTABLE);
}
#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
(((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 )
static void
in_lltable_prefix_free(struct lltable *llt,
const struct sockaddr *prefix,
const struct sockaddr *mask)
{
const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix;
const struct sockaddr_in *msk = (const struct sockaddr_in *)mask;
struct llentry *lle, *next;
register int i;
for (i=0; i < LLTBL_HASHTBL_SIZE; i++) {
LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) {
if (IN_ARE_MASKED_ADDR_EQUAL((struct sockaddr_in *)L3_ADDR(lle),
pfx, msk)) {
int canceled;
canceled = callout_drain(&lle->la_timer);
LLE_WLOCK(lle);
if (canceled)
LLE_REMREF(lle);
llentry_free(lle);
}
}
}
}
static int
in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
{
struct rtentry *rt;
KASSERT(l3addr->sa_family == AF_INET,
("sin_family %d", l3addr->sa_family));
/* XXX rtalloc1 should take a const param */
rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0);
- if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) ||
- ((rt->rt_ifp != ifp) && !(flags & LLE_PUB))) {
+ if (rt == NULL || (!(flags & LLE_PUB) &&
+ ((rt->rt_flags & RTF_GATEWAY) ||
+ (rt->rt_ifp != ifp)))) {
#ifdef DIAGNOSTIC
log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
#endif
if (rt != NULL)
RTFREE_LOCKED(rt);
return (EINVAL);
}
RTFREE_LOCKED(rt);
return 0;
}
/*
* Return NULL if not found or marked for deletion.
* If found return lle read locked.
*/
static struct llentry *
in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
{
const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
struct llentries *lleh;
u_int hashkey;
IF_AFDATA_LOCK_ASSERT(ifp);
KASSERT(l3addr->sa_family == AF_INET,
("sin_family %d", l3addr->sa_family));
hashkey = sin->sin_addr.s_addr;
lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)];
LIST_FOREACH(lle, lleh, lle_next) {
struct sockaddr_in *sa2 = (struct sockaddr_in *)L3_ADDR(lle);
if (lle->la_flags & LLE_DELETED)
continue;
if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr)
break;
}
if (lle == NULL) {
#ifdef DIAGNOSTIC
if (flags & LLE_DELETE)
log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle);
#endif
if (!(flags & LLE_CREATE))
return (NULL);
/*
* A route that covers the given address must have
* been installed 1st because we are doing a resolution,
* verify this.
*/
if (!(flags & LLE_IFADDR) &&
in_lltable_rtcheck(ifp, flags, l3addr) != 0)
goto done;
lle = in_lltable_new(l3addr, flags);
if (lle == NULL) {
log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
goto done;
}
lle->la_flags = flags & ~LLE_CREATE;
if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) {
bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen);
lle->la_flags |= (LLE_VALID | LLE_STATIC);
}
lle->lle_tbl = llt;
lle->lle_head = lleh;
LIST_INSERT_HEAD(lleh, lle, lle_next);
} else if (flags & LLE_DELETE) {
if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
LLE_WLOCK(lle);
lle->la_flags = LLE_DELETED;
LLE_WUNLOCK(lle);
#ifdef DIAGNOSTIC
log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
#endif
}
lle = (void *)-1;
}
if (LLE_IS_VALID(lle)) {
if (flags & LLE_EXCLUSIVE)
LLE_WLOCK(lle);
else
LLE_RLOCK(lle);
}
done:
return (lle);
}
static int
in_lltable_dump(struct lltable *llt, struct sysctl_req *wr)
{
#define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle))
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
/* XXX stack use */
struct {
struct rt_msghdr rtm;
struct sockaddr_inarp sin;
struct sockaddr_dl sdl;
} arpc;
int error, i;
LLTABLE_LOCK_ASSERT();
error = 0;
for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) {
LIST_FOREACH(lle, &llt->lle_head[i], lle_next) {
struct sockaddr_dl *sdl;
/* skip deleted entries */
if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
continue;
/* Skip if jailed and not a valid IP of the prison. */
if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0)
continue;
/*
* produce a msg made of:
* struct rt_msghdr;
* struct sockaddr_inarp; (IPv4)
* struct sockaddr_dl;
*/
bzero(&arpc, sizeof(arpc));
arpc.rtm.rtm_msglen = sizeof(arpc);
arpc.rtm.rtm_version = RTM_VERSION;
arpc.rtm.rtm_type = RTM_GET;
arpc.rtm.rtm_flags = RTF_UP;
arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
arpc.sin.sin_family = AF_INET;
arpc.sin.sin_len = sizeof(arpc.sin);
arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr;
/* publish */
if (lle->la_flags & LLE_PUB) {
arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
/* proxy only */
if (lle->la_flags & LLE_PROXY)
arpc.sin.sin_other = SIN_PROXY;
}
sdl = &arpc.sdl;
sdl->sdl_family = AF_LINK;
sdl->sdl_len = sizeof(*sdl);
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = ifp->if_type;
if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
sdl->sdl_alen = ifp->if_addrlen;
bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
} else {
sdl->sdl_alen = 0;
bzero(LLADDR(sdl), ifp->if_addrlen);
}
arpc.rtm.rtm_rmx.rmx_expire =
lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
if (lle->la_flags & LLE_STATIC)
arpc.rtm.rtm_flags |= RTF_STATIC;
arpc.rtm.rtm_index = ifp->if_index;
error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
if (error)
break;
}
}
return error;
#undef SIN
}
void *
in_domifattach(struct ifnet *ifp)
{
struct in_ifinfo *ii;
struct lltable *llt;
ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO);
llt = lltable_init(ifp, AF_INET);
if (llt != NULL) {
llt->llt_new = in_lltable_new;
llt->llt_free = in_lltable_free;
llt->llt_prefix_free = in_lltable_prefix_free;
llt->llt_rtcheck = in_lltable_rtcheck;
llt->llt_lookup = in_lltable_lookup;
llt->llt_dump = in_lltable_dump;
}
ii->ii_llt = llt;
ii->ii_igmp = igmp_domifattach(ifp);
return ii;
}
void
in_domifdetach(struct ifnet *ifp, void *aux)
{
struct in_ifinfo *ii = (struct in_ifinfo *)aux;
igmp_domifdetach(ifp);
lltable_free(ii->ii_llt);
free(ii, M_IFADDR);
}
Index: stable/8/sys/netinet/in_pcb.c
===================================================================
--- stable/8/sys/netinet/in_pcb.c (revision 209276)
+++ stable/8/sys/netinet/in_pcb.c (revision 209277)
@@ -1,1957 +1,1957 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007-2009 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_ipsec.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <security/mac/mac_framework.h>
/*
* These configure the range of local port addresses assigned to
* "unspecified" outgoing connections/packets/whatever.
*/
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */
/*
* Reserved ports accessible only to root. There are significant
* security considerations that must be accounted for when changing these,
* but the security benefits can be great. Please be careful.
*/
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */
VNET_DEFINE(int, ipport_reservedlow);
/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */
VNET_DEFINE(int, ipport_tcpallocs);
static VNET_DEFINE(int, ipport_tcplastcount);
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
#define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
else if ((var) > (max)) { (var) = (max); }
static void in_pcbremlists(struct inpcb *inp);
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
int error;
#ifdef VIMAGE
error = vnet_sysctl_handle_int(oidp, arg1, arg2, req);
#else
error = sysctl_handle_int(oidp, arg1, arg2, req);
#endif
if (error == 0) {
RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
}
return (error);
}
#undef RANGECHK
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0,
&sysctl_net_ipport_check, "I", "");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, "");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
"allocations before switching to a sequental one");
SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
* NOTE: It is assumed that most of these functions will be called with
* the pcbinfo lock held, and often, the inpcb lock held, as these utility
* functions often modify hash chains or addresses in pcbs.
*/
/*
* Allocate a PCB and associate it with the socket.
* On success return with the PCB locked.
*/
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
struct inpcb *inp;
int error;
INP_INFO_WLOCK_ASSERT(pcbinfo);
error = 0;
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
return (ENOBUFS);
bzero(inp, inp_zero_size);
inp->inp_pcbinfo = pcbinfo;
inp->inp_socket = so;
inp->inp_cred = crhold(so->so_cred);
inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
error = mac_inpcb_init(inp, M_NOWAIT);
if (error != 0)
goto out;
mac_inpcb_create(so, inp);
#endif
#ifdef IPSEC
error = ipsec_init_policy(so, &inp->inp_sp);
if (error != 0) {
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
goto out;
}
#endif /*IPSEC*/
#ifdef INET6
if (INP_SOCKAF(so) == AF_INET6) {
inp->inp_vflag |= INP_IPV6PROTO;
if (V_ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
#ifdef INET6
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
inp->inp_refcount = 1; /* Reference from the inpcbinfo */
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
crfree(inp->inp_cred);
uma_zfree(pcbinfo->ipi_zone, inp);
}
#endif
return (error);
}
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
int anonport, error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
return (EINVAL);
anonport = inp->inp_lport == 0 && (nam == NULL ||
((struct sockaddr_in *)nam)->sin_port == 0);
error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
&inp->inp_lport, cred);
if (error)
return (error);
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
return (EAGAIN);
}
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Set up a bind operation on a PCB, performing port allocation
* as required, but do not actually modify the PCB. Callers can
* either complete the bind by setting inp_laddr/inp_lport and
* calling in_pcbinshash(), or they can just use the resulting
* port and address to authorise the sending of a once-off packet.
*
* On error, the values of *laddrp and *lportp are not changed.
*/
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
u_short *lportp, struct ucred *cred)
{
struct socket *so = inp->inp_socket;
unsigned short *lastport;
struct sockaddr_in *sin;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct in_addr laddr;
u_short lport = 0;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
int dorandom;
/*
* Because no actual state changes occur here, a global write lock on
* the pcbinfo isn't required.
*/
INP_INFO_LOCK_ASSERT(pcbinfo);
INP_LOCK_ASSERT(inp);
if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
return (error);
} else {
sin = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sin))
return (EINVAL);
#ifdef notdef
/*
* We should check the family, but old programs
* incorrectly fail to initialize it.
*/
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
#endif
error = prison_local_ip4(cred, &sin->sin_addr);
if (error)
return (error);
if (sin->sin_port != *lportp) {
/* Don't allow the port to change. */
if (*lportp != 0)
return (EINVAL);
lport = sin->sin_port;
}
/* NB: lport is left as 0 if the port isn't being changed. */
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & SO_REUSEADDR)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
* Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
if (lport) {
struct inpcb *t;
struct tcptw *tw;
/* GROSS */
if (ntohs(lport) <= V_ipport_reservedhigh &&
ntohs(lport) >= V_ipport_reservedlow &&
priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
0))
return (EACCES);
if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
priv_check_cred(inp->inp_cred,
PRIV_NETINET_REUSEPORT, 0) != 0) {
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, INPLOOKUP_WILDCARD, cred);
/*
* XXX
* This entire block sorely needs a rewrite.
*/
if (t &&
((t->inp_flags & INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
(t->inp_socket->so_options &
SO_REUSEPORT) == 0) &&
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
return (EADDRINUSE);
}
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, wild, cred);
if (t && (t->inp_flags & INP_TIMEWAIT)) {
/*
* XXXRW: If an incpb has had its timewait
* state recycled, we treat the address as
* being in use (for now). This is better
* than a panic, but not desirable.
*/
tw = intotw(inp);
if (tw == NULL ||
(reuseport & tw->tw_so_options) == 0)
return (EADDRINUSE);
} else if (t &&
(reuseport & t->inp_socket->so_options) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY ||
INP_SOCKAF(so) ==
INP_SOCKAF(t->inp_socket))
#endif
return (EADDRINUSE);
}
}
}
if (*lportp != 0)
lport = *lportp;
if (lport == 0) {
u_short first, last, aux;
int count;
if (inp->inp_flags & INP_HIGHPORT) {
first = V_ipport_hifirstauto; /* sysctl */
last = V_ipport_hilastauto;
lastport = &pcbinfo->ipi_lasthi;
} else if (inp->inp_flags & INP_LOWPORT) {
error = priv_check_cred(cred,
PRIV_NETINET_RESERVEDPORT, 0);
if (error)
return error;
first = V_ipport_lowfirstauto; /* 1023 */
last = V_ipport_lowlastauto; /* 600 */
lastport = &pcbinfo->ipi_lastlow;
} else {
first = V_ipport_firstauto; /* sysctl */
last = V_ipport_lastauto;
lastport = &pcbinfo->ipi_lastport;
}
/*
* For UDP, use random port allocation as long as the user
* allows it. For TCP (and as of yet unknown) connections,
* use random port allocation only if the user allows it AND
* ipport_tick() allows it.
*/
if (V_ipport_randomized &&
(!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
dorandom = 1;
else
dorandom = 0;
/*
* It makes no sense to do random port allocation if
* we have the only port available.
*/
if (first == last)
dorandom = 0;
/* Make sure to not include UDP packets in the count. */
if (pcbinfo != &V_udbinfo)
V_ipport_tcpallocs++;
/*
* Instead of having two loops further down counting up or down
* make sure that first is always <= last and go with only one
* code path implementing all logic.
*/
if (first > last) {
aux = first;
first = last;
last = aux;
}
if (dorandom)
*lastport = first +
(arc4random() % (last - first));
count = last - first;
do {
if (count-- < 0) /* completely used? */
return (EADDRNOTAVAIL);
++*lastport;
if (*lastport < first || *lastport > last)
*lastport = first;
lport = htons(*lastport);
} while (in_pcblookup_local(pcbinfo, laddr,
lport, wild, cred));
}
*laddrp = laddr.s_addr;
*lportp = lport;
return (0);
}
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin.
* If don't have a local address for this socket yet,
* then pick one.
*/
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
u_short lport, fport;
in_addr_t laddr, faddr;
int anonport, error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
lport = inp->inp_lport;
laddr = inp->inp_laddr.s_addr;
anonport = (lport == 0);
error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
NULL, cred);
if (error)
return (error);
/* Do the initial binding of the local address if required. */
if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
inp->inp_lport = lport;
inp->inp_laddr.s_addr = laddr;
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
return (EAGAIN);
}
}
/* Commit the remaining changes. */
inp->inp_lport = lport;
inp->inp_laddr.s_addr = laddr;
inp->inp_faddr.s_addr = faddr;
inp->inp_fport = fport;
in_pcbrehash(inp);
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Do proper source address selection on an unbound socket in case
* of connect. Take jails into account as well.
*/
static int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
struct ucred *cred)
{
struct ifaddr *ifa;
struct sockaddr *sa;
struct sockaddr_in *sin;
struct route sro;
int error;
KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
/*
* Bypass source address selection and use the primary jail IP
* if requested.
*/
if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
return (0);
error = 0;
bzero(&sro, sizeof(sro));
sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_addr.s_addr = faddr->s_addr;
/*
* If route is known our src addr is taken from the i/f,
* else punt.
*
* Find out route to destination.
*/
if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
/*
* If we found a route, use the address corresponding to
* the outgoing interface.
*
* Otherwise assume faddr is reachable on a directly connected
* network and try to find a corresponding interface to take
* the source address from.
*/
if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
struct in_ifaddr *ia;
struct ifnet *ifp;
ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
if (ia == NULL)
- ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin));
+ ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0));
if (ia == NULL) {
error = ENETUNREACH;
goto done;
}
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
ifa_free(&ia->ia_ifa);
goto done;
}
ifp = ia->ia_ifp;
ifa_free(&ia->ia_ifa);
ia = NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
sin = (struct sockaddr_in *)sa;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)ifa;
break;
}
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
IF_ADDR_UNLOCK(ifp);
goto done;
}
IF_ADDR_UNLOCK(ifp);
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
goto done;
}
/*
* If the outgoing interface on the route found is not
* a loopback interface, use the address from that interface.
* In case of jails do those three steps:
* 1. check if the interface address belongs to the jail. If so use it.
* 2. check if we have any address on the outgoing interface
* belonging to this jail. If so use it.
* 3. as a last resort return the 'default' jail address.
*/
if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
struct in_ifaddr *ia;
struct ifnet *ifp;
/* If not jailed, use the default returned. */
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
goto done;
}
/* Jailed. */
/* 1. Check if the iface address belongs to the jail. */
sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
goto done;
}
/*
* 2. Check if we have any address on the outgoing interface
* belonging to this jail.
*/
ia = NULL;
ifp = sro.ro_rt->rt_ifp;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
sin = (struct sockaddr_in *)sa;
if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)ifa;
break;
}
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
IF_ADDR_UNLOCK(ifp);
goto done;
}
IF_ADDR_UNLOCK(ifp);
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
goto done;
}
/*
* The outgoing interface is marked with 'loopback net', so a route
* to ourselves is here.
* Try to find the interface of the destination address and then
* take the address from there. That interface is not necessarily
* a loopback interface.
* In case of jails, check that it is an address of the jail
* and if we cannot find, fall back to the 'default' jail address.
*/
if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
struct sockaddr_in sain;
struct in_ifaddr *ia;
bzero(&sain, sizeof(struct sockaddr_in));
sain.sin_family = AF_INET;
sain.sin_len = sizeof(struct sockaddr_in);
sain.sin_addr.s_addr = faddr->s_addr;
ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
if (ia == NULL)
- ia = ifatoia(ifa_ifwithnet(sintosa(&sain)));
+ ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0));
if (ia == NULL)
ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
if (ia == NULL) {
error = ENETUNREACH;
goto done;
}
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
ifa_free(&ia->ia_ifa);
goto done;
}
/* Jailed. */
if (ia != NULL) {
struct ifnet *ifp;
ifp = ia->ia_ifp;
ifa_free(&ia->ia_ifa);
ia = NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
sin = (struct sockaddr_in *)sa;
if (prison_check_ip4(cred,
&sin->sin_addr) == 0) {
ia = (struct in_ifaddr *)ifa;
break;
}
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
IF_ADDR_UNLOCK(ifp);
goto done;
}
IF_ADDR_UNLOCK(ifp);
}
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
goto done;
}
done:
if (sro.ro_rt != NULL)
RTFREE(sro.ro_rt);
return (error);
}
/*
* Set up for a connect from a socket to the specified address.
* On entry, *laddrp and *lportp should contain the current local
* address and port for the PCB; these are updated to the values
* that should be placed in inp_laddr and inp_lport to complete
* the connect.
*
* On success, *faddrp and *fportp will be set to the remote address
* and port. These are not updated in the error case.
*
* If the operation fails because the connection already exists,
* *oinpp will be set to the PCB of that connection so that the
* caller can decide to override it. In all other cases, *oinpp
* is set to NULL.
*/
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
struct inpcb **oinpp, struct ucred *cred)
{
struct sockaddr_in *sin = (struct sockaddr_in *)nam;
struct in_ifaddr *ia;
struct inpcb *oinp;
struct in_addr laddr, faddr;
u_short lport, fport;
int error;
/*
* Because a global state change doesn't actually occur here, a read
* lock is sufficient.
*/
INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
INP_LOCK_ASSERT(inp);
if (oinpp != NULL)
*oinpp = NULL;
if (nam->sa_len != sizeof (*sin))
return (EINVAL);
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
if (sin->sin_port == 0)
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
lport = *lportp;
faddr = sin->sin_addr;
fport = sin->sin_port;
if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
* use the primary local address.
* If the supplied address is INADDR_BROADCAST,
* and the primary interface supports broadcast,
* choose the broadcast address for that interface.
*/
if (faddr.s_addr == INADDR_ANY) {
IN_IFADDR_RLOCK();
faddr =
IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
IN_IFADDR_RUNLOCK();
if (cred != NULL &&
(error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
IN_IFADDR_RLOCK();
if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST)
faddr = satosin(&TAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
IN_IFADDR_RUNLOCK();
}
}
if (laddr.s_addr == INADDR_ANY) {
error = in_pcbladdr(inp, &faddr, &laddr, cred);
if (error)
return (error);
/*
* If the destination address is multicast and an outgoing
* interface has been set as a multicast option, use the
* address of that interface as our source address.
*/
if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
inp->inp_moptions != NULL) {
struct ip_moptions *imo;
struct ifnet *ifp;
imo = inp->inp_moptions;
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
IN_IFADDR_RLOCK();
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
if (ia->ia_ifp == ifp)
break;
if (ia == NULL) {
IN_IFADDR_RUNLOCK();
return (EADDRNOTAVAIL);
}
laddr = ia->ia_addr.sin_addr;
IN_IFADDR_RUNLOCK();
}
}
}
oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
0, NULL);
if (oinp != NULL) {
if (oinpp != NULL)
*oinpp = oinp;
return (EADDRINUSE);
}
if (lport == 0) {
error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
cred);
if (error)
return (error);
}
*laddrp = laddr.s_addr;
*lportp = lport;
*faddrp = faddr.s_addr;
*fportp = fport;
return (0);
}
void
in_pcbdisconnect(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
inp->inp_fport = 0;
in_pcbrehash(inp);
}
/*
* in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
* For most protocols, this will be invoked immediately prior to calling
* in_pcbfree(). However, with TCP the inpcb may significantly outlive the
* socket, in which case in_pcbfree() is deferred.
*/
void
in_pcbdetach(struct inpcb *inp)
{
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
/*
* in_pcbfree_internal() frees an inpcb that has been detached from its
* socket, and whose reference count has reached 0. It will also remove the
* inpcb from any global lists it might remain on.
*/
static void
in_pcbfree_internal(struct inpcb *inp)
{
struct inpcbinfo *ipi = inp->inp_pcbinfo;
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
#ifdef IPSEC
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
#endif /* IPSEC */
inp->inp_gencnt = ++ipi->ipi_gencnt;
in_pcbremlists(inp);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
if (inp->in6p_moptions != NULL)
ip6_freemoptions(inp->in6p_moptions);
}
#endif
if (inp->inp_options)
(void)m_free(inp->inp_options);
if (inp->inp_moptions != NULL)
inp_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
crfree(inp->inp_cred);
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
INP_WUNLOCK(inp);
uma_zfree(ipi->ipi_zone, inp);
}
/*
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
* but where the inpcb lock is already held.
*
* While the inpcb will not be freed, releasing the inpcb lock means that the
* connection's state may change, so the caller should be careful to
* revalidate any cached state on reacquiring the lock. Drop the reference
* using in_pcbrele().
*/
void
in_pcbref(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
inp->inp_refcount++;
}
/*
* Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
* return a flag indicating whether or not the inpcb remains valid. If it is
* valid, we return with the inpcb lock held.
*/
int
in_pcbrele(struct inpcb *inp)
{
#ifdef INVARIANTS
struct inpcbinfo *ipi = inp->inp_pcbinfo;
#endif
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
inp->inp_refcount--;
if (inp->inp_refcount > 0)
return (0);
in_pcbfree_internal(inp);
return (1);
}
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
* released using in_pcbrele(), but the inpcb is still unlocked.
*/
void
in_pcbfree(struct inpcb *inp)
{
#ifdef INVARIANTS
struct inpcbinfo *ipi = inp->inp_pcbinfo;
#endif
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
__func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
if (!in_pcbrele(inp))
INP_WUNLOCK(inp);
}
/*
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
* port reservation, and preventing it from being returned by inpcb lookups.
*
* It is used by TCP to mark an inpcb as unused and avoid future packet
* delivery or event notification when a socket remains open but TCP has
* closed. This might occur as a result of a shutdown()-initiated TCP close
* or a RST on the wire, and allows the port binding to be reused while still
* maintaining the invariant that so_pcb always points to a valid inpcb until
* in_pcbdetach().
*
* XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
* lists, but can lead to confusing netstat output, as open sockets with
* closed TCP connections will no longer appear to have their bound port
* number. An explicit flag would be better, as it would allow us to leave
* the port number intact after the connection is dropped.
*
* XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
* in_pcbnotifyall() and in_pcbpurgeif0()?
*/
void
in_pcbdrop(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_flags |= INP_DROPPED;
if (inp->inp_flags & INP_INHASHLIST) {
struct inpcbport *phd = inp->inp_phd;
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
inp->inp_flags &= ~INP_INHASHLIST;
}
}
/*
* Common routines to return the socket addresses associated with inpcbs.
*/
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr_p)
{
struct sockaddr_in *sin;
sin = malloc(sizeof *sin, M_SONAME,
M_WAITOK | M_ZERO);
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = *addr_p;
sin->sin_port = port;
return (struct sockaddr *)sin;
}
int
in_getsockaddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_lport;
addr = inp->inp_laddr;
INP_RUNLOCK(inp);
*nam = in_sockaddr(port, &addr);
return 0;
}
int
in_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_fport;
addr = inp->inp_faddr;
INP_RUNLOCK(inp);
*nam = in_sockaddr(port, &addr);
return 0;
}
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
struct inpcb *(*notify)(struct inpcb *, int))
{
struct inpcb *inp, *inp_temp;
INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0) {
INP_WUNLOCK(inp);
continue;
}
#endif
if (inp->inp_faddr.s_addr != faddr.s_addr ||
inp->inp_socket == NULL) {
INP_WUNLOCK(inp);
continue;
}
if ((*notify)(inp, errno))
INP_WUNLOCK(inp);
}
INP_INFO_WUNLOCK(pcbinfo);
}
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
struct inpcb *inp;
struct ip_moptions *imo;
int i, gap;
INP_INFO_RLOCK(pcbinfo);
LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
if ((inp->inp_vflag & INP_IPV4) &&
imo != NULL) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (imo->imo_multicast_ifp == ifp)
imo->imo_multicast_ifp = NULL;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships;
i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
in_delmulti(imo->imo_membership[i]);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] =
imo->imo_membership[i];
}
imo->imo_num_memberships -= gap;
}
INP_WUNLOCK(inp);
}
INP_INFO_RUNLOCK(pcbinfo);
}
/*
* Lookup a PCB based on the local address and port.
*/
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
u_short lport, int wild_okay, struct ucred *cred)
{
struct inpcb *inp;
#ifdef INET6
int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
int matchwild = 3;
#endif
int wildcard;
INP_INFO_LOCK_ASSERT(pcbinfo);
if (!wild_okay) {
struct inpcbhead *head;
/*
* Look for an unconnected (wildcard foreign addr) PCB that
* matches the local address and port we're looking for.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == INADDR_ANY &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_lport == lport) {
/*
* Found?
*/
if (cred == NULL ||
prison_equal_ip4(cred->cr_prison,
inp->inp_cred->cr_prison))
return (inp);
}
}
/*
* Not found.
*/
return (NULL);
} else {
struct inpcbporthead *porthash;
struct inpcbport *phd;
struct inpcb *match = NULL;
/*
* Best fit PCB lookup.
*
* First see if this local port is in use by looking on the
* port hash list.
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
LIST_FOREACH(phd, porthash, phd_hash) {
if (phd->phd_port == lport)
break;
}
if (phd != NULL) {
/*
* Port is in use by one or more PCBs. Look for best
* fit.
*/
LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
wildcard = 0;
if (cred != NULL &&
!prison_equal_ip4(inp->inp_cred->cr_prison,
cred->cr_prison))
continue;
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
/*
* We never select the PCB that has
* INP_IPV6 flag and is bound to :: if
* we have another PCB which is bound
* to 0.0.0.0. If a PCB has the
* INP_IPV6 flag, then we set its cost
* higher than IPv4 only PCBs.
*
* Note that the case only happens
* when a socket is bound to ::, under
* the condition that the use of the
* mapped address is allowed.
*/
if ((inp->inp_vflag & INP_IPV6) != 0)
wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY)
wildcard++;
if (inp->inp_laddr.s_addr != INADDR_ANY) {
if (laddr.s_addr == INADDR_ANY)
wildcard++;
else if (inp->inp_laddr.s_addr != laddr.s_addr)
continue;
} else {
if (laddr.s_addr != INADDR_ANY)
wildcard++;
}
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
}
return (match);
}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
/*
* Lookup PCB in hash list.
*/
struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
struct ifnet *ifp)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
u_short fport = fport_arg, lport = lport_arg;
INP_INFO_LOCK_ASSERT(pcbinfo);
/*
* First look for an exact match.
*/
tmpinp = NULL;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == faddr.s_addr &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_fport == fport &&
inp->inp_lport == lport) {
/*
* XXX We should be able to directly return
* the inp here, without any checks.
* Well unless both bound with SO_REUSEPORT?
*/
if (prison_flag(inp->inp_cred, PR_IP4))
return (inp);
if (tmpinp == NULL)
tmpinp = inp;
}
}
if (tmpinp != NULL)
return (tmpinp);
/*
* Then look for a wildcard match, if requested.
*/
if (wildcard == INPLOOKUP_WILDCARD) {
struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
struct inpcb *local_wild_mapped = NULL;
#endif
struct inpcb *jail_wild = NULL;
int injail;
/*
* Order of socket selection - we always prefer jails.
* 1. jailed, non-wild.
* 2. jailed, wild.
* 3. non-jailed, non-wild.
* 4. non-jailed, wild.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY ||
inp->inp_lport != lport)
continue;
/* XXX inp locking */
if (ifp && ifp->if_type == IFT_FAITH &&
(inp->inp_flags & INP_FAITH) == 0)
continue;
injail = prison_flag(inp->inp_cred, PR_IP4);
if (injail) {
if (prison_check_ip4(inp->inp_cred,
&laddr) != 0)
continue;
} else {
if (local_exact != NULL)
continue;
}
if (inp->inp_laddr.s_addr == laddr.s_addr) {
if (injail)
return (inp);
else
local_exact = inp;
} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
/* XXX inp locking, NULL check */
if (inp->inp_vflag & INP_IPV6PROTO)
local_wild_mapped = inp;
else
#endif /* INET6 */
if (injail)
jail_wild = inp;
else
local_wild = inp;
}
} /* LIST_FOREACH */
if (jail_wild != NULL)
return (jail_wild);
if (local_exact != NULL)
return (local_exact);
if (local_wild != NULL)
return (local_wild);
#ifdef INET6
if (local_wild_mapped != NULL)
return (local_wild_mapped);
#endif /* defined(INET6) */
} /* if (wildcard == INPLOOKUP_WILDCARD) */
return (NULL);
}
/*
* Insert PCB onto various hash lists.
*/
int
in_pcbinshash(struct inpcb *inp)
{
struct inpcbhead *pcbhash;
struct inpcbporthead *pcbporthash;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
("in_pcbinshash: INP_INHASHLIST"));
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
else
#endif /* INET6 */
hashkey_faddr = inp->inp_faddr.s_addr;
pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
/*
* Go through port list and look for a head for this lport.
*/
LIST_FOREACH(phd, pcbporthash, phd_hash) {
if (phd->phd_port == inp->inp_lport)
break;
}
/*
* If none exists, malloc one and tack it on.
*/
if (phd == NULL) {
phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
if (phd == NULL) {
return (ENOBUFS); /* XXX */
}
phd->phd_port = inp->inp_lport;
LIST_INIT(&phd->phd_pcblist);
LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
}
inp->inp_phd = phd;
LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
inp->inp_flags |= INP_INHASHLIST;
return (0);
}
/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
* hashed port list would have to be updated as well), so the lport must
* not change after in_pcbinshash() has been called.
*/
void
in_pcbrehash(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbhead *head;
u_int32_t hashkey_faddr;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_flags & INP_INHASHLIST,
("in_pcbrehash: !INP_INHASHLIST"));
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
else
#endif /* INET6 */
hashkey_faddr = inp->inp_faddr.s_addr;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
/*
* Remove PCB from various lists.
*/
static void
in_pcbremlists(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
if (inp->inp_flags & INP_INHASHLIST) {
struct inpcbport *phd = inp->inp_phd;
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
inp->inp_flags &= ~INP_INHASHLIST;
}
LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
}
/*
* A set label operation has occurred at the socket layer, propagate the
* label change into the in_pcb for the socket.
*/
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
INP_WLOCK(inp);
SOCK_LOCK(so);
mac_inpcb_sosetlabel(so, inp);
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
#endif
}
/*
* ipport_tick runs once per second, determining if random port allocation
* should be continued. If more than ipport_randomcps ports have been
* allocated in the last second, then we return to sequential port
* allocation. We return to random allocation only once we drop below
* ipport_randomcps for at least ipport_randomtime seconds.
*/
void
ipport_tick(void *xtp)
{
VNET_ITERATOR_DECL(vnet_iter);
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
if (V_ipport_tcpallocs <=
V_ipport_tcplastcount + V_ipport_randomcps) {
if (V_ipport_stoprandom > 0)
V_ipport_stoprandom--;
} else
V_ipport_stoprandom = V_ipport_randomtime;
V_ipport_tcplastcount = V_ipport_tcpallocs;
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
}
void
inp_wlock(struct inpcb *inp)
{
INP_WLOCK(inp);
}
void
inp_wunlock(struct inpcb *inp)
{
INP_WUNLOCK(inp);
}
void
inp_rlock(struct inpcb *inp)
{
INP_RLOCK(inp);
}
void
inp_runlock(struct inpcb *inp)
{
INP_RUNLOCK(inp);
}
#ifdef INVARIANTS
void
inp_lock_assert(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
}
void
inp_unlock_assert(struct inpcb *inp)
{
INP_UNLOCK_ASSERT(inp);
}
#endif
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
struct inpcb *inp;
INP_INFO_RLOCK(&V_tcbinfo);
LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_tcbinfo);
}
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
return (inp->inp_socket);
}
struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
return ((struct tcpcb *)inp->inp_ppcb);
}
int
inp_ip_tos_get(const struct inpcb *inp)
{
return (inp->inp_ip_tos);
}
void
inp_ip_tos_set(struct inpcb *inp, int val)
{
inp->inp_ip_tos = val;
}
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp)
{
INP_LOCK_ASSERT(inp);
*laddr = inp->inp_laddr.s_addr;
*faddr = inp->inp_faddr.s_addr;
*lp = inp->inp_lport;
*fp = inp->inp_fport;
}
struct inpcb *
so_sotoinpcb(struct socket *so)
{
return (sotoinpcb(so));
}
struct tcpcb *
so_sototcpcb(struct socket *so)
{
return (sototcpcb(so));
}
#ifdef DDB
static void
db_print_indent(int indent)
{
int i;
for (i = 0; i < indent; i++)
db_printf(" ");
}
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
char faddr_str[48], laddr_str[48];
db_print_indent(indent);
db_printf("%s at %p\n", name, inc);
indent += 2;
#ifdef INET6
if (inc->inc_flags & INC_ISIPV6) {
/* IPv6. */
ip6_sprintf(laddr_str, &inc->inc6_laddr);
ip6_sprintf(faddr_str, &inc->inc6_faddr);
} else {
#endif
/* IPv4. */
inet_ntoa_r(inc->inc_laddr, laddr_str);
inet_ntoa_r(inc->inc_faddr, faddr_str);
#ifdef INET6
}
#endif
db_print_indent(indent);
db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
ntohs(inc->inc_lport));
db_print_indent(indent);
db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
ntohs(inc->inc_fport));
}
static void
db_print_inpflags(int inp_flags)
{
int comma;
comma = 0;
if (inp_flags & INP_RECVOPTS) {
db_printf("%sINP_RECVOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVRETOPTS) {
db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVDSTADDR) {
db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HDRINCL) {
db_printf("%sINP_HDRINCL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HIGHPORT) {
db_printf("%sINP_HIGHPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_LOWPORT) {
db_printf("%sINP_LOWPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ANONPORT) {
db_printf("%sINP_ANONPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVIF) {
db_printf("%sINP_RECVIF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_MTUDISC) {
db_printf("%sINP_MTUDISC", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_FAITH) {
db_printf("%sINP_FAITH", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVTTL) {
db_printf("%sINP_RECVTTL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DONTFRAG) {
db_printf("%sINP_DONTFRAG", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_IPV6_V6ONLY) {
db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_PKTINFO) {
db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPLIMIT) {
db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPOPTS) {
db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_DSTOPTS) {
db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDR) {
db_printf("%sIN6P_RTHDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDRDSTOPTS) {
db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_TCLASS) {
db_printf("%sIN6P_TCLASS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_AUTOFLOWLABEL) {
db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_TIMEWAIT) {
db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ONESBCAST) {
db_printf("%sINP_ONESBCAST", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DROPPED) {
db_printf("%sINP_DROPPED", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_SOCKREF) {
db_printf("%sINP_SOCKREF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RFC2292) {
db_printf("%sIN6P_RFC2292", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_MTU) {
db_printf("IN6P_MTU%s", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_inpvflag(u_char inp_vflag)
{
int comma;
comma = 0;
if (inp_vflag & INP_IPV4) {
db_printf("%sINP_IPV4", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_IPV6) {
db_printf("%sINP_IPV6", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_IPV6PROTO) {
db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{
db_print_indent(indent);
db_printf("%s at %p\n", name, inp);
indent += 2;
db_print_indent(indent);
db_printf("inp_flow: 0x%x\n", inp->inp_flow);
db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
db_print_indent(indent);
db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
db_print_indent(indent);
db_printf("inp_label: %p inp_flags: 0x%x (",
inp->inp_label, inp->inp_flags);
db_print_inpflags(inp->inp_flags);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
inp->inp_vflag);
db_print_inpvflag(inp->inp_vflag);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
db_print_indent(indent);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
db_printf("in6p_options: %p in6p_outputopts: %p "
"in6p_moptions: %p\n", inp->in6p_options,
inp->in6p_outputopts, inp->in6p_moptions);
db_printf("in6p_icmp6filt: %p in6p_cksum %d "
"in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
inp->in6p_hops);
} else
#endif
{
db_printf("inp_ip_tos: %d inp_ip_options: %p "
"inp_ip_moptions: %p\n", inp->inp_ip_tos,
inp->inp_options, inp->inp_moptions);
}
db_print_indent(indent);
db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
(uintmax_t)inp->inp_gencnt);
}
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
struct inpcb *inp;
if (!have_addr) {
db_printf("usage: show inpcb <addr>\n");
return;
}
inp = (struct inpcb *)addr;
db_print_inpcb(inp, "inpcb", 0);
}
#endif
Index: stable/8/sys/netinet/ip_options.c
===================================================================
--- stable/8/sys/netinet/ip_options.c (revision 209276)
+++ stable/8/sys/netinet/ip_options.c (revision 209277)
@@ -1,745 +1,745 @@
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipstealth.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip_icmp.h>
#include <machine/in_cksum.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
&ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
CTLFLAG_RW, &ip_acceptsourceroute, 0,
"Enable accepting source routed IP packets");
int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */
SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW,
&ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)");
static void save_rte(struct mbuf *m, u_char *, struct in_addr);
/*
* Do option processing on a datagram, possibly discarding it if bad options
* are encountered, or forwarding it if source-routed.
*
* The pass argument is used when operating in the IPSTEALTH mode to tell
* what options to process: [LS]SRR (pass 0) or the others (pass 1). The
* reason for as many as two passes is that when doing IPSTEALTH, non-routing
* options should be processed only if the packet is for us.
*
* Returns 1 if packet has been forwarded/freed, 0 if the packet should be
* processed further.
*/
int
ip_dooptions(struct mbuf *m, int pass)
{
struct ip *ip = mtod(m, struct ip *);
u_char *cp;
struct in_ifaddr *ia;
int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
struct in_addr *sin, dst;
uint32_t ntime;
struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* Ignore or reject packets with IP options. */
if (ip_doopts == 0)
return 0;
else if (ip_doopts == 2) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_FILTER_PROHIB;
goto bad;
}
dst = ip->ip_dst;
cp = (u_char *)(ip + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
}
switch (opt) {
default:
break;
/*
* Source routing with record. Find interface with current
* destination address. If none on this machine then drop if
* strictly routed, or do nothing if loosely routed. Record
* interface address and bring up next address component. If
* strictly routed make sure next address is on directly
* accessible net.
*/
case IPOPT_LSRR:
case IPOPT_SSRR:
#ifdef IPSTEALTH
if (V_ipstealth && pass > 0)
break;
#endif
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
ipaddr.sin_addr = ip->ip_dst;
if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr)
== 0) {
if (opt == IPOPT_SSRR) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
if (!ip_dosourceroute)
goto nosourcerouting;
/*
* Loose routing, and not at next destination
* yet; nothing to do except forward.
*/
break;
}
off--; /* 0 origin */
if (off > optlen - (int)sizeof(struct in_addr)) {
/*
* End of source route. Should be for us.
*/
if (!ip_acceptsourceroute)
goto nosourcerouting;
save_rte(m, cp, ip->ip_src);
break;
}
#ifdef IPSTEALTH
if (V_ipstealth)
goto dropit;
#endif
if (!ip_dosourceroute) {
if (V_ipforwarding) {
char buf[16]; /* aaa.bbb.ccc.ddd\0 */
/*
* Acting as a router, so generate
* ICMP
*/
nosourcerouting:
strcpy(buf, inet_ntoa(ip->ip_dst));
log(LOG_WARNING,
"attempted source route from %s to %s\n",
inet_ntoa(ip->ip_src), buf);
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
} else {
/*
* Not acting as a router, so
* silently drop.
*/
#ifdef IPSTEALTH
dropit:
#endif
IPSTAT_INC(ips_cantforward);
m_freem(m);
return (1);
}
}
/*
* locate outgoing interface
*/
(void)memcpy(&ipaddr.sin_addr, cp + off,
sizeof(ipaddr.sin_addr));
if (opt == IPOPT_SSRR) {
#define INA struct in_ifaddr *
#define SA struct sockaddr *
if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL)
- ia = (INA)ifa_ifwithnet((SA)&ipaddr);
+ ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0);
} else
/* XXX MRT 0 for routing */
ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m));
if (ia == NULL) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
ip->ip_dst = ipaddr.sin_addr;
(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
/*
* Let ip_intr's mcast routing check handle mcast pkts
*/
forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
break;
case IPOPT_RR:
#ifdef IPSTEALTH
if (V_ipstealth && pass == 0)
break;
#endif
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
/*
* If no space remains, ignore.
*/
off--; /* 0 origin */
if (off > optlen - (int)sizeof(struct in_addr))
break;
(void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
sizeof(ipaddr.sin_addr));
/*
* Locate outgoing interface; if we're the
* destination, use the incoming interface (should be
* same).
*/
if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
(ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
goto bad;
}
(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
break;
case IPOPT_TS:
#ifdef IPSTEALTH
if (V_ipstealth && pass == 0)
break;
#endif
code = cp - (u_char *)ip;
if (optlen < 4 || optlen > 40) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < 5) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if (off > optlen - (int)sizeof(int32_t)) {
cp[IPOPT_OFFSET + 1] += (1 << 4);
if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
break;
}
off--; /* 0 origin */
sin = (struct in_addr *)(cp + off);
switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
case IPOPT_TS_TSONLY:
break;
case IPOPT_TS_TSANDADDR:
if (off + sizeof(uint32_t) +
sizeof(struct in_addr) > optlen) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
ipaddr.sin_addr = dst;
ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
m->m_pkthdr.rcvif);
if (ia == NULL)
continue;
(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
sizeof(struct in_addr));
ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
case IPOPT_TS_PRESPEC:
if (off + sizeof(uint32_t) +
sizeof(struct in_addr) > optlen) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
(void)memcpy(&ipaddr.sin_addr, sin,
sizeof(struct in_addr));
if (ifa_ifwithaddr((SA)&ipaddr) == NULL)
continue;
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
default:
code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
goto bad;
}
ntime = iptime();
(void)memcpy(cp + off, &ntime, sizeof(uint32_t));
cp[IPOPT_OFFSET] += sizeof(uint32_t);
}
}
if (forward && V_ipforwarding) {
ip_forward(m, 1);
return (1);
}
return (0);
bad:
icmp_error(m, type, code, 0, 0);
IPSTAT_INC(ips_badoptions);
return (1);
}
/*
* Save incoming source route for use in replies, to be picked up later by
* ip_srcroute if the receiver is interested.
*/
static void
save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
{
unsigned olen;
struct ipopt_tag *opts;
opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS,
sizeof(struct ipopt_tag), M_NOWAIT);
if (opts == NULL)
return;
olen = option[IPOPT_OLEN];
if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) {
m_tag_free((struct m_tag *)opts);
return;
}
bcopy(option, opts->ip_srcrt.srcopt, olen);
opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
opts->ip_srcrt.dst = dst;
m_tag_prepend(m, (struct m_tag *)opts);
}
/*
* Retrieve incoming source route for use in replies, in the same form used
* by setsockopt. The first hop is placed before the options, will be
* removed later.
*/
struct mbuf *
ip_srcroute(struct mbuf *m0)
{
struct in_addr *p, *q;
struct mbuf *m;
struct ipopt_tag *opts;
opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL);
if (opts == NULL)
return (NULL);
if (opts->ip_nhops == 0)
return (NULL);
m = m_get(M_DONTWAIT, MT_DATA);
if (m == NULL)
return (NULL);
#define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt))
/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
m->m_len = opts->ip_nhops * sizeof(struct in_addr) +
sizeof(struct in_addr) + OPTSIZ;
/*
* First, save first hop for return route.
*/
p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]);
*(mtod(m, struct in_addr *)) = *p--;
/*
* Copy option fields and padding (nop) to mbuf.
*/
opts->ip_srcrt.nop = IPOPT_NOP;
opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
(void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
&(opts->ip_srcrt.nop), OPTSIZ);
q = (struct in_addr *)(mtod(m, caddr_t) +
sizeof(struct in_addr) + OPTSIZ);
#undef OPTSIZ
/*
* Record return path as an IP source route, reversing the path
* (pointers are now aligned).
*/
while (p >= opts->ip_srcrt.route) {
*q++ = *p--;
}
/*
* Last hop goes to final destination.
*/
*q = opts->ip_srcrt.dst;
m_tag_delete(m0, (struct m_tag *)opts);
return (m);
}
/*
* Strip out IP options, at higher level protocol in the kernel. Second
* argument is buffer to which options will be moved, and return value is
* their length.
*
* XXX should be deleted; last arg currently ignored.
*/
void
ip_stripoptions(struct mbuf *m, struct mbuf *mopt)
{
int i;
struct ip *ip = mtod(m, struct ip *);
caddr_t opts;
int olen;
olen = (ip->ip_hl << 2) - sizeof (struct ip);
opts = (caddr_t)(ip + 1);
i = m->m_len - (sizeof (struct ip) + olen);
bcopy(opts + olen, opts, (unsigned)i);
m->m_len -= olen;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len -= olen;
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(struct ip) >> 2;
}
/*
* Insert IP options into preformed packet. Adjust IP destination as
* required for IP source routing, as indicated by a non-zero in_addr at the
* start of the options.
*
* XXX This routine assumes that the packet has no options in place.
*/
struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
if (optlen + ip->ip_len > IP_MAXPACKET) {
*phlen = 0;
return (m); /* XXX should fail */
}
if (p->ipopt_dst.s_addr)
ip->ip_dst = p->ipopt_dst;
if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n == NULL) {
*phlen = 0;
return (m);
}
M_MOVE_PKTHDR(n, m);
n->m_pkthdr.rcvif = NULL;
n->m_pkthdr.len += optlen;
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
m = n;
m->m_len = optlen + sizeof(struct ip);
m->m_data += max_linkhdr;
bcopy(ip, mtod(m, void *), sizeof(struct ip));
} else {
m->m_data -= optlen;
m->m_len += optlen;
m->m_pkthdr.len += optlen;
bcopy(ip, mtod(m, void *), sizeof(struct ip));
}
ip = mtod(m, struct ip *);
bcopy(p->ipopt_list, ip + 1, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_v = IPVERSION;
ip->ip_hl = *phlen >> 2;
ip->ip_len += optlen;
return (m);
}
/*
* Copy options from ip to jp, omitting those not copied during
* fragmentation.
*/
int
ip_optcopy(struct ip *ip, struct ip *jp)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ip + 1);
dp = (u_char *)(jp + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
("ip_optcopy: malformed ipv4 option"));
optlen = cp[IPOPT_OLEN];
KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
("ip_optcopy: malformed ipv4 option"));
/* Bogus lengths should have been caught by ip_dooptions. */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
bcopy(cp, dp, optlen);
dp += optlen;
}
}
for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
*dp++ = IPOPT_EOL;
return (optlen);
}
/*
* Set up IP options in pcb for insertion in output packets. Store in mbuf
* with pointer in pcbopt, adding pseudo-option with destination address if
* source routed.
*/
int
ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
{
int cnt, optlen;
u_char *cp;
struct mbuf **pcbopt;
u_char opt;
INP_WLOCK_ASSERT(inp);
pcbopt = &inp->inp_options;
/* turn off any old options */
if (*pcbopt)
(void)m_free(*pcbopt);
*pcbopt = 0;
if (m == NULL || m->m_len == 0) {
/*
* Only turning off any previous options.
*/
if (m != NULL)
(void)m_free(m);
return (0);
}
if (m->m_len % sizeof(int32_t))
goto bad;
/*
* IP first-hop destination address will be stored before actual
* options; move other options back and clear it when none present.
*/
if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
goto bad;
cnt = m->m_len;
m->m_len += sizeof(struct in_addr);
cp = mtod(m, u_char *) + sizeof(struct in_addr);
bcopy(mtod(m, void *), cp, (unsigned)cnt);
bzero(mtod(m, void *), sizeof(struct in_addr));
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp))
goto bad;
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
goto bad;
}
switch (opt) {
default:
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/*
* User process specifies route as:
*
* ->A->B->C->D
*
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear
* in actual IP option, but is stored before the
* options.
*/
/* XXX-BZ PRIV_NETINET_SETHDROPTS? */
if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
goto bad;
m->m_len -= sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
optlen -= sizeof(struct in_addr);
cp[IPOPT_OLEN] = optlen;
/*
* Move first hop before start of options.
*/
bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
sizeof(struct in_addr));
/*
* Then copy rest of options back
* to close up the deleted entry.
*/
bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
&cp[IPOPT_OFFSET+1],
(unsigned)cnt - (IPOPT_MINOFF - 1));
break;
}
}
if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
*pcbopt = m;
return (0);
bad:
(void)m_free(m);
return (EINVAL);
}
/*
* Check for the presence of the IP Router Alert option [RFC2113]
* in the header of an IPv4 datagram.
*
* This call is not intended for use from the forwarding path; it is here
* so that protocol domains may check for the presence of the option.
* Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
* option does not have much relevance to the implementation, though this
* may change in future.
* Router alert options SHOULD be passed if running in IPSTEALTH mode and
* we are not the endpoint.
* Length checks on individual options should already have been peformed
* by ip_dooptions() therefore they are folded under INVARIANTS here.
*
* Return zero if not present or options are invalid, non-zero if present.
*/
int
ip_checkrouteralert(struct mbuf *m)
{
struct ip *ip = mtod(m, struct ip *);
u_char *cp;
int opt, optlen, cnt, found_ra;
found_ra = 0;
cp = (u_char *)(ip + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
#ifdef INVARIANTS
if (cnt < IPOPT_OLEN + sizeof(*cp))
break;
#endif
optlen = cp[IPOPT_OLEN];
#ifdef INVARIANTS
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
break;
#endif
}
switch (opt) {
case IPOPT_RA:
#ifdef INVARIANTS
if (optlen != IPOPT_OFFSET + sizeof(uint16_t) ||
(*((uint16_t *)&cp[IPOPT_OFFSET]) != 0))
break;
else
#endif
found_ra = 1;
break;
default:
break;
}
}
return (found_ra);
}
Index: stable/8/sys/netinet/ip_output.c
===================================================================
--- stable/8/sys/netinet/ip_output.c (revision 209276)
+++ stable/8/sys/netinet/ip_output.c (revision 209277)
@@ -1,1278 +1,1278 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_route.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/netisr.h>
#include <net/pfil.h>
#include <net/route.h>
#include <net/flowtable.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef SCTP
#include <netinet/sctp.h>
#include <netinet/sctp_crc32.h>
#endif
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#include <netipsec/ipsec.h>
#endif /* IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
x, (ntohl(a.s_addr)>>24)&0xFF,\
(ntohl(a.s_addr)>>16)&0xFF,\
(ntohl(a.s_addr)>>8)&0xFF,\
(ntohl(a.s_addr))&0xFF, y);
VNET_DEFINE(u_short, ip_id);
#ifdef MBUF_STRESS_TEST
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
static void ip_mloopback
(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
extern int in_mcast_loop;
extern struct protosw inetsw[];
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
* In the IP forwarding case, the packet will arrive with options already
* inserted, so must have a NULL opt pointer.
*/
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m0;
int hlen = sizeof (struct ip);
int mtu;
int len, error = 0;
int nortfree = 0;
struct sockaddr_in *dst = NULL; /* keep compiler happy */
struct in_ifaddr *ia = NULL;
int isbroadcast, sw_csum;
struct route iproute;
struct rtentry *rte; /* cache for ro->ro_rt */
struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag = NULL;
#endif
#ifdef IPSEC
int no_route_but_check_spd = 0;
#endif
M_ASSERTPKTHDR(m);
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
M_SETFIB(m, inp->inp_inc.inc_fibnum);
if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
m->m_pkthdr.flowid = inp->inp_flowid;
m->m_flags |= M_FLOWID;
}
}
if (ro == NULL) {
ro = &iproute;
bzero(ro, sizeof (*ro));
#ifdef FLOWTABLE
{
struct flentry *fle;
/*
* The flow table returns route entries valid for up to 30
* seconds; we rely on the remainder of ip_output() taking no
* longer than that long for the stability of ro_rt. The
* flow ID assignment must have happened before this point.
*/
if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
flow_to_route(fle, ro);
nortfree = 1;
}
}
#endif
}
if (opt) {
len = 0;
m = ip_insertoptions(m, opt, &len);
if (len != 0)
hlen = len;
}
ip = mtod(m, struct ip *);
/*
* Fill in IP header. If we are not allowing fragmentation,
* then the ip_id field is meaningless, but we don't set it
* to zero. Doing so causes various problems when devices along
* the path (routers, load balancers, firewalls, etc.) illegally
* disable DF on our packet. Note that a 16-bit counter
* will wrap around in less than 10 seconds at 100 Mbit/s on a
* medium with MTU 1500. See Steven M. Bellovin, "A Technique
* for Counting NATted Hosts", Proc. IMW'02, available at
* <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_hl = hlen >> 2;
ip->ip_id = ip_newid();
IPSTAT_INC(ips_localout);
} else {
hlen = ip->ip_hl << 2;
}
dst = (struct sockaddr_in *)&ro->ro_dst;
again:
/*
* If there is a cached route,
* check that it is to the same destination
* and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing the
* cache with IPv6.
*/
rte = ro->ro_rt;
if (rte && ((rte->rt_flags & RTF_UP) == 0 ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp) ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
if (!nortfree)
RTFREE(rte);
rte = ro->ro_rt = (struct rtentry *)NULL;
ro->ro_lle = (struct llentry *)NULL;
}
#ifdef IPFIREWALL_FORWARD
if (rte == NULL && fwd_tag == NULL) {
#else
if (rte == NULL) {
#endif
bzero(dst, sizeof(*dst));
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = ip->ip_dst;
}
/*
* If routing to interface only, short circuit routing lookup.
* The use of an all-ones broadcast address implies this; an
* interface is specified by the broadcast address of an interface,
* or the destination address of a ptp interface.
*/
if (flags & IP_SENDONES) {
if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
(ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
ip->ip_dst.s_addr = INADDR_BROADCAST;
dst->sin_addr = ip->ip_dst;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = 1;
} else if (flags & IP_ROUTETOIF) {
if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
- (ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
+ (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = in_broadcast(dst->sin_addr, ifp);
} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
imo != NULL && imo->imo_multicast_ifp != NULL) {
/*
* Bypass the normal routing lookup for multicast
* packets if the interface is specified.
*/
ifp = imo->imo_multicast_ifp;
IFP_TO_IA(ifp, ia);
isbroadcast = 0; /* fool gcc */
} else {
/*
* We want to do any cloning requested by the link layer,
* as this is probably required in all cases for correct
* operation (as it is for ARP).
*/
if (rte == NULL) {
#ifdef RADIX_MPATH
rtalloc_mpath_fib(ro,
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#else
in_rtalloc_ign(ro, 0,
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#endif
rte = ro->ro_rt;
}
if (rte == NULL ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp)) {
#ifdef IPSEC
/*
* There is no route for this packet, but it is
* possible that a matching SPD entry exists.
*/
no_route_but_check_spd = 1;
mtu = 0; /* Silence GCC warning. */
goto sendit;
#endif
IPSTAT_INC(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
ia = ifatoia(rte->rt_ifa);
ifa_ref(&ia->ia_ifa);
ifp = rte->rt_ifp;
rte->rt_rmx.rmx_pksent++;
if (rte->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)rte->rt_gateway;
if (rte->rt_flags & RTF_HOST)
isbroadcast = (rte->rt_flags & RTF_BROADCAST);
else
isbroadcast = in_broadcast(dst->sin_addr, ifp);
}
/*
* Calculate MTU. If we have a route that is up, use that,
* otherwise use the interface's MTU.
*/
if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) {
/*
* This case can happen if the user changed the MTU
* of an interface after enabling IP on it. Because
* most netifs don't keep track of routes pointing to
* them, there is no way for one to update all its
* routes when the MTU is changed.
*/
if (rte->rt_rmx.rmx_mtu > ifp->if_mtu)
rte->rt_rmx.rmx_mtu = ifp->if_mtu;
mtu = rte->rt_rmx.rmx_mtu;
} else {
mtu = ifp->if_mtu;
}
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
m->m_flags |= M_MCAST;
/*
* IP destination address is multicast. Make sure "dst"
* still points to the address in "ro". (It may have been
* changed to point to a gateway address, above.)
*/
dst = (struct sockaddr_in *)&ro->ro_dst;
/*
* See if the caller provided any multicast options
*/
if (imo != NULL) {
ip->ip_ttl = imo->imo_multicast_ttl;
if (imo->imo_multicast_vif != -1)
ip->ip_src.s_addr =
ip_mcast_src ?
ip_mcast_src(imo->imo_multicast_vif) :
INADDR_ANY;
} else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* Confirm that the outgoing interface supports multicast.
*/
if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
IPSTAT_INC(ips_noroute);
error = ENETUNREACH;
goto bad;
}
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
/* Interface may have no addresses. */
if (ia != NULL)
ip->ip_src = IA_SIN(ia)->sin_addr;
}
if ((imo == NULL && in_mcast_loop) ||
(imo && imo->imo_multicast_loop)) {
/*
* Loop back multicast datagram if not expressly
* forbidden to do so, even if we are not a member
* of the group; ip_input() will filter it later,
* thus deferring a hash lookup and mutex acquisition
* at the expense of a cheap copy using m_copym().
*/
ip_mloopback(ifp, m, dst, hlen);
} else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
/*
* If rsvp daemon is not running, do not
* set ip_moptions. This ensures that the packet
* is multicast and not just sent down one link
* as prescribed by rsvpd.
*/
if (!V_rsvp_on)
imo = NULL;
if (ip_mforward &&
ip_mforward(ip, ifp, m, imo) != 0) {
m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy. ip_input() will drop the copy if
* this host does not belong to the destination group on
* the loopback interface.
*/
if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
m_freem(m);
goto done;
}
goto sendit;
}
/*
* If the source address is not specified yet, use the address
* of the outoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
/* Interface may have no addresses. */
if (ia != NULL) {
ip->ip_src = IA_SIN(ia)->sin_addr;
}
}
/*
* Verify that we have any chance at all of being able to queue the
* packet or packet fragments, unless ALTQ is enabled on the given
* interface in which case packetdrop should be done by queueing.
*/
#ifdef ALTQ
if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
ifp->if_snd.ifq_maxlen))
#else
if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
ifp->if_snd.ifq_maxlen)
#endif /* ALTQ */
{
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
goto bad;
}
/*
* Look for broadcast address and
* verify user is allowed to send
* such a packet.
*/
if (isbroadcast) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
error = EACCES;
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if (ip->ip_len > mtu) {
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else {
m->m_flags &= ~M_BCAST;
}
sendit:
#ifdef IPSEC
switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) {
case 1:
goto bad;
case -1:
goto done;
case 0:
default:
break; /* Continue with packet processing. */
}
/*
* Check if there was a route for this packet; return error if not.
*/
if (no_route_but_check_spd) {
IPSTAT_INC(ips_noroute);
error = EHOSTUNREACH;
goto bad;
}
/* Update variables that are affected by ipsec4_output(). */
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
#endif /* IPSEC */
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&V_inet_pfil_hook))
goto passout;
/* Run through list of hooks for output packets. */
odst.s_addr = ip->ip_dst.s_addr;
error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
if (error != 0 || m == NULL)
goto done;
ip = mtod(m, struct ip *);
/* See if destination IP address was changed by packet filter. */
if (odst.s_addr != ip->ip_dst.s_addr) {
m->m_flags |= M_SKIP_FIREWALL;
/* If destination is now ourself drop to ip_input(). */
if (in_localip(ip->ip_dst)) {
m->m_flags |= M_FASTFWD_OURS;
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP)
m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
#endif
error = netisr_queue(NETISR_IP, m);
goto done;
} else
goto again; /* Redo the routing table lookup. */
}
#ifdef IPFIREWALL_FORWARD
/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
if (m->m_flags & M_FASTFWD_OURS) {
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP)
m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
#endif
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
error = netisr_queue(NETISR_IP, m);
goto done;
}
/* Or forward to some other address? */
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (fwd_tag) {
dst = (struct sockaddr_in *)&ro->ro_dst;
bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
m->m_flags |= M_SKIP_FIREWALL;
m_tag_delete(m, fwd_tag);
goto again;
}
#endif /* IPFIREWALL_FORWARD */
passout:
/* 127/8 must not appear on wire - RFC1122. */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
IPSTAT_INC(ips_badaddr);
error = EADDRNOTAVAIL;
goto bad;
}
}
m->m_pkthdr.csum_flags |= CSUM_IP;
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
if (sw_csum & CSUM_DELAY_DATA) {
in_delayed_cksum(m);
sw_csum &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP) {
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
sw_csum &= ~CSUM_SCTP;
}
#endif
m->m_pkthdr.csum_flags &= ifp->if_hwassist;
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
*/
if (ip->ip_len <= mtu ||
(m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m, hlen);
/*
* Record statistics for this interface address.
* With CSUM_TSO the byte/packet count will be slightly
* incorrect because we count the IP+TCP headers only
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
if (m->m_pkthdr.csum_flags & CSUM_TSO)
ia->ia_ifa.if_opackets +=
m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
else
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
#ifdef MBUF_STRESS_TEST
if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
#endif
/*
* Reset layer specific mbuf flags
* to avoid confusing lower layers.
*/
m->m_flags &= ~(M_PROTOFLAGS);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro);
goto done;
}
/* Balk when DF bit is set or the interface didn't support TSO. */
if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
}
/*
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
if (error)
goto bad;
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
/* Record statistics for this interface address. */
if (ia != NULL) {
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
/*
* Reset layer specific mbuf flags
* to avoid confusing upper layers.
*/
m->m_flags &= ~(M_PROTOFLAGS);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro);
} else
m_freem(m);
}
if (error == 0)
IPSTAT_INC(ips_fragmented);
done:
if (ro == &iproute && ro->ro_rt && !nortfree) {
RTFREE(ro->ro_rt);
}
if (ia != NULL)
ifa_free(&ia->ia_ifa);
return (error);
bad:
m_freem(m);
goto done;
}
/*
* Create a chain of fragments which fit the given mtu. m_frag points to the
* mbuf to be fragmented; on return it points to the chain with the fragments.
* Return 0 if no error. If error, m_frag may contain a partially built
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
* sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP).
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum)
{
int error = 0;
int hlen = ip->ip_hl << 2;
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
int off;
struct mbuf *m0 = *m_frag; /* the original packet */
int firstlen;
struct mbuf **mnext;
int nfrags;
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
IPSTAT_INC(ips_cantfrag);
return EMSGSIZE;
}
/*
* Must be able to put at least 8 bytes per fragment.
*/
if (len < 8)
return EMSGSIZE;
/*
* If the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
#endif
if (len > PAGE_SIZE) {
/*
* Fragment large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
* could have different page sizes, and also mtu could
* be less than the receiver's page size ?
*/
int newlen;
struct mbuf *m;
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
off += m->m_len;
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & mtu;
if ((newlen + sizeof (struct ip)) > mtu) {
/* we failed, go back the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
len = newlen;
} else {
off = hlen + len;
}
firstlen = off - hlen;
mnext = &m0->m_nextpkt; /* pointer to next packet */
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
* Here, m0 is the original packet, m is the fragment being created.
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
IPSTAT_INC(ips_odropped);
goto done;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
* goes into an additional mbuf chain returned by m_copym().
*/
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
/* XXX do we need to add ip->ip_off below ? */
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= ip->ip_len) { /* last fragment */
len = ip->ip_len - off;
m->m_flags |= M_LASTFRAG;
} else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copym(m0, off, len, M_DONTWAIT);
if (m->m_next == NULL) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
IPSTAT_INC(ips_odropped);
goto done;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = NULL;
#ifdef MAC
mac_netinet_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
}
IPSTAT_ADD(ips_ofragments, nfrags);
/* set first marker for fragment chain */
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
m_adj(m0, hlen + firstlen - ip->ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m0, hlen);
done:
*m_frag = m0;
return error;
}
void
in_delayed_cksum(struct mbuf *m)
{
struct ip *ip;
u_short csum, offset;
ip = mtod(m, struct ip *);
offset = ip->ip_hl << 2 ;
csum = in_cksum_skip(m, ip->ip_len, offset);
if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
csum = 0xffff;
offset += m->m_pkthdr.csum_data; /* checksum offset */
if (offset + sizeof(u_short) > m->m_len) {
printf("delayed m_pullup, m->len: %d off: %d p: %d\n",
m->m_len, offset, ip->ip_p);
/*
* XXX
* this shouldn't happen, but if it does, the
* correct behavior may be to insert the checksum
* in the appropriate next mbuf in the chain.
*/
return;
}
*(u_short *)(m->m_data + offset) = csum;
}
/*
* IP socket option processing.
*/
int
ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
int error, optval;
error = optval = 0;
if (sopt->sopt_level != IPPROTO_IP) {
if ((sopt->sopt_level == SOL_SOCKET) &&
(sopt->sopt_name == SO_SETFIB)) {
inp->inp_inc.inc_fibnum = so->so_fibnum;
return (0);
}
return (EINVAL);
}
switch (sopt->sopt_dir) {
case SOPT_SET:
switch (sopt->sopt_name) {
case IP_OPTIONS:
#ifdef notyet
case IP_RETOPTS:
#endif
{
struct mbuf *m;
if (sopt->sopt_valsize > MLEN) {
error = EMSGSIZE;
break;
}
MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
break;
}
m->m_len = sopt->sopt_valsize;
error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
m->m_len);
if (error) {
m_free(m);
break;
}
INP_WLOCK(inp);
error = ip_pcbopts(inp, sopt->sopt_name, m);
INP_WUNLOCK(inp);
return (error);
}
case IP_BINDANY:
if (sopt->sopt_td != NULL) {
error = priv_check(sopt->sopt_td,
PRIV_NETINET_BINDANY);
if (error)
break;
}
/* FALLTHROUGH */
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
switch (sopt->sopt_name) {
case IP_TOS:
inp->inp_ip_tos = optval;
break;
case IP_TTL:
inp->inp_ip_ttl = optval;
break;
case IP_MINTTL:
if (optval >= 0 && optval <= MAXTTL)
inp->inp_ip_minttl = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) do { \
INP_WLOCK(inp); \
if (optval) \
inp->inp_flags |= bit; \
else \
inp->inp_flags &= ~bit; \
INP_WUNLOCK(inp); \
} while (0)
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_FAITH:
OPTSET(INP_FAITH);
break;
case IP_ONESBCAST:
OPTSET(INP_ONESBCAST);
break;
case IP_DONTFRAG:
OPTSET(INP_DONTFRAG);
break;
case IP_BINDANY:
OPTSET(INP_BINDANY);
break;
}
break;
#undef OPTSET
/*
* Multicast socket options are processed by the in_mcast
* module.
*/
case IP_MULTICAST_IF:
case IP_MULTICAST_VIF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
case IP_ADD_SOURCE_MEMBERSHIP:
case IP_DROP_SOURCE_MEMBERSHIP:
case IP_BLOCK_SOURCE:
case IP_UNBLOCK_SOURCE:
case IP_MSFILTER:
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = inp_setmoptions(inp, sopt);
break;
case IP_PORTRANGE:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
INP_WLOCK(inp);
switch (optval) {
case IP_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags &= ~(INP_HIGHPORT);
break;
case IP_PORTRANGE_HIGH:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags |= INP_HIGHPORT;
break;
case IP_PORTRANGE_LOW:
inp->inp_flags &= ~(INP_HIGHPORT);
inp->inp_flags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
INP_WUNLOCK(inp);
break;
#ifdef IPSEC
case IP_IPSEC_POLICY:
{
caddr_t req;
struct mbuf *m;
if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
break;
if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
break;
req = mtod(m, caddr_t);
error = ipsec_set_policy(inp, sopt->sopt_name, req,
m->m_len, (sopt->sopt_td != NULL) ?
sopt->sopt_td->td_ucred : NULL);
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case SOPT_GET:
switch (sopt->sopt_name) {
case IP_OPTIONS:
case IP_RETOPTS:
if (inp->inp_options)
error = sooptcopyout(sopt,
mtod(inp->inp_options,
char *),
inp->inp_options->m_len);
else
sopt->sopt_valsize = 0;
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
case IP_PORTRANGE:
case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
switch (sopt->sopt_name) {
case IP_TOS:
optval = inp->inp_ip_tos;
break;
case IP_TTL:
optval = inp->inp_ip_ttl;
break;
case IP_MINTTL:
optval = inp->inp_ip_minttl;
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_PORTRANGE:
if (inp->inp_flags & INP_HIGHPORT)
optval = IP_PORTRANGE_HIGH;
else if (inp->inp_flags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = 0;
break;
case IP_FAITH:
optval = OPTBIT(INP_FAITH);
break;
case IP_ONESBCAST:
optval = OPTBIT(INP_ONESBCAST);
break;
case IP_DONTFRAG:
optval = OPTBIT(INP_DONTFRAG);
break;
}
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
/*
* Multicast socket options are processed by the in_mcast
* module.
*/
case IP_MULTICAST_IF:
case IP_MULTICAST_VIF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_MSFILTER:
error = inp_getmoptions(inp, sopt);
break;
#ifdef IPSEC
case IP_IPSEC_POLICY:
{
struct mbuf *m = NULL;
caddr_t req = NULL;
size_t len = 0;
if (m != 0) {
req = mtod(m, caddr_t);
len = m->m_len;
}
error = ipsec_get_policy(sotoinpcb(so), req, len, &m);
if (error == 0)
error = soopt_mcopyout(sopt, m); /* XXX */
if (error == 0)
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be a loopback interface -- evil, but easier than
* replicating that code here.
*/
static void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
int hlen)
{
register struct ip *ip;
struct mbuf *copym;
/*
* Make a deep copy of the packet because we're going to
* modify the pack in order to generate checksums.
*/
copym = m_dup(m, M_DONTWAIT);
if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
copym = m_pullup(copym, hlen);
if (copym != NULL) {
/* If needed, compute the checksum and mark it as valid. */
if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(copym);
copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
copym->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
copym->m_pkthdr.csum_data = 0xffff;
}
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, hlen);
#if 1 /* XXX */
if (dst->sin_family != AF_INET) {
printf("ip_mloopback: bad address family %d\n",
dst->sin_family);
dst->sin_family = AF_INET;
}
#endif
if_simloop(ifp, copym, dst->sin_family, 0);
}
}
Index: stable/8/sys
===================================================================
--- stable/8/sys (revision 209276)
+++ stable/8/sys (revision 209277)
Property changes on: stable/8/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /head/sys:r208553
File Metadata
Details
Attached
Mime Type
text/x-c
Expires
Sun, Mar 29, 12:56 AM (2 d)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
30450978
Default Alt Text
(350 KB)
Attached To
Mode
rS FreeBSD src repository - subversion
Attached
Detach File
Event Timeline
Log In to Comment