Index: head/sys/compat/linprocfs/linprocfs.c
===================================================================
--- head/sys/compat/linprocfs/linprocfs.c (revision 183549)
+++ head/sys/compat/linprocfs/linprocfs.c (revision 183550)
@@ -1,1278 +1,1279 @@
/*-
* Copyright (c) 2000 Dag-Erling Coïdan Smørgrav
* Copyright (c) 1999 Pierre Beyssac
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_status.c 8.4 (Berkeley) 6/15/94
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/blist.h>
#include <sys/conf.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/msg.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/sem.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/tty.h>
#include <sys/user.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#include <machine/clock.h>
#if defined(__i386__) || defined(__amd64__)
#include <machine/cputypes.h>
#include <machine/md_var.h>
#endif /* __i386__ || __amd64__ */
#ifdef COMPAT_LINUX32 /* XXX */
#include <machine/../linux32/linux.h>
#else
#include <machine/../linux/linux.h>
#endif
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_util.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
/*
* Various conversion macros
*/
#define T2J(x) (((x) * 100UL) / (stathz ? stathz : hz)) /* ticks to jiffies */
#define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */
#define B2K(x) ((x) >> 10) /* bytes to kbytes */
#define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */
#define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */
#define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */
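/*
 * Example: with stathz == 128, T2J(64) == 64 * 100 / 128 == 50, i.e.
 * half a second's worth of ticks becomes 50 jiffies, matching the
 * traditional Linux rate of 100 jiffies per second.
 */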
/**
* @brief Mapping of ki_stat in struct kinfo_proc to the linux state
*
* The linux procfs state field displays one of the characters RSDZTW to
* denote running, sleeping in an interruptible wait, waiting in an
* uninterruptible disk sleep, a zombie process, process is being traced
* or stopped, or process is paging respectively.
*
* Our struct kinfo_proc contains the variable ki_stat which contains a
* value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK.
*
* This character array is indexed with ki_stat - 1 and maps our
* states to suitable Linux states.
*/
static char linux_state[] = "RRSTZDD";
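/*
 * Indexed with ki_stat - 1: SIDL -> 'R', SRUN -> 'R', SSLEEP -> 'S',
 * SSTOP -> 'T', SZOMB -> 'Z', SWAIT -> 'D', SLOCK -> 'D'.
 */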
/*
* Filler function for proc/meminfo
*/
static int
linprocfs_domeminfo(PFS_FILL_ARGS)
{
unsigned long memtotal; /* total memory in bytes */
unsigned long memused; /* used memory in bytes */
unsigned long memfree; /* free memory in bytes */
unsigned long memshared; /* shared memory ??? */
unsigned long buffers, cached; /* buffer / cache memory ??? */
unsigned long long swaptotal; /* total swap space in bytes */
unsigned long long swapused; /* used swap space in bytes */
unsigned long long swapfree; /* free swap space in bytes */
vm_object_t object;
int i, j;
memtotal = physmem * PAGE_SIZE;
/*
* The correct thing here would be:
*
memfree = cnt.v_free_count * PAGE_SIZE;
memused = memtotal - memfree;
*
* but it might mislead linux binaries into thinking there
* is very little memory left, so we cheat and tell them that
* all memory that isn't wired down is free.
*/
memused = cnt.v_wire_count * PAGE_SIZE;
memfree = memtotal - memused;
swap_pager_status(&i, &j);
swaptotal = (unsigned long long)i * PAGE_SIZE;
swapused = (unsigned long long)j * PAGE_SIZE;
swapfree = swaptotal - swapused;
memshared = 0;
mtx_lock(&vm_object_list_mtx);
TAILQ_FOREACH(object, &vm_object_list, object_list)
if (object->shadow_count > 1)
memshared += object->resident_page_count;
mtx_unlock(&vm_object_list_mtx);
memshared *= PAGE_SIZE;
/*
* We'd love to be able to write:
*
buffers = bufspace;
*
* but bufspace is internal to vfs_bio.c and we don't feel
* like unstaticizing it just for linprocfs's sake.
*/
buffers = 0;
cached = cnt.v_cache_count * PAGE_SIZE;
sbuf_printf(sb,
" total: used: free: shared: buffers: cached:\n"
"Mem: %lu %lu %lu %lu %lu %lu\n"
"Swap: %llu %llu %llu\n"
"MemTotal: %9lu kB\n"
"MemFree: %9lu kB\n"
"MemShared:%9lu kB\n"
"Buffers: %9lu kB\n"
"Cached: %9lu kB\n"
"SwapTotal:%9llu kB\n"
"SwapFree: %9llu kB\n",
memtotal, memused, memfree, memshared, buffers, cached,
swaptotal, swapused, swapfree,
B2K(memtotal), B2K(memfree),
B2K(memshared), B2K(buffers), B2K(cached),
B2K(swaptotal), B2K(swapfree));
return (0);
}
#if defined(__i386__) || defined(__amd64__)
/*
* Filler function for proc/cpuinfo (i386 & amd64 version)
*/
static int
linprocfs_docpuinfo(PFS_FILL_ARGS)
{
int hw_model[2];
char model[128];
size_t size;
int class, fqmhz, fqkhz;
int i;
/*
* We default the flags to include all non-conflicting flags,
* and the Intel versions of conflicting flags.
*/
static char *flags[] = {
"fpu", "vme", "de", "pse", "tsc",
"msr", "pae", "mce", "cx8", "apic",
"sep", "sep", "mtrr", "pge", "mca",
"cmov", "pat", "pse36", "pn", "b19",
"b20", "b21", "mmxext", "mmx", "fxsr",
"xmm", "sse2", "b27", "b28", "b29",
"3dnowext", "3dnow"
};
switch (cpu_class) {
#ifdef __i386__
case CPUCLASS_286:
class = 2;
break;
case CPUCLASS_386:
class = 3;
break;
case CPUCLASS_486:
class = 4;
break;
case CPUCLASS_586:
class = 5;
break;
case CPUCLASS_686:
class = 6;
break;
default:
class = 0;
break;
#else /* __amd64__ */
default:
class = 15;
break;
#endif
}
hw_model[0] = CTL_HW;
hw_model[1] = HW_MODEL;
model[0] = '\0';
size = sizeof(model);
if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0)
strcpy(model, "unknown");
for (i = 0; i < mp_ncpus; ++i) {
sbuf_printf(sb,
"processor\t: %d\n"
"vendor_id\t: %.20s\n"
"cpu family\t: %d\n"
"model\t\t: %d\n"
"model name\t: %s\n"
"stepping\t: %d\n",
i, cpu_vendor, class, cpu, model, cpu_id & 0xf);
/* XXX per-cpu vendor / class / model / id? */
}
sbuf_cat(sb,
"flags\t\t:");
if (!strcmp(cpu_vendor, "AuthenticAMD") && (class < 6)) {
flags[16] = "fcmov";
} else if (!strcmp(cpu_vendor, "CyrixInstead")) {
flags[24] = "cxmmx";
}
for (i = 0; i < 32; i++)
if (cpu_feature & (1 << i))
sbuf_printf(sb, " %s", flags[i]);
sbuf_cat(sb, "\n");
if (class >= 5) {
fqmhz = (tsc_freq + 4999) / 1000000;
fqkhz = ((tsc_freq + 4999) / 10000) % 100;
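/*
 * Example: tsc_freq == 2400000000 (2.4 GHz) gives fqmhz == 2400 and
 * fqkhz == 0, printed as "2400.00" for both cpu MHz and bogomips.
 */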
sbuf_printf(sb,
"cpu MHz\t\t: %d.%02d\n"
"bogomips\t: %d.%02d\n",
fqmhz, fqkhz, fqmhz, fqkhz);
}
return (0);
}
#endif /* __i386__ || __amd64__ */
/*
* Filler function for proc/mtab
*
* This file doesn't exist in Linux's procfs, but is included here so
* users can symlink /compat/linux/etc/mtab to /proc/mtab
*/
static int
linprocfs_domtab(PFS_FILL_ARGS)
{
struct nameidata nd;
struct mount *mp;
const char *lep;
char *dlep, *flep, *mntto, *mntfrom, *fstype;
size_t lep_len;
int error;
/* resolve symlinks etc. in the emulation tree prefix */
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, linux_emul_path, td);
flep = NULL;
error = namei(&nd);
VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
if (error != 0 || vn_fullpath(td, nd.ni_vp, &dlep, &flep) != 0)
lep = linux_emul_path;
else
lep = dlep;
lep_len = strlen(lep);
mtx_lock(&mountlist_mtx);
error = 0;
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
/* determine device name */
mntfrom = mp->mnt_stat.f_mntfromname;
/* determine mount point */
mntto = mp->mnt_stat.f_mntonname;
if (strncmp(mntto, lep, lep_len) == 0 &&
mntto[lep_len] == '/')
mntto += lep_len;
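/* e.g. "/compat/linux/proc" is reported to Linux binaries as "/proc" */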
/* determine fs type */
fstype = mp->mnt_stat.f_fstypename;
if (strcmp(fstype, pn->pn_info->pi_name) == 0)
mntfrom = fstype = "proc";
else if (strcmp(fstype, "procfs") == 0)
continue;
if (strcmp(fstype, "linsysfs") == 0) {
sbuf_printf(sb, "/sys %s sysfs %s", mntto,
mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw");
} else {
sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype,
mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw");
}
#define ADD_OPTION(opt, name) \
if (mp->mnt_stat.f_flags & (opt)) sbuf_printf(sb, "," name);
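/* Append ",name" when the given MNT_* flag is set on this mount. */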
ADD_OPTION(MNT_SYNCHRONOUS, "sync");
ADD_OPTION(MNT_NOEXEC, "noexec");
ADD_OPTION(MNT_NOSUID, "nosuid");
ADD_OPTION(MNT_UNION, "union");
ADD_OPTION(MNT_ASYNC, "async");
ADD_OPTION(MNT_SUIDDIR, "suiddir");
ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow");
ADD_OPTION(MNT_NOATIME, "noatime");
#undef ADD_OPTION
/* a real Linux mtab will also show NFS options */
sbuf_printf(sb, " 0 0\n");
}
mtx_unlock(&mountlist_mtx);
if (flep != NULL)
free(flep, M_TEMP);
return (error);
}
/*
* Filler function for proc/stat
*/
static int
linprocfs_dostat(PFS_FILL_ARGS)
{
struct pcpu *pcpu;
long cp_time[CPUSTATES];
long *cp;
int i;
read_cpu_time(cp_time);
sbuf_printf(sb, "cpu %ld %ld %ld %ld\n",
T2J(cp_time[CP_USER]),
T2J(cp_time[CP_NICE]),
T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/),
T2J(cp_time[CP_IDLE]));
for (i = 0; i <= mp_maxid; ++i) {
if (CPU_ABSENT(i))
continue;
pcpu = pcpu_find(i);
cp = pcpu->pc_cp_time;
sbuf_printf(sb, "cpu%d %ld %ld %ld %ld\n", i,
T2J(cp[CP_USER]),
T2J(cp[CP_NICE]),
T2J(cp[CP_SYS] /*+ cp[CP_INTR]*/),
T2J(cp[CP_IDLE]));
}
sbuf_printf(sb,
"disk 0 0 0 0\n"
"page %u %u\n"
"swap %u %u\n"
"intr %u\n"
"ctxt %u\n"
"btime %lld\n",
cnt.v_vnodepgsin,
cnt.v_vnodepgsout,
cnt.v_swappgsin,
cnt.v_swappgsout,
cnt.v_intr,
cnt.v_swtch,
(long long)boottime.tv_sec);
return (0);
}
/*
* Filler function for proc/uptime
*/
static int
linprocfs_douptime(PFS_FILL_ARGS)
{
long cp_time[CPUSTATES];
struct timeval tv;
getmicrouptime(&tv);
read_cpu_time(cp_time);
sbuf_printf(sb, "%lld.%02ld %ld.%02ld\n",
(long long)tv.tv_sec, tv.tv_usec / 10000,
T2S(cp_time[CP_IDLE]), T2J(cp_time[CP_IDLE]) % 100);
return (0);
}
/*
* Get OS build date
*/
static void
linprocfs_osbuild(struct thread *td, struct sbuf *sb)
{
#if 0
char osbuild[256];
char *cp1, *cp2;
strncpy(osbuild, version, 256);
osbuild[255] = '\0';
cp1 = strstr(osbuild, "\n");
cp2 = strstr(osbuild, ":");
if (cp1 && cp2) {
*cp1 = *cp2 = '\0';
cp1 = strstr(osbuild, "#");
} else
cp1 = NULL;
if (cp1)
sbuf_printf(sb, "%s%s", cp1, cp2 + 1);
else
#endif
sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977");
}
/*
* Get OS builder
*/
static void
linprocfs_osbuilder(struct thread *td, struct sbuf *sb)
{
#if 0
char builder[256];
char *cp;
cp = strstr(version, "\n ");
if (cp) {
strncpy(builder, cp + 5, 256);
builder[255] = '\0';
cp = strstr(builder, ":");
if (cp)
*cp = '\0';
}
if (cp)
sbuf_cat(sb, builder);
else
#endif
sbuf_cat(sb, "des@freebsd.org");
}
/*
* Filler function for proc/version
*/
static int
linprocfs_doversion(PFS_FILL_ARGS)
{
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
linux_get_osname(td, osname);
linux_get_osrelease(td, osrelease);
sbuf_printf(sb, "%s version %s (", osname, osrelease);
linprocfs_osbuilder(td, sb);
sbuf_cat(sb, ") (gcc version " __VERSION__ ") ");
linprocfs_osbuild(td, sb);
sbuf_cat(sb, "\n");
return (0);
}
/*
* Filler function for proc/loadavg
*/
static int
linprocfs_doloadavg(PFS_FILL_ARGS)
{
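/*
 * averunnable.ldavg[] is fixed-point with scale factor fscale; e.g.
 * ldavg == 1228 and fscale == 1024 print as "1.19", since
 * 1228 / 1024 == 1 and 1228 * 100 / 1024 % 100 == 19.
 */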
sbuf_printf(sb,
"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
(int)(averunnable.ldavg[0] / averunnable.fscale),
(int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[1] / averunnable.fscale),
(int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100),
(int)(averunnable.ldavg[2] / averunnable.fscale),
(int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100),
1, /* number of running tasks */
nprocs, /* number of tasks */
lastpid /* the last pid */
);
return (0);
}
/*
* Filler function for proc/pid/stat
*/
static int
linprocfs_doprocstat(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
char state;
static int ratelimit = 0;
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
sbuf_printf(sb, "%d", p->p_pid);
#define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg)
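/* The name argument is never expanded; it is in-source documentation only. */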
PS_ADD("comm", "(%s)", p->p_comm);
if (kp.ki_stat > sizeof(linux_state)) {
state = 'R';
if (ratelimit == 0) {
printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n",
kp.ki_stat, sizeof(linux_state));
++ratelimit;
}
} else
state = linux_state[kp.ki_stat - 1];
PS_ADD("state", "%c", state);
PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0);
PS_ADD("pgrp", "%d", p->p_pgid);
PS_ADD("session", "%d", p->p_session->s_sid);
PROC_UNLOCK(p);
PS_ADD("tty", "%d", 0); /* XXX */
PS_ADD("tpgid", "%d", kp.ki_tpgid);
PS_ADD("flags", "%u", 0); /* XXX */
PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt);
PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt);
PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt);
PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt);
PS_ADD("utime", "%ld", T2J(tvtohz(&kp.ki_rusage.ru_utime)));
PS_ADD("stime", "%ld", T2J(tvtohz(&kp.ki_rusage.ru_stime)));
PS_ADD("cutime", "%ld", T2J(tvtohz(&kp.ki_rusage_ch.ru_utime)));
PS_ADD("cstime", "%ld", T2J(tvtohz(&kp.ki_rusage_ch.ru_stime)));
PS_ADD("priority", "%d", kp.ki_pri.pri_user);
PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */
PS_ADD("0", "%d", 0); /* removed field */
PS_ADD("itrealvalue", "%d", 0); /* XXX */
/* XXX: starttime is not right, it is the _same_ for _every_ process.
It should be the number of jiffies between system boot and process
start. */
PS_ADD("starttime", "%lu", T2J(tvtohz(&kp.ki_start)));
PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size));
PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize);
PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss);
PS_ADD("startcode", "%u", (unsigned)0);
PS_ADD("endcode", "%u", 0); /* XXX */
PS_ADD("startstack", "%u", 0); /* XXX */
PS_ADD("kstkesp", "%u", 0); /* XXX */
PS_ADD("kstkeip", "%u", 0); /* XXX */
PS_ADD("signal", "%u", 0); /* XXX */
PS_ADD("blocked", "%u", 0); /* XXX */
PS_ADD("sigignore", "%u", 0); /* XXX */
PS_ADD("sigcatch", "%u", 0); /* XXX */
PS_ADD("wchan", "%u", 0); /* XXX */
PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap);
PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap);
PS_ADD("exitsignal", "%d", 0); /* XXX */
PS_ADD("processor", "%u", kp.ki_lastcpu);
PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */
PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */
#undef PS_ADD
sbuf_putc(sb, '\n');
return (0);
}
/*
* Filler function for proc/pid/statm
*/
static int
linprocfs_doprocstatm(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
segsz_t lsize;
PROC_LOCK(p);
fill_kinfo_proc(p, &kp);
PROC_UNLOCK(p);
/*
* See comments in linprocfs_doprocstatus() regarding the
* computation of lsize.
*/
/* size resident share trs drs lrs dt */
sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size));
sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize);
sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */
sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize);
sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize));
lsize = B2P(kp.ki_size) - kp.ki_dsize -
kp.ki_ssize - kp.ki_tsize - 1;
sbuf_printf(sb, "%ju ", (uintmax_t)lsize);
sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */
return (0);
}
/*
* Filler function for proc/pid/status
*/
static int
linprocfs_doprocstatus(PFS_FILL_ARGS)
{
struct kinfo_proc kp;
char *state;
segsz_t lsize;
struct thread *td2;
struct sigacts *ps;
int i;
PROC_LOCK(p);
td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */
if (P_SHOULDSTOP(p)) {
state = "T (stopped)";
} else {
PROC_SLOCK(p);
switch(p->p_state) {
case PRS_NEW:
state = "I (idle)";
break;
case PRS_NORMAL:
if (p->p_flag & P_WEXIT) {
state = "X (exiting)";
break;
}
switch(td2->td_state) {
case TDS_INHIBITED:
state = "S (sleeping)";
break;
case TDS_RUNQ:
case TDS_RUNNING:
state = "R (running)";
break;
default:
state = "? (unknown)";
break;
}
break;
case PRS_ZOMBIE:
state = "Z (zombie)";
break;
default:
state = "? (unknown)";
break;
}
PROC_SUNLOCK(p);
}
fill_kinfo_proc(p, &kp);
sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */
sbuf_printf(sb, "State:\t%s\n", state);
/*
* Credentials
*/
sbuf_printf(sb, "Pid:\t%d\n", p->p_pid);
sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ?
p->p_pptr->p_pid : 0);
sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid,
p->p_ucred->cr_uid,
p->p_ucred->cr_svuid,
/* FreeBSD doesn't have fsuid */
p->p_ucred->cr_uid);
sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid,
p->p_ucred->cr_gid,
p->p_ucred->cr_svgid,
/* FreeBSD doesn't have fsgid */
p->p_ucred->cr_gid);
sbuf_cat(sb, "Groups:\t");
for (i = 0; i < p->p_ucred->cr_ngroups; i++)
sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]);
PROC_UNLOCK(p);
sbuf_putc(sb, '\n');
/*
* Memory
*
* While our approximation of VmLib may not be accurate (I
* don't know of a simple way to verify it, and I'm not sure
* it has much meaning anyway), I believe it's good enough.
*
* The same code that could (I think) accurately compute VmLib
* could also compute VmLck, but I don't really care enough to
* implement it. Submissions are welcome.
*/
sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size));
sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */
sbuf_printf(sb, "VmRss:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize));
sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize));
sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize));
sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize));
lsize = B2P(kp.ki_size) - kp.ki_dsize -
kp.ki_ssize - kp.ki_tsize - 1;
sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize));
/*
* Signal masks
*
* We support up to 128 signals, while Linux supports 32,
* but we only define 32 (the same 32 as Linux, to boot), so
* just show the lower 32 bits of each mask. XXX hack.
*
* NB: on certain platforms (Sparc at least) Linux actually
* supports 64 signals, but this code is a long way from
* running on anything but i386, so ignore that for now.
*/
PROC_LOCK(p);
sbuf_printf(sb, "SigPnd:\t%08x\n", p->p_siglist.__bits[0]);
/*
* I can't seem to find out where the signal mask is in
* relation to struct proc, so SigBlk is left unimplemented.
*/
sbuf_printf(sb, "SigBlk:\t%08x\n", 0); /* XXX */
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
sbuf_printf(sb, "SigIgn:\t%08x\n", ps->ps_sigignore.__bits[0]);
sbuf_printf(sb, "SigCgt:\t%08x\n", ps->ps_sigcatch.__bits[0]);
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
/*
* Linux also prints the capability masks, but we don't have
* capabilities yet, and when we do get them they're likely to
* be meaningless to Linux programs, so we lie. XXX
*/
sbuf_printf(sb, "CapInh:\t%016x\n", 0);
sbuf_printf(sb, "CapPrm:\t%016x\n", 0);
sbuf_printf(sb, "CapEff:\t%016x\n", 0);
return (0);
}
/*
* Filler function for proc/pid/cwd
*/
static int
linprocfs_doproccwd(PFS_FILL_ARGS)
{
char *fullpath = "unknown";
char *freepath = NULL;
vn_fullpath(td, p->p_fd->fd_cdir, &fullpath, &freepath);
sbuf_printf(sb, "%s", fullpath);
if (freepath)
free(freepath, M_TEMP);
return (0);
}
/*
* Filler function for proc/pid/root
*/
static int
linprocfs_doprocroot(PFS_FILL_ARGS)
{
struct vnode *rvp;
char *fullpath = "unknown";
char *freepath = NULL;
rvp = jailed(p->p_ucred) ? p->p_fd->fd_jdir : p->p_fd->fd_rdir;
vn_fullpath(td, rvp, &fullpath, &freepath);
sbuf_printf(sb, "%s", fullpath);
if (freepath)
free(freepath, M_TEMP);
return (0);
}
/*
* Filler function for proc/pid/cmdline
*/
static int
linprocfs_doproccmdline(PFS_FILL_ARGS)
{
struct ps_strings pstr;
char **ps_argvstr;
int error, i;
/*
* If we are using the ps/cmdline caching, use that. Otherwise
* fall back to the old way, which only implements full cmdline
* for the current process and just p->p_comm for all other
* processes.
* Note that if the argv is no longer available, we deliberately
* don't fall back on p->p_comm or return an error: the authentic
* Linux behaviour is to return zero-length in this case.
*/
PROC_LOCK(p);
if (p->p_args && p_cansee(td, p) == 0) {
sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length);
PROC_UNLOCK(p);
} else if (p != td->td_proc) {
PROC_UNLOCK(p);
sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm);
} else {
PROC_UNLOCK(p);
error = copyin((void *)p->p_sysent->sv_psstrings, &pstr,
sizeof(pstr));
if (error)
return (error);
if (pstr.ps_nargvstr > ARG_MAX)
return (E2BIG);
ps_argvstr = malloc(pstr.ps_nargvstr * sizeof(char *),
M_TEMP, M_WAITOK);
error = copyin((void *)pstr.ps_argvstr, ps_argvstr,
pstr.ps_nargvstr * sizeof(char *));
if (error) {
free(ps_argvstr, M_TEMP);
return (error);
}
for (i = 0; i < pstr.ps_nargvstr; i++) {
sbuf_copyin(sb, ps_argvstr[i], 0);
sbuf_printf(sb, "%c", '\0');
}
free(ps_argvstr, M_TEMP);
}
return (0);
}
/*
* Filler function for proc/pid/environ
*/
static int
linprocfs_doprocenviron(PFS_FILL_ARGS)
{
sbuf_printf(sb, "doprocenviron\n%c", '\0');
return (0);
}
/*
* Filler function for proc/pid/maps
*/
static int
linprocfs_doprocmaps(PFS_FILL_ARGS)
{
char mebuffer[512];
vm_map_t map = &p->p_vmspace->vm_map;
vm_map_entry_t entry, tmp_entry;
vm_object_t obj, tobj, lobj;
vm_offset_t saved_end;
vm_ooffset_t off = 0;
char *name = "", *freename = NULL;
size_t len;
ino_t ino;
unsigned int last_timestamp;
int ref_count, shadow_count, flags;
int error;
struct vnode *vp;
struct vattr vat;
int locked;
PROC_LOCK(p);
error = p_candebug(td, p);
PROC_UNLOCK(p);
if (error)
return (error);
if (uio->uio_rw != UIO_READ)
return (EOPNOTSUPP);
if (uio->uio_offset != 0)
return (0);
error = 0;
vm_map_lock_read(map);
for (entry = map->header.next;
((uio->uio_resid > 0) && (entry != &map->header));
entry = entry->next) {
name = "";
freename = NULL;
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
saved_end = entry->end;
obj = entry->object.vm_object;
for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
ino = 0;
if (lobj) {
off = IDX_TO_OFF(lobj->size);
if (lobj->type == OBJT_VNODE) {
vp = lobj->handle;
if (vp)
vref(vp);
}
else
vp = NULL;
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
flags = obj->flags;
ref_count = obj->ref_count;
shadow_count = obj->shadow_count;
VM_OBJECT_UNLOCK(obj);
if (vp) {
vn_fullpath(td, vp, &name, &freename);
locked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
VOP_GETATTR(vp, &vat, td->td_ucred);
ino = vat.va_fileid;
vput(vp);
VFS_UNLOCK_GIANT(locked);
}
} else {
flags = 0;
ref_count = 0;
shadow_count = 0;
}
/*
* format:
* start, end, access, offset, major, minor, inode, name.
*/
snprintf(mebuffer, sizeof mebuffer,
"%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n",
(u_long)entry->start, (u_long)entry->end,
(entry->protection & VM_PROT_READ)?"r":"-",
(entry->protection & VM_PROT_WRITE)?"w":"-",
(entry->protection & VM_PROT_EXECUTE)?"x":"-",
"p",
(u_long)off,
0,
0,
(u_long)ino,
*name ? " " : "",
name
);
if (freename)
free(freename, M_TEMP);
len = strlen(mebuffer);
if (len > uio->uio_resid)
len = uio->uio_resid; /*
* XXX We should probably return
* EFBIG here, as in procfs.
*/
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
error = uiomove(mebuffer, len, uio);
vm_map_lock_read(map);
if (error)
break;
if (last_timestamp + 1 != map->timestamp) {
/*
* Look again for the entry because the map was
* modified while it was unlocked. Specifically,
* the entry may have been clipped, merged, or deleted.
*/
vm_map_lookup_entry(map, saved_end - 1, &tmp_entry);
entry = tmp_entry;
}
}
vm_map_unlock_read(map);
return (error);
}
/*
* Filler function for proc/net/dev
*/
static int
linprocfs_donetdev(PFS_FILL_ARGS)
{
+ INIT_VNET_NET(TD_TO_VNET(curthread));
char ifname[16]; /* XXX LINUX_IFNAMSIZ */
struct ifnet *ifp;
sbuf_printf(sb, "%6s|%58s|%s\n%6s|%58s|%58s\n",
"Inter-", " Receive", " Transmit", " face",
"bytes packets errs drop fifo frame compressed",
"bytes packets errs drop fifo frame compressed");
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
linux_ifname(ifp, ifname, sizeof ifname);
sbuf_printf(sb, "%6.6s:", ifname);
sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ",
0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL);
}
IFNET_RUNLOCK();
return (0);
}
/*
* Filler function for proc/sys/kernel/osrelease
*/
static int
linprocfs_doosrelease(PFS_FILL_ARGS)
{
char osrelease[LINUX_MAX_UTSNAME];
linux_get_osrelease(td, osrelease);
sbuf_printf(sb, "%s\n", osrelease);
return (0);
}
/*
* Filler function for proc/sys/kernel/ostype
*/
static int
linprocfs_doostype(PFS_FILL_ARGS)
{
char osname[LINUX_MAX_UTSNAME];
linux_get_osname(td, osname);
sbuf_printf(sb, "%s\n", osname);
return (0);
}
/*
* Filler function for proc/sys/kernel/version
*/
static int
linprocfs_doosbuild(PFS_FILL_ARGS)
{
linprocfs_osbuild(td, sb);
sbuf_cat(sb, "\n");
return (0);
}
/*
* Filler function for proc/sys/kernel/msgmni
*/
static int
linprocfs_domsgmni(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d\n", msginfo.msgmni);
return (0);
}
/*
* Filler function for proc/sys/kernel/pid_max
*/
static int
linprocfs_dopid_max(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%i\n", PID_MAX);
return (0);
}
/*
* Filler function for proc/sys/kernel/sem
*/
static int
linprocfs_dosem(PFS_FILL_ARGS)
{
sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns,
seminfo.semopm, seminfo.semmni);
return (0);
}
/*
* Filler function for proc/scsi/device_info
*/
static int
linprocfs_doscsidevinfo(PFS_FILL_ARGS)
{
return (0);
}
/*
* Filler function for proc/scsi/scsi
*/
static int
linprocfs_doscsiscsi(PFS_FILL_ARGS)
{
return (0);
}
extern struct cdevsw *cdevsw[];
/*
* Filler function for proc/devices
*/
static int
linprocfs_dodevices(PFS_FILL_ARGS)
{
char *char_devices;
sbuf_printf(sb, "Character devices:\n");
char_devices = linux_get_char_devices();
sbuf_printf(sb, "%s", char_devices);
linux_free_get_char_devices(char_devices);
sbuf_printf(sb, "\nBlock devices:\n");
return (0);
}
/*
* Filler function for proc/cmdline
*/
static int
linprocfs_docmdline(PFS_FILL_ARGS)
{
sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname);
sbuf_printf(sb, " ro root=302\n");
return (0);
}
#if 0
/*
* Filler function for proc/modules
*/
static int
linprocfs_domodules(PFS_FILL_ARGS)
{
struct linker_file *lf;
TAILQ_FOREACH(lf, &linker_files, link) {
sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename,
(unsigned long)lf->size, lf->refs);
}
return (0);
}
#endif
/*
* Constructor
*/
static int
linprocfs_init(PFS_INIT_ARGS)
{
struct pfs_node *root;
struct pfs_node *dir;
root = pi->pi_root;
/* /proc/... */
pfs_create_file(root, "cmdline", &linprocfs_docmdline,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "devices", &linprocfs_dodevices,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "loadavg", &linprocfs_doloadavg,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "meminfo", &linprocfs_domeminfo,
NULL, NULL, NULL, PFS_RD);
#if 0
pfs_create_file(root, "modules", &linprocfs_domodules,
NULL, NULL, NULL, PFS_RD);
#endif
pfs_create_file(root, "mounts", &linprocfs_domtab,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "mtab", &linprocfs_domtab,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(root, "self", &procfs_docurproc,
NULL, NULL, NULL, 0);
pfs_create_file(root, "stat", &linprocfs_dostat,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "uptime", &linprocfs_douptime,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(root, "version", &linprocfs_doversion,
NULL, NULL, NULL, PFS_RD);
/* /proc/net/... */
dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0);
pfs_create_file(dir, "dev", &linprocfs_donetdev,
NULL, NULL, NULL, PFS_RD);
/* /proc/<pid>/... */
dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP);
pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(dir, "cwd", &linprocfs_doproccwd,
NULL, NULL, NULL, 0);
pfs_create_file(dir, "environ", &linprocfs_doprocenviron,
NULL, NULL, NULL, PFS_RD);
pfs_create_link(dir, "exe", &procfs_doprocfile,
NULL, &procfs_notsystem, NULL, 0);
pfs_create_file(dir, "maps", &linprocfs_doprocmaps,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "mem", &procfs_doprocmem,
&procfs_attr, &procfs_candebug, NULL, PFS_RDWR|PFS_RAW);
pfs_create_link(dir, "root", &linprocfs_doprocroot,
NULL, NULL, NULL, 0);
pfs_create_file(dir, "stat", &linprocfs_doprocstat,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "statm", &linprocfs_doprocstatm,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "status", &linprocfs_doprocstatus,
NULL, NULL, NULL, PFS_RD);
/* /proc/scsi/... */
dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0);
pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi,
NULL, NULL, NULL, PFS_RD);
/* /proc/sys/... */
dir = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0);
/* /proc/sys/kernel/... */
dir = pfs_create_dir(dir, "kernel", NULL, NULL, NULL, 0);
pfs_create_file(dir, "osrelease", &linprocfs_doosrelease,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "ostype", &linprocfs_doostype,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "version", &linprocfs_doosbuild,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "msgmni", &linprocfs_domsgmni,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "pid_max", &linprocfs_dopid_max,
NULL, NULL, NULL, PFS_RD);
pfs_create_file(dir, "sem", &linprocfs_dosem,
NULL, NULL, NULL, PFS_RD);
return (0);
}
/*
* Destructor
*/
static int
linprocfs_uninit(PFS_INIT_ARGS)
{
/* nothing to do, pseudofs will GC */
return (0);
}
PSEUDOFS(linprocfs, 1);
MODULE_DEPEND(linprocfs, linux, 1, 1, 1);
MODULE_DEPEND(linprocfs, procfs, 1, 1, 1);
MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1);
Index: head/sys/compat/linux/linux_ioctl.c
===================================================================
--- head/sys/compat/linux/linux_ioctl.c (revision 183549)
+++ head/sys/compat/linux/linux_ioctl.c (revision 183550)
@@ -1,2718 +1,2721 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/cdio.h>
#include <sys/dvdio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/consio.h>
#include <sys/ctype.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kbio.h>
#include <sys/kernel.h>
#include <sys/linker_set.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/soundcard.h>
#include <sys/stdint.h>
#include <sys/sx.h>
#include <sys/tty.h>
#include <sys/uio.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_util.h>
CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ);
static linux_ioctl_function_t linux_ioctl_cdrom;
static linux_ioctl_function_t linux_ioctl_vfat;
static linux_ioctl_function_t linux_ioctl_console;
static linux_ioctl_function_t linux_ioctl_hdio;
static linux_ioctl_function_t linux_ioctl_disk;
static linux_ioctl_function_t linux_ioctl_socket;
static linux_ioctl_function_t linux_ioctl_sound;
static linux_ioctl_function_t linux_ioctl_termio;
static linux_ioctl_function_t linux_ioctl_private;
static linux_ioctl_function_t linux_ioctl_drm;
static linux_ioctl_function_t linux_ioctl_sg;
static linux_ioctl_function_t linux_ioctl_special;
static struct linux_ioctl_handler cdrom_handler =
{ linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX };
static struct linux_ioctl_handler vfat_handler =
{ linux_ioctl_vfat, LINUX_IOCTL_VFAT_MIN, LINUX_IOCTL_VFAT_MAX };
static struct linux_ioctl_handler console_handler =
{ linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX };
static struct linux_ioctl_handler hdio_handler =
{ linux_ioctl_hdio, LINUX_IOCTL_HDIO_MIN, LINUX_IOCTL_HDIO_MAX };
static struct linux_ioctl_handler disk_handler =
{ linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX };
static struct linux_ioctl_handler socket_handler =
{ linux_ioctl_socket, LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX };
static struct linux_ioctl_handler sound_handler =
{ linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX };
static struct linux_ioctl_handler termio_handler =
{ linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX };
static struct linux_ioctl_handler private_handler =
{ linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX };
static struct linux_ioctl_handler drm_handler =
{ linux_ioctl_drm, LINUX_IOCTL_DRM_MIN, LINUX_IOCTL_DRM_MAX };
static struct linux_ioctl_handler sg_handler =
{ linux_ioctl_sg, LINUX_IOCTL_SG_MIN, LINUX_IOCTL_SG_MAX };
DATA_SET(linux_ioctl_handler_set, cdrom_handler);
DATA_SET(linux_ioctl_handler_set, vfat_handler);
DATA_SET(linux_ioctl_handler_set, console_handler);
DATA_SET(linux_ioctl_handler_set, hdio_handler);
DATA_SET(linux_ioctl_handler_set, disk_handler);
DATA_SET(linux_ioctl_handler_set, socket_handler);
DATA_SET(linux_ioctl_handler_set, sound_handler);
DATA_SET(linux_ioctl_handler_set, termio_handler);
DATA_SET(linux_ioctl_handler_set, private_handler);
DATA_SET(linux_ioctl_handler_set, drm_handler);
DATA_SET(linux_ioctl_handler_set, sg_handler);
struct handler_element
{
TAILQ_ENTRY(handler_element) list;
int (*func)(struct thread *, struct linux_ioctl_args *);
int low, high, span;
};
static TAILQ_HEAD(, handler_element) handlers =
TAILQ_HEAD_INITIALIZER(handlers);
static struct sx linux_ioctl_sx;
SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "linux ioctl handlers");
/*
* hdio related ioctls for VMWare support
*/
struct linux_hd_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int16_t cylinders;
u_int32_t start;
};
struct linux_hd_big_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int32_t cylinders;
u_int32_t start;
};
static int
linux_ioctl_hdio(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize, fwcylinders, fwheads, fwsectors;
off_t mediasize, bytespercyl;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_HDIO_GET_GEO:
case LINUX_HDIO_GET_GEO_BIG:
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWHEADS,
(caddr_t)&fwheads, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWSECTORS,
(caddr_t)&fwsectors, td->td_ucred, td);
/*
* XXX: DIOCGFIRSTOFFSET is not yet implemented, so
* pretend that GEOM always says 0. This is NOT VALID
* for slices or partitions, only the per-disk raw devices.
*/
fdrop(fp, td);
if (error)
return (error);
/*
* 1. Calculate the number of bytes in a cylinder,
* given the firmware's notion of heads and sectors
* per cylinder.
* 2. Calculate the number of cylinders, given the total
* size of the media.
* All internal calculations should have 64-bit precision.
*/
bytespercyl = (off_t) sectorsize * fwheads * fwsectors;
fwcylinders = mediasize / bytespercyl;
#if defined(DEBUG)
linux_msg(td, "HDIO_GET_GEO: mediasize %jd, c/h/s %d/%d/%d, "
"bpc %jd",
(intmax_t)mediasize, fwcylinders, fwheads, fwsectors,
(intmax_t)bytespercyl);
#endif
if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO) {
struct linux_hd_geometry hdg;
hdg.cylinders = fwcylinders;
hdg.heads = fwheads;
hdg.sectors = fwsectors;
hdg.start = 0;
error = copyout(&hdg, (void *)args->arg, sizeof(hdg));
} else if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO_BIG) {
struct linux_hd_big_geometry hdbg;
hdbg.cylinders = fwcylinders;
hdbg.heads = fwheads;
hdbg.sectors = fwsectors;
hdbg.start = 0;
error = copyout(&hdbg, (void *)args->arg, sizeof(hdbg));
}
return (error);
break;
default:
/* XXX */
linux_msg(td,
"ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented",
args->fd, (int)(args->cmd & 0xffff),
(int)(args->cmd & 0xff00) >> 8,
(int)(args->cmd & 0xff));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
static int
linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize;
off_t mediasize;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_BLKGETSIZE:
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
fdrop(fp, td);
if (error)
return (error);
sectorsize = mediasize / sectorsize;
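/*
 * sectorsize now holds the media size in device sectors; Linux
 * nominally counts 512-byte sectors, which matches the common case.
 */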
/*
* XXX: How do we know we return the right size of integer?
*/
return (copyout(&sectorsize, (void *)args->arg,
sizeof(sectorsize)));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
/*
* termio related ioctls
*/
struct linux_termio {
unsigned short c_iflag;
unsigned short c_oflag;
unsigned short c_cflag;
unsigned short c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCC];
};
struct linux_termios {
unsigned int c_iflag;
unsigned int c_oflag;
unsigned int c_cflag;
unsigned int c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCCS];
};
struct linux_winsize {
unsigned short ws_row, ws_col;
unsigned short ws_xpixel, ws_ypixel;
};
struct speedtab {
int sp_speed; /* Speed. */
int sp_code; /* Code. */
};
static struct speedtab sptab[] = {
{ B0, LINUX_B0 }, { B50, LINUX_B50 },
{ B75, LINUX_B75 }, { B110, LINUX_B110 },
{ B134, LINUX_B134 }, { B150, LINUX_B150 },
{ B200, LINUX_B200 }, { B300, LINUX_B300 },
{ B600, LINUX_B600 }, { B1200, LINUX_B1200 },
{ B1800, LINUX_B1800 }, { B2400, LINUX_B2400 },
{ B4800, LINUX_B4800 }, { B9600, LINUX_B9600 },
{ B19200, LINUX_B19200 }, { B38400, LINUX_B38400 },
{ B57600, LINUX_B57600 }, { B115200, LINUX_B115200 },
{-1, -1 }
};
struct linux_serial_struct {
int type;
int line;
int port;
int irq;
int flags;
int xmit_fifo_size;
int custom_divisor;
int baud_base;
unsigned short close_delay;
char reserved_char[2];
int hub6;
unsigned short closing_wait;
unsigned short closing_wait2;
int reserved[4];
};
static int
linux_to_bsd_speed(int code, struct speedtab *table)
{
for ( ; table->sp_code != -1; table++)
if (table->sp_code == code)
return (table->sp_speed);
return -1;
}
static int
bsd_to_linux_speed(int speed, struct speedtab *table)
{
for ( ; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return -1;
}
static void
bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios)
{
int i;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: BSD termios structure (input):\n");
printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n",
bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag,
bios->c_ispeed, bios->c_ospeed);
printf("c_cc ");
for (i=0; i<NCCS; i++)
printf("%02x ", bios->c_cc[i]);
printf("\n");
}
#endif
lios->c_iflag = 0;
if (bios->c_iflag & IGNBRK)
lios->c_iflag |= LINUX_IGNBRK;
if (bios->c_iflag & BRKINT)
lios->c_iflag |= LINUX_BRKINT;
if (bios->c_iflag & IGNPAR)
lios->c_iflag |= LINUX_IGNPAR;
if (bios->c_iflag & PARMRK)
lios->c_iflag |= LINUX_PARMRK;
if (bios->c_iflag & INPCK)
lios->c_iflag |= LINUX_INPCK;
if (bios->c_iflag & ISTRIP)
lios->c_iflag |= LINUX_ISTRIP;
if (bios->c_iflag & INLCR)
lios->c_iflag |= LINUX_INLCR;
if (bios->c_iflag & IGNCR)
lios->c_iflag |= LINUX_IGNCR;
if (bios->c_iflag & ICRNL)
lios->c_iflag |= LINUX_ICRNL;
if (bios->c_iflag & IXON)
lios->c_iflag |= LINUX_IXON;
if (bios->c_iflag & IXANY)
lios->c_iflag |= LINUX_IXANY;
if (bios->c_iflag & IXOFF)
lios->c_iflag |= LINUX_IXOFF;
if (bios->c_iflag & IMAXBEL)
lios->c_iflag |= LINUX_IMAXBEL;
lios->c_oflag = 0;
if (bios->c_oflag & OPOST)
lios->c_oflag |= LINUX_OPOST;
if (bios->c_oflag & ONLCR)
lios->c_oflag |= LINUX_ONLCR;
if (bios->c_oflag & TAB3)
lios->c_oflag |= LINUX_XTABS;
lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab);
lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4;
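/* BSD keeps CSIZE in bits 8-9, Linux in bits 4-5, hence the shift. */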
if (bios->c_cflag & CSTOPB)
lios->c_cflag |= LINUX_CSTOPB;
if (bios->c_cflag & CREAD)
lios->c_cflag |= LINUX_CREAD;
if (bios->c_cflag & PARENB)
lios->c_cflag |= LINUX_PARENB;
if (bios->c_cflag & PARODD)
lios->c_cflag |= LINUX_PARODD;
if (bios->c_cflag & HUPCL)
lios->c_cflag |= LINUX_HUPCL;
if (bios->c_cflag & CLOCAL)
lios->c_cflag |= LINUX_CLOCAL;
if (bios->c_cflag & CRTSCTS)
lios->c_cflag |= LINUX_CRTSCTS;
lios->c_lflag = 0;
if (bios->c_lflag & ISIG)
lios->c_lflag |= LINUX_ISIG;
if (bios->c_lflag & ICANON)
lios->c_lflag |= LINUX_ICANON;
if (bios->c_lflag & ECHO)
lios->c_lflag |= LINUX_ECHO;
if (bios->c_lflag & ECHOE)
lios->c_lflag |= LINUX_ECHOE;
if (bios->c_lflag & ECHOK)
lios->c_lflag |= LINUX_ECHOK;
if (bios->c_lflag & ECHONL)
lios->c_lflag |= LINUX_ECHONL;
if (bios->c_lflag & NOFLSH)
lios->c_lflag |= LINUX_NOFLSH;
if (bios->c_lflag & TOSTOP)
lios->c_lflag |= LINUX_TOSTOP;
if (bios->c_lflag & ECHOCTL)
lios->c_lflag |= LINUX_ECHOCTL;
if (bios->c_lflag & ECHOPRT)
lios->c_lflag |= LINUX_ECHOPRT;
if (bios->c_lflag & ECHOKE)
lios->c_lflag |= LINUX_ECHOKE;
if (bios->c_lflag & FLUSHO)
lios->c_lflag |= LINUX_FLUSHO;
if (bios->c_lflag & PENDIN)
lios->c_lflag |= LINUX_PENDIN;
if (bios->c_lflag & IEXTEN)
lios->c_lflag |= LINUX_IEXTEN;
for (i=0; i<LINUX_NCCS; i++)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR];
lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT];
lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE];
lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL];
lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF];
lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL];
lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN];
lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME];
lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2];
lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP];
lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART];
lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP];
lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT];
lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD];
lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE];
lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT];
for (i=0; i<LINUX_NCCS; i++) {
if (i != LINUX_VMIN && i != LINUX_VTIME &&
lios->c_cc[i] == _POSIX_VDISABLE)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
}
lios->c_line = 0;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: LINUX termios structure (output):\n");
printf("i=%08x o=%08x c=%08x l=%08x line=%d\n",
lios->c_iflag, lios->c_oflag, lios->c_cflag,
lios->c_lflag, (int)lios->c_line);
printf("c_cc ");
for (i=0; i<LINUX_NCCS; i++)
printf("%02x ", lios->c_cc[i]);
printf("\n");
}
#endif
}
static void
linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios)
{
int i;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: LINUX termios structure (input):\n");
printf("i=%08x o=%08x c=%08x l=%08x line=%d\n",
lios->c_iflag, lios->c_oflag, lios->c_cflag,
lios->c_lflag, (int)lios->c_line);
printf("c_cc ");
for (i=0; i<LINUX_NCCS; i++)
printf("%02x ", lios->c_cc[i]);
printf("\n");
}
#endif
bios->c_iflag = 0;
if (lios->c_iflag & LINUX_IGNBRK)
bios->c_iflag |= IGNBRK;
if (lios->c_iflag & LINUX_BRKINT)
bios->c_iflag |= BRKINT;
if (lios->c_iflag & LINUX_IGNPAR)
bios->c_iflag |= IGNPAR;
if (lios->c_iflag & LINUX_PARMRK)
bios->c_iflag |= PARMRK;
if (lios->c_iflag & LINUX_INPCK)
bios->c_iflag |= INPCK;
if (lios->c_iflag & LINUX_ISTRIP)
bios->c_iflag |= ISTRIP;
if (lios->c_iflag & LINUX_INLCR)
bios->c_iflag |= INLCR;
if (lios->c_iflag & LINUX_IGNCR)
bios->c_iflag |= IGNCR;
if (lios->c_iflag & LINUX_ICRNL)
bios->c_iflag |= ICRNL;
if (lios->c_iflag & LINUX_IXON)
bios->c_iflag |= IXON;
if (lios->c_iflag & LINUX_IXANY)
bios->c_iflag |= IXANY;
if (lios->c_iflag & LINUX_IXOFF)
bios->c_iflag |= IXOFF;
if (lios->c_iflag & LINUX_IMAXBEL)
bios->c_iflag |= IMAXBEL;
bios->c_oflag = 0;
if (lios->c_oflag & LINUX_OPOST)
bios->c_oflag |= OPOST;
if (lios->c_oflag & LINUX_ONLCR)
bios->c_oflag |= ONLCR;
if (lios->c_oflag & LINUX_XTABS)
bios->c_oflag |= TAB3;
bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4;
if (lios->c_cflag & LINUX_CSTOPB)
bios->c_cflag |= CSTOPB;
if (lios->c_cflag & LINUX_CREAD)
bios->c_cflag |= CREAD;
if (lios->c_cflag & LINUX_PARENB)
bios->c_cflag |= PARENB;
if (lios->c_cflag & LINUX_PARODD)
bios->c_cflag |= PARODD;
if (lios->c_cflag & LINUX_HUPCL)
bios->c_cflag |= HUPCL;
if (lios->c_cflag & LINUX_CLOCAL)
bios->c_cflag |= CLOCAL;
if (lios->c_cflag & LINUX_CRTSCTS)
bios->c_cflag |= CRTSCTS;
bios->c_lflag = 0;
if (lios->c_lflag & LINUX_ISIG)
bios->c_lflag |= ISIG;
if (lios->c_lflag & LINUX_ICANON)
bios->c_lflag |= ICANON;
if (lios->c_lflag & LINUX_ECHO)
bios->c_lflag |= ECHO;
if (lios->c_lflag & LINUX_ECHOE)
bios->c_lflag |= ECHOE;
if (lios->c_lflag & LINUX_ECHOK)
bios->c_lflag |= ECHOK;
if (lios->c_lflag & LINUX_ECHONL)
bios->c_lflag |= ECHONL;
if (lios->c_lflag & LINUX_NOFLSH)
bios->c_lflag |= NOFLSH;
if (lios->c_lflag & LINUX_TOSTOP)
bios->c_lflag |= TOSTOP;
if (lios->c_lflag & LINUX_ECHOCTL)
bios->c_lflag |= ECHOCTL;
if (lios->c_lflag & LINUX_ECHOPRT)
bios->c_lflag |= ECHOPRT;
if (lios->c_lflag & LINUX_ECHOKE)
bios->c_lflag |= ECHOKE;
if (lios->c_lflag & LINUX_FLUSHO)
bios->c_lflag |= FLUSHO;
if (lios->c_lflag & LINUX_PENDIN)
bios->c_lflag |= PENDIN;
if (lios->c_lflag & LINUX_IEXTEN)
bios->c_lflag |= IEXTEN;
for (i=0; i<NCCS; i++)
bios->c_cc[i] = _POSIX_VDISABLE;
bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR];
bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT];
bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE];
bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL];
bios->c_cc[VEOF] = lios->c_cc[LINUX_VEOF];
bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL];
bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN];
bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME];
bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2];
bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP];
bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART];
bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP];
bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT];
bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD];
bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE];
bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT];
for (i=0; i<NCCS; i++) {
if (i != VMIN && i != VTIME &&
bios->c_cc[i] == LINUX_POSIX_VDISABLE)
bios->c_cc[i] = _POSIX_VDISABLE;
}
bios->c_ispeed = bios->c_ospeed =
linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab);
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: BSD termios structure (output):\n");
printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n",
bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag,
bios->c_ispeed, bios->c_ospeed);
printf("c_cc ");
for (i=0; i<NCCS; i++)
printf("%02x ", bios->c_cc[i]);
printf("\n");
}
#endif
}
static void
bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio)
{
struct linux_termios lios;
bsd_to_linux_termios(bios, &lios);
lio->c_iflag = lios.c_iflag;
lio->c_oflag = lios.c_oflag;
lio->c_cflag = lios.c_cflag;
lio->c_lflag = lios.c_lflag;
lio->c_line = lios.c_line;
memcpy(lio->c_cc, lios.c_cc, LINUX_NCC);
}
static void
linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios)
{
struct linux_termios lios;
int i;
lios.c_iflag = lio->c_iflag;
lios.c_oflag = lio->c_oflag;
lios.c_cflag = lio->c_cflag;
lios.c_lflag = lio->c_lflag;
for (i=LINUX_NCC; i<LINUX_NCCS; i++)
lios.c_cc[i] = LINUX_POSIX_VDISABLE;
memcpy(lios.c_cc, lio->c_cc, LINUX_NCC);
linux_to_bsd_termios(&lios, bios);
}
static int
linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args)
{
struct termios bios;
struct linux_termios lios;
struct linux_termio lio;
struct file *fp;
int error;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_TCGETS:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termios(&bios, &lios);
error = copyout(&lios, (void *)args->arg, sizeof(lios));
break;
case LINUX_TCSETS:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSW:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSF:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCGETA:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termio(&bios, &lio);
error = (copyout(&lio, (void *)args->arg, sizeof(lio)));
break;
case LINUX_TCSETA:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAW:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAF:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
/* LINUX_TCSBRK */
case LINUX_TCXONC: {
switch (args->arg) {
case LINUX_TCOOFF:
args->cmd = TIOCSTOP;
break;
case LINUX_TCOON:
args->cmd = TIOCSTART;
break;
case LINUX_TCIOFF:
case LINUX_TCION: {
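/*
 * Simulate TCIOFF/TCION by writing the tty's VSTOP/VSTART
 * character back to the file descriptor.
 */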
int c;
struct write_args wr;
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios,
td->td_ucred, td);
if (error)
break;
fdrop(fp, td);
c = (args->arg == LINUX_TCIOFF) ? VSTOP : VSTART;
c = bios.c_cc[c];
if (c != _POSIX_VDISABLE) {
wr.fd = args->fd;
wr.buf = &c;
wr.nbyte = sizeof(c);
return (write(td, &wr));
} else
return (0);
}
default:
fdrop(fp, td);
return (EINVAL);
}
args->arg = 0;
error = (ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_TCFLSH: {
int val;
switch (args->arg) {
case LINUX_TCIFLUSH:
val = FREAD;
break;
case LINUX_TCOFLUSH:
val = FWRITE;
break;
case LINUX_TCIOFLUSH:
val = FREAD | FWRITE;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp,TIOCFLUSH,(caddr_t)&val,td->td_ucred,td));
break;
}
case LINUX_TIOCEXCL:
args->cmd = TIOCEXCL;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNXCL:
args->cmd = TIOCNXCL;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSCTTY:
args->cmd = TIOCSCTTY;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPGRP:
args->cmd = TIOCGPGRP;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSPGRP:
args->cmd = TIOCSPGRP;
error = (ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCOUTQ */
/* LINUX_TIOCSTI */
case LINUX_TIOCGWINSZ:
args->cmd = TIOCGWINSZ;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSWINSZ:
args->cmd = TIOCSWINSZ;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMGET:
args->cmd = TIOCMGET;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIS:
args->cmd = TIOCMBIS;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIC:
args->cmd = TIOCMBIC;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMSET:
args->cmd = TIOCMSET;
error = (ioctl(td, (struct ioctl_args *)args));
break;
/* TIOCGSOFTCAR */
/* TIOCSSOFTCAR */
case LINUX_FIONREAD: /* LINUX_TIOCINQ */
args->cmd = FIONREAD;
error = (ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCLINUX */
case LINUX_TIOCCONS:
args->cmd = TIOCCONS;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGSERIAL: {
struct linux_serial_struct lss;
lss.type = LINUX_PORT_16550A;
lss.flags = 0;
lss.close_delay = 0;
error = copyout(&lss, (void *)args->arg, sizeof(lss));
break;
}
case LINUX_TIOCSSERIAL: {
struct linux_serial_struct lss;
error = copyin((void *)args->arg, &lss, sizeof(lss));
if (error)
break;
/* XXX - It really helps to have an implementation that
* does nothing. NOT!
*/
error = 0;
break;
}
case LINUX_TIOCPKT:
args->cmd = TIOCPKT;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIONBIO:
args->cmd = FIONBIO;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNOTTY:
args->cmd = TIOCNOTTY;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSETD: {
int line;
switch (args->arg) {
case LINUX_N_TTY:
line = TTYDISC;
break;
case LINUX_N_SLIP:
line = SLIPDISC;
break;
case LINUX_N_PPP:
line = PPPDISC;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td->td_ucred,
td));
break;
}
case LINUX_TIOCGETD: {
int linux_line;
int bsd_line = TTYDISC;
error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line,
td->td_ucred, td);
if (error)
return (error);
switch (bsd_line) {
case TTYDISC:
linux_line = LINUX_N_TTY;
break;
case SLIPDISC:
linux_line = LINUX_N_SLIP;
break;
case PPPDISC:
linux_line = LINUX_N_PPP;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (copyout(&linux_line, (void *)args->arg, sizeof(int)));
break;
}
/* LINUX_TCSBRKP */
/* LINUX_TIOCTTYGSTRUCT */
case LINUX_FIONCLEX:
args->cmd = FIONCLEX;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOCLEX:
args->cmd = FIOCLEX;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOASYNC:
args->cmd = FIOASYNC;
error = (ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCSERCONFIG */
/* LINUX_TIOCSERGWILD */
/* LINUX_TIOCSERSWILD */
/* LINUX_TIOCGLCKTRMIOS */
/* LINUX_TIOCSLCKTRMIOS */
case LINUX_TIOCSBRK:
args->cmd = TIOCSBRK;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCCBRK:
args->cmd = TIOCCBRK;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPTN: {
int nb;
error = fo_ioctl(fp, TIOCGPTN, (caddr_t)&nb, td->td_ucred, td);
if (!error)
error = copyout(&nb, (void *)args->arg,
sizeof(int));
break;
}
case LINUX_TIOCSPTLCK:
/* Our unlockpt() does nothing. */
error = 0;
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
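/*
* Most cases above share one pattern: rewrite args->cmd to the native
* command number and re-enter the kernel through the regular ioctl()
* path, so argument copying and descriptor handling happen once, in
* native code. Only commands whose argument itself needs translation
* (TIOCSETD, TIOCGETD, TIOCGPTN, ...) call fo_ioctl() directly.
*/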
/*
* CDROM related ioctls
*/
struct linux_cdrom_msf
{
u_char cdmsf_min0;
u_char cdmsf_sec0;
u_char cdmsf_frame0;
u_char cdmsf_min1;
u_char cdmsf_sec1;
u_char cdmsf_frame1;
};
struct linux_cdrom_tochdr
{
u_char cdth_trk0;
u_char cdth_trk1;
};
union linux_cdrom_addr
{
struct {
u_char minute;
u_char second;
u_char frame;
} msf;
int lba;
};
struct linux_cdrom_tocentry
{
u_char cdte_track;
u_char cdte_adr:4;
u_char cdte_ctrl:4;
u_char cdte_format;
union linux_cdrom_addr cdte_addr;
u_char cdte_datamode;
};
struct linux_cdrom_subchnl
{
u_char cdsc_format;
u_char cdsc_audiostatus;
u_char cdsc_adr:4;
u_char cdsc_ctrl:4;
u_char cdsc_trk;
u_char cdsc_ind;
union linux_cdrom_addr cdsc_absaddr;
union linux_cdrom_addr cdsc_reladdr;
};
struct l_cdrom_read_audio {
union linux_cdrom_addr addr;
u_char addr_format;
l_int nframes;
u_char *buf;
};
struct l_dvd_layer {
u_char book_version:4;
u_char book_type:4;
u_char min_rate:4;
u_char disc_size:4;
u_char layer_type:4;
u_char track_path:1;
u_char nlayers:2;
u_char track_density:4;
u_char linear_density:4;
u_char bca:1;
u_int32_t start_sector;
u_int32_t end_sector;
u_int32_t end_sector_l0;
};
struct l_dvd_physical {
u_char type;
u_char layer_num;
struct l_dvd_layer layer[4];
};
struct l_dvd_copyright {
u_char type;
u_char layer_num;
u_char cpst;
u_char rmi;
};
struct l_dvd_disckey {
u_char type;
l_uint agid:2;
u_char value[2048];
};
struct l_dvd_bca {
u_char type;
l_int len;
u_char value[188];
};
struct l_dvd_manufact {
u_char type;
u_char layer_num;
l_int len;
u_char value[2048];
};
typedef union {
u_char type;
struct l_dvd_physical physical;
struct l_dvd_copyright copyright;
struct l_dvd_disckey disckey;
struct l_dvd_bca bca;
struct l_dvd_manufact manufact;
} l_dvd_struct;
typedef u_char l_dvd_key[5];
typedef u_char l_dvd_challenge[10];
struct l_dvd_lu_send_agid {
u_char type;
l_uint agid:2;
};
struct l_dvd_host_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_send_key {
u_char type;
l_uint agid:2;
l_dvd_key key;
};
struct l_dvd_lu_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_lu_send_title_key {
u_char type;
l_uint agid:2;
l_dvd_key title_key;
l_int lba;
l_uint cpm:1;
l_uint cp_sec:1;
l_uint cgms:2;
};
struct l_dvd_lu_send_asf {
u_char type;
l_uint agid:2;
l_uint asf:1;
};
struct l_dvd_host_send_rpcstate {
u_char type;
u_char pdrc;
};
struct l_dvd_lu_send_rpcstate {
u_char type:2;
u_char vra:3;
u_char ucca:3;
u_char region_mask;
u_char rpc_scheme;
};
typedef union {
u_char type;
struct l_dvd_lu_send_agid lsa;
struct l_dvd_host_send_challenge hsc;
struct l_dvd_send_key lsk;
struct l_dvd_lu_send_challenge lsc;
struct l_dvd_send_key hsk;
struct l_dvd_lu_send_title_key lstk;
struct l_dvd_lu_send_asf lsasf;
struct l_dvd_host_send_rpcstate hrpcs;
struct l_dvd_lu_send_rpcstate lrpcs;
} l_dvd_authinfo;
static void
bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp)
{
if (af == CD_LBA_FORMAT)
lp->lba = bp->lba;
else {
lp->msf.minute = bp->msf.minute;
lp->msf.second = bp->msf.second;
lp->msf.frame = bp->msf.frame;
}
}
static void
set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba)
{
if (format == LINUX_CDROM_MSF) {
addr->msf.frame = lba % 75;
lba /= 75;
lba += 2;
addr->msf.second = lba % 60;
addr->msf.minute = lba / 60;
} else
addr->lba = lba;
}
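/*
* A worked example of the conversion above: audio CDs play 75 frames
* per second, and MSF addresses include the 2-second lead-in that LBA
* addressing skips. For lba = 16363:
* frame = 16363 % 75 = 13
* 16363 / 75 = 218 seconds; 218 + 2 = 220
* second = 220 % 60 = 40
* minute = 220 / 60 = 3 -> MSF 03:40.13
*/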
static int
linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp)
{
bp->format = lp->type;
switch (bp->format) {
case DVD_STRUCT_PHYSICAL:
if (lp->physical.layer_num >= 4)
return (EINVAL);
bp->layer_num = lp->physical.layer_num;
break;
case DVD_STRUCT_COPYRIGHT:
bp->layer_num = lp->copyright.layer_num;
break;
case DVD_STRUCT_DISCKEY:
bp->agid = lp->disckey.agid;
break;
case DVD_STRUCT_BCA:
case DVD_STRUCT_MANUFACT:
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp)
{
switch (bp->format) {
case DVD_STRUCT_PHYSICAL: {
struct dvd_layer *blp = (struct dvd_layer *)bp->data;
struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num];
memset(llp, 0, sizeof(*llp));
llp->book_version = blp->book_version;
llp->book_type = blp->book_type;
llp->min_rate = blp->max_rate;
llp->disc_size = blp->disc_size;
llp->layer_type = blp->layer_type;
llp->track_path = blp->track_path;
llp->nlayers = blp->nlayers;
llp->track_density = blp->track_density;
llp->linear_density = blp->linear_density;
llp->bca = blp->bca;
llp->start_sector = blp->start_sector;
llp->end_sector = blp->end_sector;
llp->end_sector_l0 = blp->end_sector_l0;
break;
}
case DVD_STRUCT_COPYRIGHT:
lp->copyright.cpst = bp->cpst;
lp->copyright.rmi = bp->rmi;
break;
case DVD_STRUCT_DISCKEY:
memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value));
break;
case DVD_STRUCT_BCA:
lp->bca.len = bp->length;
memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value));
break;
case DVD_STRUCT_MANUFACT:
lp->manufact.len = bp->length;
memcpy(lp->manufact.value, bp->data,
sizeof(lp->manufact.value));
/* lp->manufact.layer_num is unused in Linux (Red Hat 7.0) */
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode,
struct dvd_authinfo *bp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_CHALLENGE;
bp->agid = lp->hsc.agid;
memcpy(bp->keychal, lp->hsc.chal, 10);
break;
case LINUX_DVD_LU_SEND_KEY1:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_KEY1;
bp->agid = lp->lsk.agid;
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_CHALLENGE;
bp->agid = lp->lsc.agid;
break;
case LINUX_DVD_HOST_SEND_KEY2:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_KEY2;
bp->agid = lp->hsk.agid;
memcpy(bp->keychal, lp->hsk.key, 5);
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_TITLE_KEY;
bp->agid = lp->lstk.agid;
bp->lba = lp->lstk.lba;
break;
case LINUX_DVD_LU_SEND_ASF:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_ASF;
bp->agid = lp->lsasf.agid;
break;
case LINUX_DVD_INVALIDATE_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_INVALIDATE_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_RPC;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_RPC;
bp->region = lp->hrpcs.pdrc;
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
lp->lsa.agid = bp->agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
lp->type = LINUX_DVD_LU_SEND_KEY1;
break;
case LINUX_DVD_LU_SEND_KEY1:
memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key));
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal));
break;
case LINUX_DVD_HOST_SEND_KEY2:
lp->type = LINUX_DVD_AUTH_ESTABLISHED;
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
memcpy(lp->lstk.title_key, bp->keychal,
sizeof(lp->lstk.title_key));
lp->lstk.cpm = bp->cpm;
lp->lstk.cp_sec = bp->cp_sec;
lp->lstk.cgms = bp->cgms;
break;
case LINUX_DVD_LU_SEND_ASF:
lp->lsasf.asf = bp->asf;
break;
case LINUX_DVD_INVALIDATE_AGID:
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
lp->lrpcs.type = bp->reg_type;
lp->lrpcs.vra = bp->vend_rsts;
lp->lrpcs.ucca = bp->user_rsts;
lp->lrpcs.region_mask = bp->region;
lp->lrpcs.rpc_scheme = bp->rpc_scheme;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
break;
default:
return (EINVAL);
}
return (0);
}
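/*
* Taken together, the two translators above implement the Linux view
* of the CSS authentication handshake: userland steps through
* LU_SEND_AGID -> HOST_SEND_CHALLENGE -> LU_SEND_KEY1 ->
* LU_SEND_CHALLENGE -> HOST_SEND_KEY2, and on success the reply type
* is advanced (HOST_SEND_CHALLENGE becomes LU_SEND_KEY1,
* HOST_SEND_KEY2 becomes AUTH_ESTABLISHED) so the caller knows which
* step comes next.
*/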
static int
linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_CDROMPAUSE:
args->cmd = CDIOCPAUSE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMRESUME:
args->cmd = CDIOCRESUME;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYMSF:
args->cmd = CDIOCPLAYMSF;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYTRKIND:
args->cmd = CDIOCPLAYTRACKS;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMREADTOCHDR: {
struct ioc_toc_header th;
struct linux_cdrom_tochdr lth;
error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th,
td->td_ucred, td);
if (!error) {
lth.cdth_trk0 = th.starting_track;
lth.cdth_trk1 = th.ending_track;
error = copyout(&lth, (void *)args->arg, sizeof(lth));
}
break;
}
case LINUX_CDROMREADTOCENTRY: {
struct linux_cdrom_tocentry lte;
struct ioc_read_toc_single_entry irtse;
error = copyin((void *)args->arg, &lte, sizeof(lte));
if (error)
break;
irtse.address_format = lte.cdte_format;
irtse.track = lte.cdte_track;
error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse,
td->td_ucred, td);
if (!error) {
lte.cdte_ctrl = irtse.entry.control;
lte.cdte_adr = irtse.entry.addr_type;
bsd_to_linux_msf_lba(irtse.address_format,
&irtse.entry.addr, &lte.cdte_addr);
error = copyout(&lte, (void *)args->arg, sizeof(lte));
}
break;
}
case LINUX_CDROMSTOP:
args->cmd = CDIOCSTOP;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMSTART:
args->cmd = CDIOCSTART;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMEJECT:
args->cmd = CDIOCEJECT;
error = (ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLCTRL */
case LINUX_CDROMSUBCHNL: {
struct linux_cdrom_subchnl sc;
struct ioc_read_subchannel bsdsc;
struct cd_sub_channel_info bsdinfo;
bsdsc.address_format = CD_LBA_FORMAT;
bsdsc.data_format = CD_CURRENT_POSITION;
bsdsc.track = 0;
bsdsc.data_len = sizeof(bsdinfo);
bsdsc.data = &bsdinfo;
error = fo_ioctl(fp, CDIOCREADSUBCHANNEL_SYSSPACE,
(caddr_t)&bsdsc, td->td_ucred, td);
if (error)
break;
error = copyin((void *)args->arg, &sc, sizeof(sc));
if (error)
break;
sc.cdsc_audiostatus = bsdinfo.header.audio_status;
sc.cdsc_adr = bsdinfo.what.position.addr_type;
sc.cdsc_ctrl = bsdinfo.what.position.control;
sc.cdsc_trk = bsdinfo.what.position.track_number;
sc.cdsc_ind = bsdinfo.what.position.index_number;
set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format,
bsdinfo.what.position.absaddr.lba);
set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format,
bsdinfo.what.position.reladdr.lba);
error = copyout(&sc, (void *)args->arg, sizeof(sc));
break;
}
/* LINUX_CDROMREADMODE2 */
/* LINUX_CDROMREADMODE1 */
/* LINUX_CDROMREADAUDIO */
/* LINUX_CDROMEJECT_SW */
/* LINUX_CDROMMULTISESSION */
/* LINUX_CDROM_GET_UPC */
case LINUX_CDROMRESET:
args->cmd = CDIOCRESET;
error = (ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLREAD */
/* LINUX_CDROMREADRAW */
/* LINUX_CDROMREADCOOKED */
/* LINUX_CDROMSEEK */
/* LINUX_CDROMPLAYBLK */
/* LINUX_CDROMREADALL */
/* LINUX_CDROMCLOSETRAY */
/* LINUX_CDROMLOADFROMSLOT */
/* LINUX_CDROMGETSPINDOWN */
/* LINUX_CDROMSETSPINDOWN */
/* LINUX_CDROM_SET_OPTIONS */
/* LINUX_CDROM_CLEAR_OPTIONS */
/* LINUX_CDROM_SELECT_SPEED */
/* LINUX_CDROM_SELECT_DISC */
/* LINUX_CDROM_MEDIA_CHANGED */
/* LINUX_CDROM_DRIVE_STATUS */
/* LINUX_CDROM_DISC_STATUS */
/* LINUX_CDROM_CHANGER_NSLOTS */
/* LINUX_CDROM_LOCKDOOR */
/* LINUX_CDROM_DEBUG */
/* LINUX_CDROM_GET_CAPABILITY */
/* LINUX_CDROMAUDIOBUFSIZ */
case LINUX_DVD_READ_STRUCT: {
l_dvd_struct lds;
struct dvd_struct bds;
error = copyin((void *)args->arg, &lds, sizeof(lds));
if (error)
break;
error = linux_to_bsd_dvd_struct(&lds, &bds);
if (error)
break;
error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)&bds,
td->td_ucred, td);
if (error)
break;
error = bsd_to_linux_dvd_struct(&bds, &lds);
if (error)
break;
error = copyout(&lds, (void *)args->arg, sizeof(lds));
break;
}
/* LINUX_DVD_WRITE_STRUCT */
case LINUX_DVD_AUTH: {
l_dvd_authinfo lda;
struct dvd_authinfo bda;
int bcode;
error = copyin((void *)args->arg, &lda, sizeof(lda));
if (error)
break;
error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda);
if (error)
break;
error = fo_ioctl(fp, bcode, (caddr_t)&bda, td->td_ucred,
td);
if (error) {
if (lda.type == LINUX_DVD_HOST_SEND_KEY2) {
lda.type = LINUX_DVD_AUTH_FAILURE;
copyout(&lda, (void *)args->arg, sizeof(lda));
}
break;
}
error = bsd_to_linux_dvd_authinfo(&bda, &lda);
if (error)
break;
error = copyout(&lda, (void *)args->arg, sizeof(lda));
break;
}
case LINUX_SCSI_GET_BUS_NUMBER:
case LINUX_SCSI_GET_IDLUN:
error = linux_ioctl_sg(td, args);
break;
/* LINUX_CDROM_SEND_PACKET */
/* LINUX_CDROM_NEXT_WRITABLE */
/* LINUX_CDROM_LAST_WRITTEN */
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
static int
linux_ioctl_vfat(struct thread *td, struct linux_ioctl_args *args)
{
return (ENOTTY);
}
/*
* Sound related ioctls
*/
struct linux_mixer_info {
char id[16];
char name[32];
int modify_counter;
int fillers[10];
};
struct linux_old_mixer_info {
char id[16];
char name[32];
};
static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT };
#define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30])
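/*
* SETDIR() grafts the Linux direction bits onto a native command:
* Linux keeps the transfer direction in the top two bits of the
* command word, so dirbits[args->cmd >> 30] looks up the matching
* FreeBSD direction (e.g. _IOC_WRITE, value 1, maps to IOC_IN) and
* substitutes it for the IOC_DIRMASK bits of the native command.
*/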
static int
linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args)
{
switch (args->cmd & 0xffff) {
case LINUX_SOUND_MIXER_WRITE_VOLUME:
args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_BASS:
args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_TREBLE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_TREBLE);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SYNTH:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_PCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SPEAKER:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_MIC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_CD:
args->cmd = SETDIR(SOUND_MIXER_WRITE_CD);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IMIX:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_ALTPCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECLEV:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_OGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE1:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE2:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE3:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_INFO: {
/* Key on encoded length */
switch ((args->cmd >> 16) & 0x1fff) {
case 0x005c: { /* SOUND_MIXER_INFO */
struct linux_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
return (copyout(&info, (void *)args->arg, sizeof(info)));
}
case 0x0030: { /* SOUND_OLD_MIXER_INFO */
struct linux_old_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
return (copyout(&info, (void *)args->arg, sizeof(info)));
}
default:
return (ENOIOCTL);
}
break;
}
case LINUX_OSS_GETVERSION: {
int version = linux_get_oss_version(td);
return (copyout(&version, (void *)args->arg, sizeof(int)));
}
case LINUX_SOUND_MIXER_READ_STEREODEVS:
args->cmd = SOUND_MIXER_READ_STEREODEVS;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_RECMASK:
args->cmd = SOUND_MIXER_READ_RECMASK;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_DEVMASK:
args->cmd = SOUND_MIXER_READ_DEVMASK;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECSRC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC);
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_RESET:
args->cmd = SNDCTL_DSP_RESET;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SYNC:
args->cmd = SNDCTL_DSP_SYNC;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SPEED:
args->cmd = SNDCTL_DSP_SPEED;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_STEREO:
args->cmd = SNDCTL_DSP_STEREO;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */
args->cmd = SNDCTL_DSP_GETBLKSIZE;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFMT:
args->cmd = SNDCTL_DSP_SETFMT;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_CHANNELS:
args->cmd = SOUND_PCM_WRITE_CHANNELS;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_FILTER:
args->cmd = SOUND_PCM_WRITE_FILTER;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_POST:
args->cmd = SNDCTL_DSP_POST;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SUBDIVIDE:
args->cmd = SNDCTL_DSP_SUBDIVIDE;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFRAGMENT:
args->cmd = SNDCTL_DSP_SETFRAGMENT;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETFMTS:
args->cmd = SNDCTL_DSP_GETFMTS;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOSPACE:
args->cmd = SNDCTL_DSP_GETOSPACE;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETISPACE:
args->cmd = SNDCTL_DSP_GETISPACE;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_NONBLOCK:
args->cmd = SNDCTL_DSP_NONBLOCK;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETCAPS:
args->cmd = SNDCTL_DSP_GETCAPS;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */
args->cmd = SNDCTL_DSP_SETTRIGGER;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETIPTR:
args->cmd = SNDCTL_DSP_GETIPTR;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOPTR:
args->cmd = SNDCTL_DSP_GETOPTR;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETDUPLEX:
args->cmd = SNDCTL_DSP_SETDUPLEX;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETODELAY:
args->cmd = SNDCTL_DSP_GETODELAY;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESET:
args->cmd = SNDCTL_SEQ_RESET;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_SYNC:
args->cmd = SNDCTL_SEQ_SYNC;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_INFO:
args->cmd = SNDCTL_SYNTH_INFO;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_CTRLRATE:
args->cmd = SNDCTL_SEQ_CTRLRATE;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETOUTCOUNT:
args->cmd = SNDCTL_SEQ_GETOUTCOUNT;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETINCOUNT:
args->cmd = SNDCTL_SEQ_GETINCOUNT;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_PERCMODE:
args->cmd = SNDCTL_SEQ_PERCMODE;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_FM_LOAD_INSTR:
args->cmd = SNDCTL_FM_LOAD_INSTR;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TESTMIDI:
args->cmd = SNDCTL_SEQ_TESTMIDI;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESETSAMPLES:
args->cmd = SNDCTL_SEQ_RESETSAMPLES;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRSYNTHS:
args->cmd = SNDCTL_SEQ_NRSYNTHS;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRMIDIS:
args->cmd = SNDCTL_SEQ_NRMIDIS;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_MIDI_INFO:
args->cmd = SNDCTL_MIDI_INFO;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TRESHOLD:
args->cmd = SNDCTL_SEQ_TRESHOLD;
return (ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_MEMAVL:
args->cmd = SNDCTL_SYNTH_MEMAVL;
return (ioctl(td, (struct ioctl_args *)args));
}
return (ENOIOCTL);
}
/*
* Console related ioctls
*/
#define ISSIGVALID(sig) ((sig) > 0 && (sig) < NSIG)
static int
linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_KIOCSOUND:
args->cmd = KIOCSOUND;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDMKTONE:
args->cmd = KDMKTONE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETLED:
args->cmd = KDGETLED;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETLED:
args->cmd = KDSETLED;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETMODE:
args->cmd = KDSETMODE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETMODE:
args->cmd = KDGETMODE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGKBMODE:
args->cmd = KDGKBMODE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSKBMODE: {
int kbdmode;
switch (args->arg) {
case LINUX_KBD_RAW:
kbdmode = K_RAW;
break;
case LINUX_KBD_XLATE:
kbdmode = K_XLATE;
break;
case LINUX_KBD_MEDIUMRAW:
kbdmode = K_RAW;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode,
td->td_ucred, td));
break;
}
case LINUX_VT_OPENQRY:
args->cmd = VT_OPENQRY;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_GETMODE:
args->cmd = VT_GETMODE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_SETMODE: {
struct vt_mode mode;
if ((error = copyin((void *)args->arg, &mode, sizeof(mode))))
break;
if (!ISSIGVALID(mode.frsig) && ISSIGVALID(mode.acqsig))
mode.frsig = mode.acqsig;
if ((error = copyout(&mode, (void *)args->arg, sizeof(mode))))
break;
args->cmd = VT_SETMODE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_VT_GETSTATE:
args->cmd = VT_GETACTIVE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_RELDISP:
args->cmd = VT_RELDISP;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_ACTIVATE:
args->cmd = VT_ACTIVATE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_WAITACTIVE:
args->cmd = VT_WAITACTIVE;
error = (ioctl(td, (struct ioctl_args *)args));
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* Criteria for interface name translation
*/
#define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER)
/*
* Interface function used by linprocfs (at the time of writing). It's not
* used by the Linuxulator itself.
*/
int
linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen)
{
+ INIT_VNET_NET(ifp->if_vnet);
struct ifnet *ifscan;
int ethno;
/* Short-circuit non ethernet interfaces */
if (!IFP_IS_ETH(ifp))
return (strlcpy(buffer, ifp->if_xname, buflen));
/* Determine the (relative) unit number for ethernet interfaces */
ethno = 0;
IFNET_RLOCK();
TAILQ_FOREACH(ifscan, &V_ifnet, if_link) {
if (ifscan == ifp) {
IFNET_RUNLOCK();
return (snprintf(buffer, buflen, "eth%d", ethno));
}
if (IFP_IS_ETH(ifscan))
ethno++;
}
IFNET_RUNLOCK();
return (0);
}
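/*
* Example: on a hypothetical machine with em0, lo0 and em1 configured,
* em0 is reported as "eth0" and em1 as "eth1" (only IFT_ETHER
* interfaces consume a unit number), while lo0 keeps its native name.
*/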
/*
* Translate a Linux interface name to a FreeBSD interface name,
* and return the associated ifnet structure
* bsdname and lxname need to be at least IFNAMSIZ bytes long, but
* can point to the same buffer.
*/
static struct ifnet *
ifname_linux_to_bsd(const char *lxname, char *bsdname)
{
+ INIT_VNET_NET(TD_TO_VNET(curthread));
struct ifnet *ifp;
int len, unit;
char *ep;
int is_eth, index;
for (len = 0; len < LINUX_IFNAMSIZ; ++len)
if (!isalpha(lxname[len]))
break;
if (len == 0 || len == LINUX_IFNAMSIZ)
return (NULL);
unit = (int)strtoul(lxname + len, &ep, 10);
if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ)
return (NULL);
index = 0;
is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/*
* Allow Linux programs to use FreeBSD names. Don't presume an
* interface named "eth" can never exist, so always do the literal
* name comparison instead of skipping it when is_eth is set.
*/
if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0)
break;
if (is_eth && IFP_IS_ETH(ifp) && unit == index++)
break;
}
IFNET_RUNLOCK();
if (ifp != NULL)
strlcpy(bsdname, ifp->if_xname, IFNAMSIZ);
return (ifp);
}
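/*
* Example of the reverse mapping: "eth1" matches the second IFT_ETHER
* interface on the list, while a native name such as "lo0" is caught
* by the literal if_xname comparison, so Linux binaries may also pass
* FreeBSD interface names unchanged.
*/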
/*
* Implement the SIOCGIFCONF ioctl
*/
static int
linux_ifconf(struct thread *td, struct ifconf *uifc)
{
+ INIT_VNET_NET(TD_TO_VNET(td));
#ifdef COMPAT_LINUX32
struct l_ifconf ifc;
#else
struct ifconf ifc;
#endif
struct l_ifreq ifr;
struct ifnet *ifp;
struct ifaddr *ifa;
struct sbuf *sb;
int error, ethno, full = 0, valid_len, max_len;
error = copyin(uifc, &ifc, sizeof(ifc));
if (error != 0)
return (error);
max_len = MAXPHYS - 1;
/* handle the 'request buffer size' case */
if (ifc.ifc_buf == PTROUT(NULL)) {
ifc.ifc_len = 0;
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET)
ifc.ifc_len += sizeof(ifr);
}
}
error = copyout(&ifc, uifc, sizeof(ifc));
return (error);
}
if (ifc.ifc_len <= 0)
return (EINVAL);
again:
/* Keep track of eth interfaces */
ethno = 0;
if (ifc.ifc_len <= max_len) {
max_len = ifc.ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
/* Return all AF_INET addresses of all interfaces */
IFNET_RLOCK(); /* could sleep XXX */
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs = 0;
bzero(&ifr, sizeof(ifr));
if (IFP_IS_ETH(ifp))
snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d",
ethno++);
else
strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ);
/* Walk the address list */
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET) {
ifr.ifr_addr.sa_family = LINUX_AF_INET;
memcpy(ifr.ifr_addr.sa_data, sa->sa_data,
sizeof(ifr.ifr_addr.sa_data));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
addrs++;
}
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc.ifc_len = valid_len;
sbuf_finish(sb);
error = copyout(sbuf_data(sb), PTRIN(ifc.ifc_buf), ifc.ifc_len);
if (error == 0)
error = copyout(&ifc, uifc, sizeof(ifc));
sbuf_delete(sb);
return (error);
}
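/*
* The function above uses a measure-and-retry pattern: records are
* appended to an sbuf sized to the smaller of the caller's buffer and
* MAXPHYS - 1, max_len re-counts the bytes actually required and
* valid_len tracks how much fit without overflowing. If the sbuf
* overflowed and the caller's buffer was not the limiting factor
* (!full), the pass is repeated with the required size now known.
*/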
static int
linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr)
{
l_short flags;
flags = (ifp->if_flags | ifp->if_drv_flags) & 0xffff;
/* these flags have no Linux equivalent */
flags &= ~(IFF_SMART|IFF_DRV_OACTIVE|IFF_SIMPLEX|
IFF_LINK0|IFF_LINK1|IFF_LINK2);
/* Linux' multicast flag is in a different bit */
if (flags & IFF_MULTICAST) {
flags &= ~IFF_MULTICAST;
flags |= 0x1000;
}
return (copyout(&flags, &ifr->ifr_flags, sizeof(flags)));
}
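/*
* The multicast fixup above is needed because FreeBSD keeps
* IFF_MULTICAST in bit 0x8000 while Linux uses 0x1000; the low-order
* flags (IFF_UP, IFF_BROADCAST, IFF_LOOPBACK, ...) happen to share
* values on both systems and pass through untouched.
*/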
#define ARPHRD_ETHER 1
#define ARPHRD_LOOPBACK 772
static int
linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
struct l_sockaddr lsa;
if (ifp->if_type == IFT_LOOP) {
bzero(&lsa, sizeof(lsa));
lsa.sa_family = ARPHRD_LOOPBACK;
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
if (ifp->if_type != IFT_ETHER)
return (ENOENT);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sdl = (struct sockaddr_dl*)ifa->ifa_addr;
if (sdl != NULL && (sdl->sdl_family == AF_LINK) &&
(sdl->sdl_type == IFT_ETHER)) {
bzero(&lsa, sizeof(lsa));
lsa.sa_family = ARPHRD_ETHER;
bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN);
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
}
return (ENOENT);
}
/*
* If bsd_to_linux_ifreq() faults on the user address, the native
* ioctl() we call with the same address will fault too, so callers
* don't really need to check this function's return value.
*/
static int
bsd_to_linux_ifreq(struct ifreq *arg)
{
struct ifreq ifr;
size_t ifr_len = sizeof(struct ifreq);
int error;
if ((error = copyin(arg, &ifr, ifr_len)))
return (error);
*(u_short *)&ifr.ifr_addr = ifr.ifr_addr.sa_family;
error = copyout(&ifr, arg, ifr_len);
return (error);
}
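/*
* The in-place fixup works because of the layout difference: a FreeBSD
* sockaddr begins with the one-byte sa_len/sa_family pair, whereas
* Linux expects a 16-bit sa_family at offset zero, so storing the
* family through a u_short pointer overwrites both bytes at once.
*/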
/*
* Socket related ioctls
*/
static int
linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args)
{
char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ];
struct ifnet *ifp;
struct file *fp;
int error, type;
ifp = NULL;
error = 0;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type != DTYPE_SOCKET) {
/* not a socket - probably a tap / vmnet device */
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFFLAGS:
return (linux_ioctl_special(td, args));
default:
return (ENOIOCTL);
}
}
switch (args->cmd & 0xffff) {
case LINUX_FIOGETOWN:
case LINUX_FIOSETOWN:
case LINUX_SIOCADDMULTI:
case LINUX_SIOCATMARK:
case LINUX_SIOCDELMULTI:
case LINUX_SIOCGIFCONF:
case LINUX_SIOCGPGRP:
case LINUX_SIOCSPGRP:
case LINUX_SIOCGIFCOUNT:
/* these ioctls don't take an interface name */
#ifdef DEBUG
printf("%s(): ioctl %d\n", __func__,
args->cmd & 0xffff);
#endif
break;
case LINUX_SIOCGIFFLAGS:
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFDSTADDR:
case LINUX_SIOCGIFBRDADDR:
case LINUX_SIOCGIFNETMASK:
case LINUX_SIOCSIFNETMASK:
case LINUX_SIOCGIFMTU:
case LINUX_SIOCSIFMTU:
case LINUX_SIOCSIFNAME:
case LINUX_SIOCGIFHWADDR:
case LINUX_SIOCSIFHWADDR:
case LINUX_SIOCDEVPRIVATE:
case LINUX_SIOCDEVPRIVATE+1:
case LINUX_SIOCGIFINDEX:
/* copy in the interface name and translate it. */
error = copyin((void *)args->arg, lifname, LINUX_IFNAMSIZ);
if (error != 0)
return (error);
#ifdef DEBUG
printf("%s(): ioctl %d on %.*s\n", __func__,
args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname);
#endif
ifp = ifname_linux_to_bsd(lifname, ifname);
if (ifp == NULL)
return (EINVAL);
/*
* We need to copy it back out in case we pass the
* request on to our native ioctl(), which will expect
* the ifreq to be in user space and have the correct
* interface name.
*/
error = copyout(ifname, (void *)args->arg, IFNAMSIZ);
if (error != 0)
return (error);
#ifdef DEBUG
printf("%s(): %s translated to %s\n", __func__,
lifname, ifname);
#endif
break;
default:
return (ENOIOCTL);
}
switch (args->cmd & 0xffff) {
case LINUX_FIOSETOWN:
args->cmd = FIOSETOWN;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSPGRP:
args->cmd = SIOCSPGRP;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_FIOGETOWN:
args->cmd = FIOGETOWN;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGPGRP:
args->cmd = SIOCGPGRP;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCATMARK:
args->cmd = SIOCATMARK;
error = ioctl(td, (struct ioctl_args *)args);
break;
/* LINUX_SIOCGSTAMP */
case LINUX_SIOCGIFCONF:
error = linux_ifconf(td, (struct ifconf *)args->arg);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
error = ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFADDR:
/* XXX probably doesn't work, included for completeness */
args->cmd = SIOCSIFADDR;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFDSTADDR:
args->cmd = SIOCGIFDSTADDR;
error = ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFBRDADDR:
args->cmd = SIOCGIFBRDADDR;
error = ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFNETMASK:
args->cmd = SIOCGIFNETMASK;
error = ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFNETMASK:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFMTU:
args->cmd = SIOCGIFMTU;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFMTU:
args->cmd = SIOCSIFMTU;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFNAME:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFHWADDR:
error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCSIFHWADDR:
error = ENOIOCTL;
break;
case LINUX_SIOCADDMULTI:
args->cmd = SIOCADDMULTI;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDELMULTI:
args->cmd = SIOCDELMULTI;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFINDEX:
args->cmd = SIOCGIFINDEX;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFCOUNT:
error = 0;
break;
/*
* XXX This is slightly bogus, but these ioctls are currently
* XXX only used by the aironet (if_an) network driver.
*/
case LINUX_SIOCDEVPRIVATE:
args->cmd = SIOCGPRIVATE_0;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDEVPRIVATE+1:
args->cmd = SIOCGPRIVATE_1;
error = ioctl(td, (struct ioctl_args *)args);
break;
}
if (ifp != NULL)
/* restore the original interface name */
copyout(lifname, (void *)args->arg, LINUX_IFNAMSIZ);
#ifdef DEBUG
printf("%s(): returning %d\n", __func__, error);
#endif
return (error);
}
/*
* Device private ioctl handler
*/
static int
linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error, type;
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type == DTYPE_SOCKET)
return (linux_ioctl_socket(td, args));
return (ENOIOCTL);
}
/*
* DRM ioctl handler (sys/dev/drm)
*/
static int
linux_ioctl_drm(struct thread *td, struct linux_ioctl_args *args)
{
args->cmd = SETDIR(args->cmd);
return (ioctl(td, (struct ioctl_args *)args));
}
static int
linux_ioctl_sg(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
u_long cmd;
int error;
if ((error = fget(td, args->fd, &fp)) != 0) {
printf("sg_linux_ioctl: fget returned %d\n", error);
return (error);
}
cmd = args->cmd;
error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td->td_ucred, td));
fdrop(fp, td);
return (error);
}
/*
* Special ioctl handler
*/
static int
linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args)
{
int error;
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFADDR:
args->cmd = SIOCSIFADDR;
error = ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
error = ioctl(td, (struct ioctl_args *)args);
break;
default:
error = ENOIOCTL;
}
return (error);
}
/*
* main ioctl syscall function
*/
int
linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
struct handler_element *he;
int error, cmd;
#ifdef DEBUG
if (ldebug(ioctl))
printf(ARGS(ioctl, "%d, %04lx, *"), args->fd,
(unsigned long)args->cmd);
#endif
if ((error = fget(td, args->fd, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD|FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
/* Iterate over the ioctl handlers */
cmd = args->cmd & 0xffff;
sx_slock(&linux_ioctl_sx);
mtx_lock(&Giant);
TAILQ_FOREACH(he, &handlers, list) {
if (cmd >= he->low && cmd <= he->high) {
error = (*he->func)(td, args);
if (error != ENOIOCTL) {
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
return (error);
}
}
}
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented",
args->fd, (int)(args->cmd & 0xffff),
(int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff));
return (EINVAL);
}
int
linux_ioctl_register_handler(struct linux_ioctl_handler *h)
{
struct handler_element *he, *cur;
if (h == NULL || h->func == NULL)
return (EINVAL);
/*
* Reuse the element if the handler is already on the list, otherwise
* create a new element.
*/
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &handlers, list) {
if (he->func == h->func)
break;
}
if (he == NULL) {
MALLOC(he, struct handler_element *, sizeof(*he),
M_LINUX, M_WAITOK);
he->func = h->func;
} else
TAILQ_REMOVE(&handlers, he, list);
/* Initialize range information. */
he->low = h->low;
he->high = h->high;
he->span = h->high - h->low + 1;
/* Add the element to the list, sorted on span. */
TAILQ_FOREACH(cur, &handlers, list) {
if (cur->span > he->span) {
TAILQ_INSERT_BEFORE(cur, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
}
TAILQ_INSERT_TAIL(&handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
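/*
* The list is kept sorted by ascending span so that, where ranges
* overlap, the most specific (narrowest) handler gets the first shot
* at a command; linux_ioctl() walks the list in order and stops at the
* first handler that does not return ENOIOCTL.
*/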
int
linux_ioctl_unregister_handler(struct linux_ioctl_handler *h)
{
struct handler_element *he;
if (h == NULL || h->func == NULL)
return (EINVAL);
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &handlers, list) {
if (he->func == h->func) {
TAILQ_REMOVE(&handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
FREE(he, M_LINUX);
return (0);
}
}
sx_xunlock(&linux_ioctl_sx);
return (EINVAL);
}
Index: head/sys/compat/linux/linux_misc.c
===================================================================
--- head/sys/compat/linux/linux_misc.c (revision 183549)
+++ head/sys/compat/linux/linux_misc.c (revision 183550)
@@ -1,1864 +1,1865 @@
/*-
* Copyright (c) 2002 Doug Rabson
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/blist.h>
#include <sys/fcntl.h>
#if defined(__i386__)
#include <sys/imgact_aout.h>
#endif
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <sys/cpuset.h>
#include <sys/vimage.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_file.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_sysproto.h>
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_misc.h>
#ifdef __i386__
#include <machine/cputypes.h>
#endif
#define BSD_TO_LINUX_SIGNAL(sig) \
(((sig) <= LINUX_SIGTBLSZ) ? bsd_to_linux_signal[_SIG_IDX(sig)] : sig)
static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
RLIMIT_MEMLOCK, RLIMIT_AS
};
struct l_sysinfo {
l_long uptime; /* Seconds since boot */
l_ulong loads[3]; /* 1, 5, and 15 minute load averages */
#define LINUX_SYSINFO_LOADS_SCALE 65536
l_ulong totalram; /* Total usable main memory size */
l_ulong freeram; /* Available memory size */
l_ulong sharedram; /* Amount of shared memory */
l_ulong bufferram; /* Memory used by buffers */
l_ulong totalswap; /* Total swap space size */
l_ulong freeswap; /* swap space still available */
l_ushort procs; /* Number of current processes */
l_ushort pads;
l_ulong totalbig;
l_ulong freebig;
l_uint mem_unit;
char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */
};
int
linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
{
struct l_sysinfo sysinfo;
vm_object_t object;
int i, j;
struct timespec ts;
getnanouptime(&ts);
if (ts.tv_nsec != 0)
ts.tv_sec++;
sysinfo.uptime = ts.tv_sec;
/* Use the information from the mib to get our load averages */
for (i = 0; i < 3; i++)
sysinfo.loads[i] = averunnable.ldavg[i] *
LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
sysinfo.totalram = physmem * PAGE_SIZE;
sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE;
sysinfo.sharedram = 0;
mtx_lock(&vm_object_list_mtx);
TAILQ_FOREACH(object, &vm_object_list, object_list)
if (object->shadow_count > 1)
sysinfo.sharedram += object->resident_page_count;
mtx_unlock(&vm_object_list_mtx);
sysinfo.sharedram *= PAGE_SIZE;
sysinfo.bufferram = 0;
swap_pager_status(&i, &j);
sysinfo.totalswap = i * PAGE_SIZE;
sysinfo.freeswap = (i - j) * PAGE_SIZE;
sysinfo.procs = nprocs;
/* The following are only present in newer Linux kernels. */
sysinfo.totalbig = 0;
sysinfo.freebig = 0;
sysinfo.mem_unit = 1;
return copyout(&sysinfo, args->info, sizeof(sysinfo));
}
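/*
* Note on the load averages above: Linux reports them as 16.16
* fixed-point values (LINUX_SYSINFO_LOADS_SCALE == 65536), so the
* kernel's ldavg/fscale fixed-point numbers are rescaled; a load of
* 0.50, for instance, becomes 32768.
*/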
int
linux_alarm(struct thread *td, struct linux_alarm_args *args)
{
struct itimerval it, old_it;
u_int secs;
int error;
#ifdef DEBUG
if (ldebug(alarm))
printf(ARGS(alarm, "%u"), args->secs);
#endif
secs = args->secs;
if (secs > INT_MAX)
secs = INT_MAX;
it.it_value.tv_sec = (long) secs;
it.it_value.tv_usec = 0;
it.it_interval.tv_sec = 0;
it.it_interval.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
if (error)
return (error);
if (timevalisset(&old_it.it_value)) {
if (old_it.it_value.tv_usec != 0)
old_it.it_value.tv_sec++;
td->td_retval[0] = old_it.it_value.tv_sec;
}
return (0);
}
int
linux_brk(struct thread *td, struct linux_brk_args *args)
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old;
struct obreak_args /* {
char * nsize;
} */ tmp;
#ifdef DEBUG
if (ldebug(brk))
printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend);
#endif
old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
new = (vm_offset_t)args->dsend;
tmp.nsize = (char *)new;
if (((caddr_t)new > vm->vm_daddr) && !obreak(td, &tmp))
td->td_retval[0] = (long)new;
else
td->td_retval[0] = (long)old;
return 0;
}
#if defined(__i386__)
/* XXX: what about amd64/linux32? */
int
linux_uselib(struct thread *td, struct linux_uselib_args *args)
{
struct nameidata ni;
struct vnode *vp;
struct exec *a_out;
struct vattr attr;
vm_offset_t vmaddr;
unsigned long file_offset;
vm_offset_t buffer;
unsigned long bss_size;
char *library;
int error;
int locked, vfslocked;
LCONVPATHEXIST(td, args->library, &library);
#ifdef DEBUG
if (ldebug(uselib))
printf(ARGS(uselib, "%s"), library);
#endif
a_out = NULL;
vfslocked = 0;
locked = 0;
vp = NULL;
NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_SYSSPACE, library, td);
error = namei(&ni);
LFREEPATH(library);
if (error)
goto cleanup;
vp = ni.ni_vp;
vfslocked = NDHASGIANT(&ni);
NDFREE(&ni, NDF_ONLY_PNBUF);
/*
* From here on down, we have a locked vnode that must be unlocked.
* XXX: The code below largely duplicates exec_check_permissions().
*/
locked = 1;
/* Writable? */
if (vp->v_writecount) {
error = ETXTBSY;
goto cleanup;
}
/* Executable? */
error = VOP_GETATTR(vp, &attr, td->td_ucred);
if (error)
goto cleanup;
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
/* EACCES is what exec(2) returns. */
error = ENOEXEC;
goto cleanup;
}
/* Sensible size? */
if (attr.va_size == 0) {
error = ENOEXEC;
goto cleanup;
}
/* Can we access it? */
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
goto cleanup;
/*
* XXX: This should use vn_open() so that it is properly authorized,
* and to reduce code redundancy all over the place here.
* XXX: Not really, it duplicates far more of exec_check_permissions()
* than vn_open().
*/
#ifdef MAC
error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
if (error)
goto cleanup;
#endif
error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
if (error)
goto cleanup;
/* Pull in executable header into kernel_map */
error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
if (error)
goto cleanup;
/* Is it a Linux binary? */
if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
error = ENOEXEC;
goto cleanup;
}
/*
* While we are here, we should REALLY do some more checks
*/
/* Set file/virtual offset based on a.out variant. */
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413: /* ZMAGIC */
file_offset = 1024;
break;
case 0314: /* QMAGIC */
file_offset = 0;
break;
default:
error = ENOEXEC;
goto cleanup;
}
bss_size = round_page(a_out->a_bss);
/* Check various fields in header for validity/bounds. */
if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
error = ENOEXEC;
goto cleanup;
}
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > attr.va_size) {
error = EFAULT;
goto cleanup;
}
/*
* text/data/bss must not exceed limits.
* XXX - this is not complete; it should check current usage PLUS
* the resources needed by this library.
*/
PROC_LOCK(td->td_proc);
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA)) {
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto cleanup;
}
PROC_UNLOCK(td->td_proc);
/*
* Prevent more writers.
* XXX: Note that if any of the VM operations fail below we don't
* clear this flag.
*/
vp->v_vflag |= VV_TEXT;
/*
* Lock no longer needed
*/
locked = 0;
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
/*
* Check whether file_offset is page aligned. Currently we cannot
* handle misaligned file offsets, and so we read in the entire image
* (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("uselib: Non page aligned binary %lu\n", file_offset);
#endif
/* Map text+data read/write/execute */
/* a_entry is the load address and is page aligned */
vmaddr = trunc_page(a_out->a_entry);
/* get anon user mapping, read+write+execute */
error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
&vmaddr, a_out->a_text + a_out->a_data, FALSE, VM_PROT_ALL,
VM_PROT_ALL, 0);
if (error)
goto cleanup;
/* map file into kernel_map */
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp,
trunc_page(file_offset));
if (error)
goto cleanup;
/* copy from kernel VM space to user space */
error = copyout(PTRIN(buffer + file_offset),
(void *)vmaddr, a_out->a_text + a_out->a_data);
/* release temporary kernel space */
vm_map_remove(kernel_map, buffer, buffer +
round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto cleanup;
} else {
#ifdef DEBUG
printf("uselib: Page aligned binary %lu\n", file_offset);
#endif
/*
* for QMAGIC, a_entry is 20 bytes beyond the load address
* to skip the executable header
*/
vmaddr = trunc_page(a_out->a_entry);
/*
* Map it all into the process's space as a single
* copy-on-write "data" segment.
*/
error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
if (error)
goto cleanup;
}
#ifdef DEBUG
printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
((long *)vmaddr)[1]);
#endif
if (bss_size != 0) {
/* Calculate BSS start address */
vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
a_out->a_data;
/* allocate some 'anon' space */
error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
&vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto cleanup;
}
cleanup:
/* Unlock vnode if needed */
if (locked) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
}
/* Release the kernel mapping. */
if (a_out)
vm_map_remove(kernel_map, (vm_offset_t)a_out,
(vm_offset_t)a_out + PAGE_SIZE);
return error;
}
#endif /* __i386__ */
int
linux_select(struct thread *td, struct linux_select_args *args)
{
l_timeval ltv;
struct timeval tv0, tv1, utv, *tvp;
int error;
#ifdef DEBUG
if (ldebug(select))
printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds,
(void *)args->readfds, (void *)args->writefds,
(void *)args->exceptfds, (void *)args->timeout);
#endif
/*
* Store current time for computation of the amount of
* time left.
*/
if (args->timeout) {
if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
goto select_out;
utv.tv_sec = ltv.tv_sec;
utv.tv_usec = ltv.tv_usec;
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("incoming timeout (%jd/%ld)"),
(intmax_t)utv.tv_sec, utv.tv_usec);
#endif
if (itimerfix(&utv)) {
/*
* The timeval was invalid.  Convert it to something
* valid that behaves as it would under Linux.
*/
utv.tv_sec += utv.tv_usec / 1000000;
utv.tv_usec %= 1000000;
if (utv.tv_usec < 0) {
utv.tv_sec -= 1;
utv.tv_usec += 1000000;
}
if (utv.tv_sec < 0)
timevalclear(&utv);
}
microtime(&tv0);
tvp = &utv;
} else
tvp = NULL;
error = kern_select(td, args->nfds, args->readfds, args->writefds,
args->exceptfds, tvp);
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("real select returns %d"), error);
#endif
if (error)
goto select_out;
if (args->timeout) {
if (td->td_retval[0]) {
/*
* Compute how much time was left of the timeout,
* by subtracting the current time and the time
* before we started the call, and subtracting
* that result from the user-supplied value.
*/
microtime(&tv1);
timevalsub(&tv1, &tv0);
timevalsub(&utv, &tv1);
if (utv.tv_sec < 0)
timevalclear(&utv);
} else
timevalclear(&utv);
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("outgoing timeout (%jd/%ld)"),
(intmax_t)utv.tv_sec, utv.tv_usec);
#endif
ltv.tv_sec = utv.tv_sec;
ltv.tv_usec = utv.tv_usec;
if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
goto select_out;
}
select_out:
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("select_out -> %d"), error);
#endif
return error;
}
int
linux_mremap(struct thread *td, struct linux_mremap_args *args)
{
struct munmap_args /* {
void *addr;
size_t len;
} */ bsd_args;
int error = 0;
#ifdef DEBUG
if (ldebug(mremap))
printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"),
(void *)(uintptr_t)args->addr,
(unsigned long)args->old_len,
(unsigned long)args->new_len,
(unsigned long)args->flags);
#endif
if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
td->td_retval[0] = 0;
return (EINVAL);
}
/*
* Check for the page alignment.
* Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
*/
if (args->addr & PAGE_MASK) {
td->td_retval[0] = 0;
return (EINVAL);
}
args->new_len = round_page(args->new_len);
args->old_len = round_page(args->old_len);
if (args->new_len > args->old_len) {
td->td_retval[0] = 0;
return ENOMEM;
}
if (args->new_len < args->old_len) {
bsd_args.addr =
(caddr_t)((uintptr_t)args->addr + args->new_len);
bsd_args.len = args->old_len - args->new_len;
error = munmap(td, &bsd_args);
}
td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
return error;
}
#define LINUX_MS_ASYNC 0x0001
#define LINUX_MS_INVALIDATE 0x0002
#define LINUX_MS_SYNC 0x0004
int
linux_msync(struct thread *td, struct linux_msync_args *args)
{
struct msync_args bsd_args;
bsd_args.addr = (caddr_t)(uintptr_t)args->addr;
bsd_args.len = (uintptr_t)args->len;
bsd_args.flags = args->fl & ~LINUX_MS_SYNC;
return msync(td, &bsd_args);
}
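/*
* Linux MS_ASYNC (0x1) and MS_INVALIDATE (0x2) happen to share their
* values with FreeBSD, so they pass straight through; LINUX_MS_SYNC
* (0x4) is masked off since it has no FreeBSD bit, synchronous
* operation being the native default.
*/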
int
linux_time(struct thread *td, struct linux_time_args *args)
{
struct timeval tv;
l_time_t tm;
int error;
#ifdef DEBUG
if (ldebug(time))
printf(ARGS(time, "*"));
#endif
microtime(&tv);
tm = tv.tv_sec;
if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
return error;
td->td_retval[0] = tm;
return 0;
}
struct l_times_argv {
l_long tms_utime;
l_long tms_stime;
l_long tms_cutime;
l_long tms_cstime;
};
#define CLK_TCK 100 /* Linux uses 100 */
#define CONVTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
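/*
* Example: with CLK_TCK = 100, a utime of 1.507370 seconds converts to
* 1 * 100 + 507370 / 10000 = 150 ticks.
*/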
int
linux_times(struct thread *td, struct linux_times_args *args)
{
struct timeval tv, utime, stime, cutime, cstime;
struct l_times_argv tms;
struct proc *p;
int error;
#ifdef DEBUG
if (ldebug(times))
printf(ARGS(times, "*"));
#endif
if (args->buf != NULL) {
p = td->td_proc;
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &utime, &stime);
PROC_SUNLOCK(p);
calccru(p, &cutime, &cstime);
PROC_UNLOCK(p);
tms.tms_utime = CONVTCK(utime);
tms.tms_stime = CONVTCK(stime);
tms.tms_cutime = CONVTCK(cutime);
tms.tms_cstime = CONVTCK(cstime);
if ((error = copyout(&tms, args->buf, sizeof(tms))))
return error;
}
microuptime(&tv);
td->td_retval[0] = (int)CONVTCK(tv);
return 0;
}
int
linux_newuname(struct thread *td, struct linux_newuname_args *args)
{
+ INIT_VPROCG(TD_TO_VPROCG(td));
struct l_new_utsname utsname;
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
char *p;
#ifdef DEBUG
if (ldebug(newuname))
printf(ARGS(newuname, "*"));
#endif
linux_get_osname(td, osname);
linux_get_osrelease(td, osrelease);
bzero(&utsname, sizeof(utsname));
strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
for (p = utsname.version; *p != '\0'; ++p)
if (*p == '\n') {
*p = '\0';
break;
}
#ifdef __i386__
{
const char *class;
switch (cpu_class) {
case CPUCLASS_686:
class = "i686";
break;
case CPUCLASS_586:
class = "i586";
break;
case CPUCLASS_486:
class = "i486";
break;
default:
class = "i386";
}
strlcpy(utsname.machine, class, LINUX_MAX_UTSNAME);
}
#elif defined(__amd64__) /* XXX: Linux can change 'personality'. */
#ifdef COMPAT_LINUX32
strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
#else
strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
#endif /* COMPAT_LINUX32 */
#else /* something other than i386 or amd64 - assume we and Linux agree */
strlcpy(utsname.machine, machine, LINUX_MAX_UTSNAME);
#endif /* __i386__ */
mtx_lock(&hostname_mtx);
strlcpy(utsname.domainname, V_domainname, LINUX_MAX_UTSNAME);
mtx_unlock(&hostname_mtx);
return (copyout(&utsname, args->buf, sizeof(utsname)));
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
struct l_utimbuf {
l_time_t l_actime;
l_time_t l_modtime;
};
int
linux_utime(struct thread *td, struct linux_utime_args *args)
{
struct timeval tv[2], *tvp;
struct l_utimbuf lut;
char *fname;
int error;
LCONVPATHEXIST(td, args->fname, &fname);
#ifdef DEBUG
if (ldebug(utime))
printf(ARGS(utime, "%s, *"), fname);
#endif
if (args->times) {
if ((error = copyin(args->times, &lut, sizeof lut))) {
LFREEPATH(fname);
return error;
}
tv[0].tv_sec = lut.l_actime;
tv[0].tv_usec = 0;
tv[1].tv_sec = lut.l_modtime;
tv[1].tv_usec = 0;
tvp = tv;
} else
tvp = NULL;
error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
int
linux_utimes(struct thread *td, struct linux_utimes_args *args)
{
l_timeval ltv[2];
struct timeval tv[2], *tvp = NULL;
char *fname;
int error;
LCONVPATHEXIST(td, args->fname, &fname);
#ifdef DEBUG
if (ldebug(utimes))
printf(ARGS(utimes, "%s, *"), fname);
#endif
if (args->tptr != NULL) {
if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = ltv[0].tv_sec;
tv[0].tv_usec = ltv[0].tv_usec;
tv[1].tv_sec = ltv[1].tv_sec;
tv[1].tv_usec = ltv[1].tv_usec;
tvp = tv;
}
error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
int
linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
{
l_timeval ltv[2];
struct timeval tv[2], *tvp = NULL;
char *fname;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
#ifdef DEBUG
if (ldebug(futimesat))
printf(ARGS(futimesat, "%s, *"), fname);
#endif
if (args->utimes != NULL) {
if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = ltv[0].tv_sec;
tv[0].tv_usec = ltv[0].tv_usec;
tv[1].tv_sec = ltv[1].tv_sec;
tv[1].tv_usec = ltv[1].tv_usec;
tvp = tv;
}
error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
#define __WCLONE 0x80000000
int
linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
{
int error, options, tmpstat;
#ifdef DEBUG
if (ldebug(waitpid))
printf(ARGS(waitpid, "%d, %p, %d"),
args->pid, (void *)args->status, args->options);
#endif
/*
 * This check is needed here because kern_wait()'s own option test
 * cannot reject invalid Linux flags once we rewrite the options
 * below.
 */

if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE))
return (EINVAL);
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but translate it explicitly to be safe */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
error = kern_wait(td, args->pid, &tmpstat, options, NULL);
if (error)
return error;
if (args->status) {
tmpstat &= 0xffff;
if (WIFSIGNALED(tmpstat))
tmpstat = (tmpstat & 0xffffff80) |
BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat));
else if (WIFSTOPPED(tmpstat))
tmpstat = (tmpstat & 0xffff00ff) |
(BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8);
return copyout(&tmpstat, args->status, sizeof(int));
}
return 0;
}
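/*
 * For illustration: the wait status handed back to Linux keeps the
 * shared layout (exit code or stop signal in bits 8-15, termination
 * signal in bits 0-6), with only the signal numbers translated. A
 * child killed by SIGTERM (15 on both systems) thus yields a status
 * of 15, while a stopped child carries the translated stop signal in
 * the second byte.
 */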
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
int error, options, tmpstat;
struct rusage ru, *rup;
struct proc *p;
#ifdef DEBUG
if (ldebug(wait4))
printf(ARGS(wait4, "%d, %p, %d, %p"),
args->pid, (void *)args->status, args->options,
(void *)args->rusage);
#endif
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but translate it explicitly to be safe */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
if (args->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = kern_wait(td, args->pid, &tmpstat, options, rup);
if (error)
return error;
p = td->td_proc;
PROC_LOCK(p);
sigqueue_delete(&p->p_sigqueue, SIGCHLD);
PROC_UNLOCK(p);
if (args->status) {
tmpstat &= 0xffff;
if (WIFSIGNALED(tmpstat))
tmpstat = (tmpstat & 0xffffff80) |
BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat));
else if (WIFSTOPPED(tmpstat))
tmpstat = (tmpstat & 0xffff00ff) |
(BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8);
error = copyout(&tmpstat, args->status, sizeof(int));
}
if (args->rusage != NULL && error == 0)
error = copyout(&ru, args->rusage, sizeof(ru));
return (error);
}
int
linux_mknod(struct thread *td, struct linux_mknod_args *args)
{
char *path;
int error;
LCONVPATHCREAT(td, args->path, &path);
#ifdef DEBUG
if (ldebug(mknod))
printf(ARGS(mknod, "%s, %d, %d"), path, args->mode, args->dev);
#endif
switch (args->mode & S_IFMT) {
case S_IFIFO:
case S_IFSOCK:
error = kern_mkfifo(td, path, UIO_SYSSPACE, args->mode);
break;
case S_IFCHR:
case S_IFBLK:
error = kern_mknod(td, path, UIO_SYSSPACE, args->mode,
args->dev);
break;
case S_IFDIR:
error = EPERM;
break;
case 0:
args->mode |= S_IFREG;
/* FALLTHROUGH */
case S_IFREG:
error = kern_open(td, path, UIO_SYSSPACE,
O_WRONLY | O_CREAT | O_TRUNC, args->mode);
if (error == 0)
kern_close(td, td->td_retval[0]);
break;
default:
error = EINVAL;
break;
}
LFREEPATH(path);
return (error);
}
int
linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(mknodat))
printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev);
#endif
switch (args->mode & S_IFMT) {
case S_IFIFO:
case S_IFSOCK:
error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
break;
case S_IFCHR:
case S_IFBLK:
error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
args->dev);
break;
case S_IFDIR:
error = EPERM;
break;
case 0:
args->mode |= S_IFREG;
/* FALLTHROUGH */
case S_IFREG:
error = kern_openat(td, dfd, path, UIO_SYSSPACE,
O_WRONLY | O_CREAT | O_TRUNC, args->mode);
if (error == 0)
kern_close(td, td->td_retval[0]);
break;
default:
error = EINVAL;
break;
}
LFREEPATH(path);
return (error);
}
/*
* UGH! This is just about the dumbest idea I've ever heard!!
*/
int
linux_personality(struct thread *td, struct linux_personality_args *args)
{
#ifdef DEBUG
if (ldebug(personality))
printf(ARGS(personality, "%lu"), (unsigned long)args->per);
#endif
if (args->per != 0)
return EINVAL;
/* Yes Jim, it's still a Linux... */
td->td_retval[0] = 0;
return 0;
}
struct l_itimerval {
l_timeval it_interval;
l_timeval it_value;
};
#define B2L_ITIMERVAL(bip, lip) \
(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \
(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \
(bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \
(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;
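/*
 * Note that B2L_ITIMERVAL() is a plain field-by-field copy, so it
 * works in either direction; the code below uses it both ways:
 *
 *	B2L_ITIMERVAL(&aitv, &ls);	// Linux -> BSD before kern_setitimer()
 *	B2L_ITIMERVAL(&ls, &oitv);	// BSD -> Linux before copyout()
 */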
int
linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
{
int error;
struct l_itimerval ls;
struct itimerval aitv, oitv;
#ifdef DEBUG
if (ldebug(setitimer))
printf(ARGS(setitimer, "%p, %p"),
(void *)uap->itv, (void *)uap->oitv);
#endif
if (uap->itv == NULL) {
uap->itv = uap->oitv;
return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
}
error = copyin(uap->itv, &ls, sizeof(ls));
if (error != 0)
return (error);
B2L_ITIMERVAL(&aitv, &ls);
#ifdef DEBUG
if (ldebug(setitimer)) {
printf("setitimer: value: sec: %jd, usec: %ld\n",
(intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec);
printf("setitimer: interval: sec: %jd, usec: %ld\n",
(intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec);
}
#endif
error = kern_setitimer(td, uap->which, &aitv, &oitv);
if (error != 0 || uap->oitv == NULL)
return (error);
B2L_ITIMERVAL(&ls, &oitv);
return (copyout(&ls, uap->oitv, sizeof(ls)));
}
int
linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
{
int error;
struct l_itimerval ls;
struct itimerval aitv;
#ifdef DEBUG
if (ldebug(getitimer))
printf(ARGS(getitimer, "%p"), (void *)uap->itv);
#endif
error = kern_getitimer(td, uap->which, &aitv);
if (error != 0)
return (error);
B2L_ITIMERVAL(&ls, &aitv);
return (copyout(&ls, uap->itv, sizeof(ls)));
}
int
linux_nice(struct thread *td, struct linux_nice_args *args)
{
struct setpriority_args bsd_args;
bsd_args.which = PRIO_PROCESS;
bsd_args.who = 0; /* current process */
bsd_args.prio = args->inc;
return setpriority(td, &bsd_args);
}
int
linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
{
struct ucred *newcred, *oldcred;
l_gid_t linux_gidset[NGROUPS];
gid_t *bsd_gidset;
int ngrp, error;
struct proc *p;
ngrp = args->gidsetsize;
if (ngrp < 0 || ngrp >= NGROUPS)
return (EINVAL);
error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
if (error)
return (error);
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
/*
* cr_groups[0] holds egid. Setting the whole set from
* the supplied set will cause egid to be changed too.
* Keep cr_groups[0] unchanged to prevent that.
*/
if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
crcopy(newcred, oldcred);
if (ngrp > 0) {
newcred->cr_ngroups = ngrp + 1;
bsd_gidset = newcred->cr_groups;
ngrp--;
while (ngrp >= 0) {
bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
ngrp--;
}
} else
newcred->cr_ngroups = 1;
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
}
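/*
 * A sketch of the credential group layout that linux_setgroups() above
 * and linux_getgroups() below rely on:
 *
 *	cr_groups[0]		egid (not part of the Linux set)
 *	cr_groups[1..ngrp]	supplementary groups from linux_gidset[]
 *
 * so cr_ngroups is always the Linux set size plus one.
 */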
int
linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
{
struct ucred *cred;
l_gid_t linux_gidset[NGROUPS];
gid_t *bsd_gidset;
int bsd_gidsetsz, ngrp, error;
cred = td->td_ucred;
bsd_gidset = cred->cr_groups;
bsd_gidsetsz = cred->cr_ngroups - 1;
/*
* cr_groups[0] holds egid. Returning the whole set
* here will cause a duplicate. Exclude cr_groups[0]
* to prevent that.
*/
if ((ngrp = args->gidsetsize) == 0) {
td->td_retval[0] = bsd_gidsetsz;
return (0);
}
if (ngrp < bsd_gidsetsz)
return (EINVAL);
ngrp = 0;
while (ngrp < bsd_gidsetsz) {
linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
ngrp++;
}
if ((error = copyout(linux_gidset, args->grouplist,
ngrp * sizeof(l_gid_t))))
return (error);
td->td_retval[0] = ngrp;
return (0);
}
int
linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
{
struct rlimit bsd_rlim;
struct l_rlimit rlim;
u_int which;
int error;
#ifdef DEBUG
if (ldebug(setrlimit))
printf(ARGS(setrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
error = copyin(args->rlim, &rlim, sizeof(rlim));
if (error)
return (error);
bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
return (kern_setrlimit(td, which, &bsd_rlim));
}
int
linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
{
struct l_rlimit rlim;
struct proc *p = td->td_proc;
struct rlimit bsd_rlim;
u_int which;
#ifdef DEBUG
if (ldebug(old_getrlimit))
printf(ARGS(old_getrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
PROC_LOCK(p);
lim_rlimit(p, which, &bsd_rlim);
PROC_UNLOCK(p);
#ifdef COMPAT_LINUX32
rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
if (rlim.rlim_cur == UINT_MAX)
rlim.rlim_cur = INT_MAX;
rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
if (rlim.rlim_max == UINT_MAX)
rlim.rlim_max = INT_MAX;
#else
rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
if (rlim.rlim_cur == ULONG_MAX)
rlim.rlim_cur = LONG_MAX;
rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
if (rlim.rlim_max == ULONG_MAX)
rlim.rlim_max = LONG_MAX;
#endif
return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
int
linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
{
struct l_rlimit rlim;
struct proc *p = td->td_proc;
struct rlimit bsd_rlim;
u_int which;
#ifdef DEBUG
if (ldebug(getrlimit))
printf(ARGS(getrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
PROC_LOCK(p);
lim_rlimit(p, which, &bsd_rlim);
PROC_UNLOCK(p);
rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
int
linux_sched_setscheduler(struct thread *td,
struct linux_sched_setscheduler_args *args)
{
struct sched_setscheduler_args bsd;
#ifdef DEBUG
if (ldebug(sched_setscheduler))
printf(ARGS(sched_setscheduler, "%d, %d, %p"),
args->pid, args->policy, (const void *)args->param);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return EINVAL;
}
bsd.pid = args->pid;
bsd.param = (struct sched_param *)args->param;
return sched_setscheduler(td, &bsd);
}
int
linux_sched_getscheduler(struct thread *td,
struct linux_sched_getscheduler_args *args)
{
struct sched_getscheduler_args bsd;
int error;
#ifdef DEBUG
if (ldebug(sched_getscheduler))
printf(ARGS(sched_getscheduler, "%d"), args->pid);
#endif
bsd.pid = args->pid;
error = sched_getscheduler(td, &bsd);
switch (td->td_retval[0]) {
case SCHED_OTHER:
td->td_retval[0] = LINUX_SCHED_OTHER;
break;
case SCHED_FIFO:
td->td_retval[0] = LINUX_SCHED_FIFO;
break;
case SCHED_RR:
td->td_retval[0] = LINUX_SCHED_RR;
break;
}
return error;
}
int
linux_sched_get_priority_max(struct thread *td,
struct linux_sched_get_priority_max_args *args)
{
struct sched_get_priority_max_args bsd;
#ifdef DEBUG
if (ldebug(sched_get_priority_max))
printf(ARGS(sched_get_priority_max, "%d"), args->policy);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return EINVAL;
}
return sched_get_priority_max(td, &bsd);
}
int
linux_sched_get_priority_min(struct thread *td,
struct linux_sched_get_priority_min_args *args)
{
struct sched_get_priority_min_args bsd;
#ifdef DEBUG
if (ldebug(sched_get_priority_min))
printf(ARGS(sched_get_priority_min, "%d"), args->policy);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return EINVAL;
}
return sched_get_priority_min(td, &bsd);
}
#define REBOOT_CAD_ON 0x89abcdef
#define REBOOT_CAD_OFF 0
#define REBOOT_HALT 0xcdef0123
#define REBOOT_RESTART 0x01234567
#define REBOOT_RESTART2 0xA1B2C3D4
#define REBOOT_POWEROFF 0x4321FEDC
#define REBOOT_MAGIC1 0xfee1dead
#define REBOOT_MAGIC2 0x28121969
#define REBOOT_MAGIC2A 0x05121996
#define REBOOT_MAGIC2B 0x16041998
int
linux_reboot(struct thread *td, struct linux_reboot_args *args)
{
struct reboot_args bsd_args;
#ifdef DEBUG
if (ldebug(reboot))
printf(ARGS(reboot, "0x%x"), args->cmd);
#endif
if (args->magic1 != REBOOT_MAGIC1)
return EINVAL;
switch (args->magic2) {
case REBOOT_MAGIC2:
case REBOOT_MAGIC2A:
case REBOOT_MAGIC2B:
break;
default:
return EINVAL;
}
switch (args->cmd) {
case REBOOT_CAD_ON:
case REBOOT_CAD_OFF:
return (priv_check(td, PRIV_REBOOT));
case REBOOT_HALT:
bsd_args.opt = RB_HALT;
break;
case REBOOT_RESTART:
case REBOOT_RESTART2:
bsd_args.opt = 0;
break;
case REBOOT_POWEROFF:
bsd_args.opt = RB_POWEROFF;
break;
default:
return EINVAL;
}
return reboot(td, &bsd_args);
}
/*
 * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify
 * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that
 * are assumed to be preserved. The following lightweight syscalls fix
 * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c.
 *
 * linux_getpid() - MP SAFE
 * linux_getgid() - MP SAFE
 * linux_getuid() - MP SAFE
 */
int
linux_getpid(struct thread *td, struct linux_getpid_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(getpid))
printf(ARGS(getpid, ""));
#endif
if (linux_use26(td)) {
em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getpid: emuldata not found.\n"));
td->td_retval[0] = em->shared->group_pid;
} else {
td->td_retval[0] = td->td_proc->p_pid;
}
return (0);
}
int
linux_gettid(struct thread *td, struct linux_gettid_args *args)
{
#ifdef DEBUG
if (ldebug(gettid))
printf(ARGS(gettid, ""));
#endif
td->td_retval[0] = td->td_proc->p_pid;
return (0);
}
int
linux_getppid(struct thread *td, struct linux_getppid_args *args)
{
struct linux_emuldata *em;
struct proc *p, *pp;
#ifdef DEBUG
if (ldebug(getppid))
printf(ARGS(getppid, ""));
#endif
if (!linux_use26(td)) {
PROC_LOCK(td->td_proc);
td->td_retval[0] = td->td_proc->p_pptr->p_pid;
PROC_UNLOCK(td->td_proc);
return (0);
}
em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getppid: process emuldata not found.\n"));
/* find the group leader */
p = pfind(em->shared->group_pid);
if (p == NULL) {
#ifdef DEBUG
printf(LMSG("parent process not found.\n"));
#endif
return (0);
}
pp = p->p_pptr; /* switch to parent */
PROC_LOCK(pp);
PROC_UNLOCK(p);
/* if it is also a Linux process */
if (pp->p_sysent == &elf_linux_sysvec) {
em = em_find(pp, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getppid: parent emuldata not found.\n"));
td->td_retval[0] = em->shared->group_pid;
} else
td->td_retval[0] = pp->p_pid;
PROC_UNLOCK(pp);
return (0);
}
int
linux_getgid(struct thread *td, struct linux_getgid_args *args)
{
#ifdef DEBUG
if (ldebug(getgid))
printf(ARGS(getgid, ""));
#endif
td->td_retval[0] = td->td_ucred->cr_rgid;
return (0);
}
int
linux_getuid(struct thread *td, struct linux_getuid_args *args)
{
#ifdef DEBUG
if (ldebug(getuid))
printf(ARGS(getuid, ""));
#endif
td->td_retval[0] = td->td_ucred->cr_ruid;
return (0);
}
int
linux_getsid(struct thread *td, struct linux_getsid_args *args)
{
struct getsid_args bsd;
#ifdef DEBUG
if (ldebug(getsid))
printf(ARGS(getsid, "%i"), args->pid);
#endif
bsd.pid = args->pid;
return getsid(td, &bsd);
}
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{
return (ENOSYS);
}
int
linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
{
struct getpriority_args bsd_args;
int error;
#ifdef DEBUG
if (ldebug(getpriority))
printf(ARGS(getpriority, "%i, %i"), args->which, args->who);
#endif
bsd_args.which = args->which;
bsd_args.who = args->who;
error = getpriority(td, &bsd_args);
td->td_retval[0] = 20 - td->td_retval[0];
return error;
}
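/*
 * For illustration: the native getpriority() returns the nice value
 * itself (-20..20), while the Linux syscall is expected to return
 * 20 - nice (1..40), with the Linux libc converting it back; hence the
 * adjustment above. A process at nice -20 thus reports 40, and one at
 * nice 19 reports 1.
 */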
int
linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
{
int name[2];
#ifdef DEBUG
if (ldebug(sethostname))
printf(ARGS(sethostname, "*, %i"), args->len);
#endif
name[0] = CTL_KERN;
name[1] = KERN_HOSTNAME;
return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
args->len, 0, 0));
}
int
linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
{
struct linux_emuldata *em, *td_em, *tmp_em;
struct proc *sp;
#ifdef DEBUG
if (ldebug(exit_group))
printf(ARGS(exit_group, "%i"), args->error_code);
#endif
if (linux_use26(td)) {
td_em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(td_em != NULL, ("exit_group: emuldata not found.\n"));
EMUL_SHARED_RLOCK(&emul_shared_lock);
LIST_FOREACH_SAFE(em, &td_em->shared->threads, threads, tmp_em) {
if (em->pid == td_em->pid)
continue;
/* pfind() returns the process locked, or NULL if it is gone. */
sp = pfind(em->pid);
if (sp == NULL)
	continue;
psignal(sp, SIGKILL);
PROC_UNLOCK(sp);
#ifdef DEBUG
printf(LMSG("linux_sys_exit_group: kill PID %d\n"), em->pid);
#endif
}
EMUL_SHARED_RUNLOCK(&emul_shared_lock);
}
/*
 * XXX: we should send a signal to the parent if
 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
 * as it doesn't occur often.
 */
exit1(td, W_EXITCODE(args->error_code, 0));
return (0);
}
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
int error = 0, max_size;
struct proc *p = td->td_proc;
char comm[LINUX_MAX_COMM_LEN];
struct linux_emuldata *em;
int pdeath_signal;
#ifdef DEBUG
if (ldebug(prctl))
printf(ARGS(prctl, "%d, %d, %d, %d, %d"), args->option,
args->arg2, args->arg3, args->arg4, args->arg5);
#endif
switch (args->option) {
case LINUX_PR_SET_PDEATHSIG:
if (!LINUX_SIG_VALID(args->arg2))
return (EINVAL);
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
em->pdeath_signal = args->arg2;
EMUL_UNLOCK(&emul_lock);
break;
case LINUX_PR_GET_PDEATHSIG:
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
pdeath_signal = em->pdeath_signal;
EMUL_UNLOCK(&emul_lock);
error = copyout(&pdeath_signal,
(void *)(register_t)args->arg2,
sizeof(pdeath_signal));
break;
case LINUX_PR_SET_NAME:
/*
* To be on the safe side we need to make sure to not
* overflow the size a linux program expects. We already
* do this here in the copyin, so that we don't need to
* check on copyout.
*/
max_size = MIN(sizeof(comm), sizeof(p->p_comm));
error = copyinstr((void *)(register_t)args->arg2, comm,
max_size, NULL);
/* Linux silently truncates the name if it is too long. */
if (error == ENAMETOOLONG) {
/*
* XXX: copyinstr() isn't documented to populate the
* array completely, so do a copyin() to be on the
* safe side. This should be changed in case
* copyinstr() is changed to guarantee this.
*/
error = copyin((void *)(register_t)args->arg2, comm,
max_size - 1);
comm[max_size - 1] = '\0';
}
if (error)
return (error);
PROC_LOCK(p);
strlcpy(p->p_comm, comm, sizeof(p->p_comm));
PROC_UNLOCK(p);
break;
case LINUX_PR_GET_NAME:
PROC_LOCK(p);
strlcpy(comm, p->p_comm, sizeof(comm));
PROC_UNLOCK(p);
error = copyout(comm, (void *)(register_t)args->arg2,
strlen(comm) + 1);
break;
default:
error = EINVAL;
break;
}
return (error);
}
/*
* Get affinity of a process.
*/
int
linux_sched_getaffinity(struct thread *td,
struct linux_sched_getaffinity_args *args)
{
int error;
struct cpuset_getaffinity_args cga;
#ifdef DEBUG
if (ldebug(sched_getaffinity))
printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid,
args->len);
#endif
cga.level = CPU_LEVEL_WHICH;
cga.which = CPU_WHICH_PID;
cga.id = args->pid;
cga.cpusetsize = sizeof(cpumask_t);
cga.mask = (cpuset_t *) args->user_mask_ptr;
if ((error = cpuset_getaffinity(td, &cga)) == 0)
td->td_retval[0] = sizeof(cpumask_t);
return (error);
}
/*
* Set affinity of a process.
*/
int
linux_sched_setaffinity(struct thread *td,
struct linux_sched_setaffinity_args *args)
{
struct cpuset_setaffinity_args csa;
#ifdef DEBUG
if (ldebug(sched_setaffinity))
printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid,
args->len);
#endif
csa.level = CPU_LEVEL_WHICH;
csa.which = CPU_WHICH_PID;
csa.id = args->pid;
csa.cpusetsize = args->len;
csa.mask = (cpuset_t *) args->user_mask_ptr;
return (cpuset_setaffinity(td, &csa));
}
Index: head/sys/compat/linux/linux_socket.c
===================================================================
--- head/sys/compat/linux/linux_socket.c (revision 183549)
+++ head/sys/compat/linux/linux_socket.c (revision 183550)
@@ -1,1211 +1,1214 @@
/*-
* Copyright (c) 1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* XXX we use functions that might not exist. */
#include "opt_compat.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/uio.h>
#include <sys/syslog.h>
#include <sys/un.h>
#include <sys/vimage.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_socket.h>
#include <compat/linux/linux_util.h>
static int do_sa_get(struct sockaddr **, const struct osockaddr *, int *,
struct malloc_type *);
static int linux_to_bsd_domain(int);
/*
* Reads a linux sockaddr and does any necessary translation.
* Linux sockaddrs don't have a length field, only a family.
*/
static int
linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int len)
{
int osalen = len;
return (do_sa_get(sap, osa, &osalen, M_SONAME));
}
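/*
 * For reference, the layouts being converted here: a Linux sockaddr
 * begins with a 16-bit sa_family, while the BSD struct sockaddr splits
 * that word into an 8-bit sa_len and an 8-bit sa_family:
 *
 *	Linux:	u_short sa_family;  char sa_data[...];
 *	BSD:	u_char  sa_len;  u_char sa_family;  char sa_data[...];
 *
 * do_sa_get() below fills in sa_len and narrows the family field.
 */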
/*
* Copy the osockaddr structure pointed to by osa to kernel, adjust
* family and convert to sockaddr.
*/
static int
do_sa_get(struct sockaddr **sap, const struct osockaddr *osa, int *osalen,
struct malloc_type *mtype)
{
int error=0, bdom;
struct sockaddr *sa;
struct osockaddr *kosa;
int alloclen;
#ifdef INET6
int oldv6size;
struct sockaddr_in6 *sin6;
#endif
if (*osalen < 2 || *osalen > UCHAR_MAX || !osa)
return (EINVAL);
alloclen = *osalen;
#ifdef INET6
oldv6size = 0;
/*
* Check for old (pre-RFC2553) sockaddr_in6. We may accept it
* if it's a v4-mapped address, so reserve the proper space
* for it.
*/
if (alloclen == sizeof (struct sockaddr_in6) - sizeof (u_int32_t)) {
alloclen = sizeof (struct sockaddr_in6);
oldv6size = 1;
}
#endif
MALLOC(kosa, struct osockaddr *, alloclen, mtype, M_WAITOK);
if ((error = copyin(osa, kosa, *osalen)))
goto out;
bdom = linux_to_bsd_domain(kosa->sa_family);
if (bdom == -1) {
error = EINVAL;
goto out;
}
#ifdef INET6
/*
 * Older Linux IPv6 code uses the obsolete RFC2133 struct sockaddr_in6,
 * which lacks the scope id that the RFC2553 version carries. If we
 * detect that situation, reject the address and log a message.
 *
 * Still accept addresses for which the scope id is not used.
 */
if (oldv6size && bdom == AF_INET6) {
sin6 = (struct sockaddr_in6 *)kosa;
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ||
(!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
!IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) &&
!IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) &&
!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) &&
!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) {
sin6->sin6_scope_id = 0;
} else {
log(LOG_DEBUG,
"obsolete pre-RFC2553 sockaddr_in6 rejected\n");
error = EINVAL;
goto out;
}
} else
#endif
if (bdom == AF_INET)
alloclen = sizeof(struct sockaddr_in);
sa = (struct sockaddr *) kosa;
sa->sa_family = bdom;
sa->sa_len = alloclen;
*sap = sa;
*osalen = alloclen;
return (0);
out:
FREE(kosa, mtype);
return (error);
}
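/*
 * A worked example of the old-sockaddr handling above, assuming the
 * usual sizes: the pre-RFC2553 sockaddr_in6 lacked the trailing 32-bit
 * sin6_scope_id, so a 24-byte address from an old Linux binary is
 * copied into a 28-byte buffer and, if acceptable, its sin6_scope_id
 * is zeroed rather than left uninitialized.
 */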
static int
linux_to_bsd_domain(int domain)
{
switch (domain) {
case LINUX_AF_UNSPEC:
return (AF_UNSPEC);
case LINUX_AF_UNIX:
return (AF_LOCAL);
case LINUX_AF_INET:
return (AF_INET);
case LINUX_AF_INET6:
return (AF_INET6);
case LINUX_AF_AX25:
return (AF_CCITT);
case LINUX_AF_IPX:
return (AF_IPX);
case LINUX_AF_APPLETALK:
return (AF_APPLETALK);
}
return (-1);
}
static int
bsd_to_linux_domain(int domain)
{
switch (domain) {
case AF_UNSPEC:
return (LINUX_AF_UNSPEC);
case AF_LOCAL:
return (LINUX_AF_UNIX);
case AF_INET:
return (LINUX_AF_INET);
case AF_INET6:
return (LINUX_AF_INET6);
case AF_CCITT:
return (LINUX_AF_AX25);
case AF_IPX:
return (LINUX_AF_IPX);
case AF_APPLETALK:
return (LINUX_AF_APPLETALK);
}
return (-1);
}
static int
linux_to_bsd_sockopt_level(int level)
{
switch (level) {
case LINUX_SOL_SOCKET:
return (SOL_SOCKET);
}
return (level);
}
static int
bsd_to_linux_sockopt_level(int level)
{
switch (level) {
case SOL_SOCKET:
return (LINUX_SOL_SOCKET);
}
return (level);
}
static int
linux_to_bsd_ip_sockopt(int opt)
{
switch (opt) {
case LINUX_IP_TOS:
return (IP_TOS);
case LINUX_IP_TTL:
return (IP_TTL);
case LINUX_IP_OPTIONS:
return (IP_OPTIONS);
case LINUX_IP_MULTICAST_IF:
return (IP_MULTICAST_IF);
case LINUX_IP_MULTICAST_TTL:
return (IP_MULTICAST_TTL);
case LINUX_IP_MULTICAST_LOOP:
return (IP_MULTICAST_LOOP);
case LINUX_IP_ADD_MEMBERSHIP:
return (IP_ADD_MEMBERSHIP);
case LINUX_IP_DROP_MEMBERSHIP:
return (IP_DROP_MEMBERSHIP);
case LINUX_IP_HDRINCL:
return (IP_HDRINCL);
}
return (-1);
}
static int
linux_to_bsd_so_sockopt(int opt)
{
switch (opt) {
case LINUX_SO_DEBUG:
return (SO_DEBUG);
case LINUX_SO_REUSEADDR:
return (SO_REUSEADDR);
case LINUX_SO_TYPE:
return (SO_TYPE);
case LINUX_SO_ERROR:
return (SO_ERROR);
case LINUX_SO_DONTROUTE:
return (SO_DONTROUTE);
case LINUX_SO_BROADCAST:
return (SO_BROADCAST);
case LINUX_SO_SNDBUF:
return (SO_SNDBUF);
case LINUX_SO_RCVBUF:
return (SO_RCVBUF);
case LINUX_SO_KEEPALIVE:
return (SO_KEEPALIVE);
case LINUX_SO_OOBINLINE:
return (SO_OOBINLINE);
case LINUX_SO_LINGER:
return (SO_LINGER);
case LINUX_SO_PEERCRED:
return (LOCAL_PEERCRED);
case LINUX_SO_RCVLOWAT:
return (SO_RCVLOWAT);
case LINUX_SO_SNDLOWAT:
return (SO_SNDLOWAT);
case LINUX_SO_RCVTIMEO:
return (SO_RCVTIMEO);
case LINUX_SO_SNDTIMEO:
return (SO_SNDTIMEO);
case LINUX_SO_TIMESTAMP:
return (SO_TIMESTAMP);
case LINUX_SO_ACCEPTCONN:
return (SO_ACCEPTCONN);
}
return (-1);
}
static int
linux_to_bsd_msg_flags(int flags)
{
int ret_flags = 0;
if (flags & LINUX_MSG_OOB)
ret_flags |= MSG_OOB;
if (flags & LINUX_MSG_PEEK)
ret_flags |= MSG_PEEK;
if (flags & LINUX_MSG_DONTROUTE)
ret_flags |= MSG_DONTROUTE;
if (flags & LINUX_MSG_CTRUNC)
ret_flags |= MSG_CTRUNC;
if (flags & LINUX_MSG_TRUNC)
ret_flags |= MSG_TRUNC;
if (flags & LINUX_MSG_DONTWAIT)
ret_flags |= MSG_DONTWAIT;
if (flags & LINUX_MSG_EOR)
ret_flags |= MSG_EOR;
if (flags & LINUX_MSG_WAITALL)
ret_flags |= MSG_WAITALL;
if (flags & LINUX_MSG_NOSIGNAL)
ret_flags |= MSG_NOSIGNAL;
#if 0 /* not handled */
if (flags & LINUX_MSG_PROXY)
;
if (flags & LINUX_MSG_FIN)
;
if (flags & LINUX_MSG_SYN)
;
if (flags & LINUX_MSG_CONFIRM)
;
if (flags & LINUX_MSG_RST)
;
if (flags & LINUX_MSG_ERRQUEUE)
;
#endif
return ret_flags;
}
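/*
 * A minimal usage sketch: the translation above is purely bit-by-bit,
 * so a Linux caller passing MSG_PEEK | MSG_DONTWAIT gets the
 * corresponding native bits:
 *
 *	linux_to_bsd_msg_flags(LINUX_MSG_PEEK | LINUX_MSG_DONTWAIT)
 *		== (MSG_PEEK | MSG_DONTWAIT)
 *
 * Unmapped Linux flags are silently dropped.
 */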
/*
* If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the
* native syscall will fault. Thus, we don't really need to check the
* return values for these functions.
*/
static int
bsd_to_linux_sockaddr(struct sockaddr *arg)
{
struct sockaddr sa;
size_t sa_len = sizeof(struct sockaddr);
int error;
if ((error = copyin(arg, &sa, sa_len)))
return (error);
*(u_short *)&sa = sa.sa_family;
error = copyout(&sa, arg, sa_len);
return (error);
}
static int
linux_to_bsd_sockaddr(struct sockaddr *arg, int len)
{
struct sockaddr sa;
size_t sa_len = sizeof(struct sockaddr);
int error;
if ((error = copyin(arg, &sa, sa_len)))
return (error);
sa.sa_family = *(sa_family_t *)&sa;
sa.sa_len = len;
error = copyout(&sa, arg, sa_len);
return (error);
}
static int
linux_sa_put(struct osockaddr *osa)
{
struct osockaddr sa;
int error, bdom;
/*
* Only read/write the osockaddr family part, the rest is
* not changed.
*/
error = copyin(osa, &sa, sizeof(sa.sa_family));
if (error)
return (error);
bdom = bsd_to_linux_domain(sa.sa_family);
if (bdom == -1)
return (EINVAL);
sa.sa_family = bdom;
error = copyout(&sa, osa, sizeof(sa.sa_family));
if (error)
return (error);
return (0);
}
static int
linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
enum uio_seg segflg)
{
struct mbuf *control;
struct sockaddr *to;
int error;
if (mp->msg_name != NULL) {
error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error)
return (error);
mp->msg_name = to;
} else
to = NULL;
if (mp->msg_control != NULL) {
struct cmsghdr *cmsg;
if (mp->msg_controllen < sizeof(struct cmsghdr)) {
error = EINVAL;
goto bad;
}
error = sockargs(&control, mp->msg_control,
mp->msg_controllen, MT_CONTROL);
if (error)
goto bad;
cmsg = mtod(control, struct cmsghdr *);
cmsg->cmsg_level = linux_to_bsd_sockopt_level(cmsg->cmsg_level);
} else
control = NULL;
error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control,
segflg);
bad:
if (to)
FREE(to, M_SONAME);
return (error);
}
/* Return 0 if IP_HDRINCL is set for the given socket. */
static int
linux_check_hdrincl(struct thread *td, int s)
{
int error, optval, size_val;
size_val = sizeof(optval);
error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL,
&optval, UIO_SYSSPACE, &size_val);
if (error)
return (error);
return (optval == 0);
}
struct linux_sendto_args {
int s;
l_uintptr_t msg;
int len;
int flags;
l_uintptr_t to;
int tolen;
};
/*
* Updated sendto() when IP_HDRINCL is set:
* tweak endian-dependent fields in the IP packet.
*/
static int
linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args)
{
/*
* linux_ip_copysize defines how many bytes we should copy
* from the beginning of the IP packet before we customize it for BSD.
* It should include all the fields we modify (ip_len and ip_off).
*/
#define linux_ip_copysize 8
struct ip *packet;
struct msghdr msg;
struct iovec aiov[1];
int error;
/* Check that the packet isn't too big or too small. */
if (linux_args->len < linux_ip_copysize ||
linux_args->len > IP_MAXPACKET)
return (EINVAL);
packet = (struct ip *)malloc(linux_args->len, M_TEMP, M_WAITOK);
/* Make kernel copy of the packet to be sent */
if ((error = copyin(PTRIN(linux_args->msg), packet,
linux_args->len)))
goto goout;
/* Convert fields from Linux to BSD raw IP socket format */
packet->ip_len = linux_args->len;
packet->ip_off = ntohs(packet->ip_off);
/* Prepare the msghdr and iovec structures describing the new packet */
msg.msg_name = PTRIN(linux_args->to);
msg.msg_namelen = linux_args->tolen;
msg.msg_iov = aiov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_flags = 0;
aiov[0].iov_base = (char *)packet;
aiov[0].iov_len = linux_args->len;
error = linux_sendit(td, linux_args->s, &msg, linux_args->flags,
UIO_SYSSPACE);
goout:
free(packet, M_TEMP);
return (error);
}
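/*
 * The byte-order tweak above, spelled out: Linux raw sockets carry
 * ip_len and ip_off in network order, while the native raw IP socket
 * expects them in host order, so the header is rewritten as:
 *
 *	packet->ip_len = linux_args->len;	// host-order length
 *	packet->ip_off = ntohs(packet->ip_off);	// network -> host
 */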
struct linux_socket_args {
int domain;
int type;
int protocol;
};
static int
linux_socket(struct thread *td, struct linux_socket_args *args)
{
+#ifdef INET6
+ INIT_VNET_INET6(curvnet);
+#endif
struct socket_args /* {
int domain;
int type;
int protocol;
} */ bsd_args;
int retval_socket;
bsd_args.protocol = args->protocol;
bsd_args.type = args->type;
bsd_args.domain = linux_to_bsd_domain(args->domain);
if (bsd_args.domain == -1)
return (EINVAL);
retval_socket = socket(td, &bsd_args);
if (bsd_args.type == SOCK_RAW
&& (bsd_args.protocol == IPPROTO_RAW || bsd_args.protocol == 0)
&& bsd_args.domain == AF_INET
&& retval_socket >= 0) {
/* It's a raw IP socket: set the IP_HDRINCL option. */
int hdrincl;
hdrincl = 1;
/* We ignore any error returned by kern_setsockopt() */
kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL,
&hdrincl, UIO_SYSSPACE, sizeof(hdrincl));
}
#ifdef INET6
/*
 * A Linux AF_INET6 socket has the IPV6_V6ONLY option set to 0 by
 * default, and some apps depend on this. So set V6ONLY to 0
 * for Linux apps if the sysctl value is set to 1.
 */
if (bsd_args.domain == PF_INET6 && retval_socket >= 0
#ifndef KLD_MODULE
/*
 * XXX: Avoid an undefined symbol error with an IPv4-only
 * kernel.
 */
&& V_ip6_v6only
#endif
) {
int v6only;
v6only = 0;
/* We ignore any error returned by setsockopt() */
kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY,
&v6only, UIO_SYSSPACE, sizeof(v6only));
}
#endif
return (retval_socket);
}
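/*
 * The net effect, as a sketch: a Linux binary doing
 *
 *	socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 * gets a descriptor with IP_HDRINCL already set, matching Linux
 * behaviour for raw IP sockets, and an AF_INET6 socket comes up with
 * IPV6_V6ONLY cleared, again matching the Linux default.
 */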
struct linux_bind_args {
int s;
l_uintptr_t name;
int namelen;
};
static int
linux_bind(struct thread *td, struct linux_bind_args *args)
{
struct sockaddr *sa;
int error;
error = linux_getsockaddr(&sa, PTRIN(args->name),
args->namelen);
if (error)
return (error);
error = kern_bind(td, args->s, sa);
free(sa, M_SONAME);
if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in))
return (EINVAL);
return (error);
}
struct linux_connect_args {
int s;
l_uintptr_t name;
int namelen;
};
int linux_connect(struct thread *, struct linux_connect_args *);
int
linux_connect(struct thread *td, struct linux_connect_args *args)
{
struct socket *so;
struct sockaddr *sa;
u_int fflag;
int error;
error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name),
args->namelen);
if (error)
return (error);
error = kern_connect(td, args->s, sa);
free(sa, M_SONAME);
if (error != EISCONN)
return (error);
/*
 * Linux does not return EISCONN the first time the error occurs
 * on a non-blocking socket. Instead it returns the error that
 * getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD.
 *
 * XXXRW: Instead of using fgetsock(), check that it is a
 * socket and use the file descriptor reference instead of
 * creating a new one.
 */
error = fgetsock(td, args->s, &so, &fflag);
if (error == 0) {
error = EISCONN;
if (fflag & FNONBLOCK) {
SOCK_LOCK(so);
if (so->so_emuldata == 0)
error = so->so_error;
so->so_emuldata = (void *)1;
SOCK_UNLOCK(so);
}
fputsock(so);
}
return (error);
}
struct linux_listen_args {
int s;
int backlog;
};
static int
linux_listen(struct thread *td, struct linux_listen_args *args)
{
struct listen_args /* {
int s;
int backlog;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.backlog = args->backlog;
return (listen(td, &bsd_args));
}
struct linux_accept_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_accept(struct thread *td, struct linux_accept_args *args)
{
struct accept_args /* {
int s;
struct sockaddr * __restrict name;
socklen_t * __restrict anamelen;
} */ bsd_args;
int error, fd;
bsd_args.s = args->s;
/* XXX: */
bsd_args.name = (struct sockaddr * __restrict)PTRIN(args->addr);
bsd_args.anamelen = PTRIN(args->namelen);/* XXX */
error = accept(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name);
if (error) {
if (error == EFAULT && args->namelen != sizeof(struct sockaddr_in))
return (EINVAL);
return (error);
}
if (args->addr) {
error = linux_sa_put(PTRIN(args->addr));
if (error) {
(void)kern_close(td, td->td_retval[0]);
return (error);
}
}
/*
 * Linux appears not to copy the file status flags from the listening
 * socket to the accepted one, so we must clear the flags in the new
 * descriptor. Ignore any errors, because we already have an open fd.
 */
fd = td->td_retval[0];
(void)kern_fcntl(td, fd, F_SETFL, 0);
td->td_retval[0] = fd;
return (0);
}
struct linux_getsockname_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_getsockname(struct thread *td, struct linux_getsockname_args *args)
{
struct getsockname_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ bsd_args;
int error;
bsd_args.fdes = args->s;
/* XXX: */
bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr);
bsd_args.alen = PTRIN(args->namelen); /* XXX */
error = getsockname(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
if (error)
return (error);
error = linux_sa_put(PTRIN(args->addr));
if (error)
return (error);
return (0);
}
struct linux_getpeername_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_getpeername(struct thread *td, struct linux_getpeername_args *args)
{
struct getpeername_args /* {
int fdes;
caddr_t asa;
int *alen;
} */ bsd_args;
int error;
bsd_args.fdes = args->s;
bsd_args.asa = (struct sockaddr *)PTRIN(args->addr);
bsd_args.alen = (int *)PTRIN(args->namelen);
error = getpeername(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
if (error)
return (error);
error = linux_sa_put(PTRIN(args->addr));
if (error)
return (error);
return (0);
}
struct linux_socketpair_args {
int domain;
int type;
int protocol;
l_uintptr_t rsv;
};
static int
linux_socketpair(struct thread *td, struct linux_socketpair_args *args)
{
struct socketpair_args /* {
int domain;
int type;
int protocol;
int *rsv;
} */ bsd_args;
bsd_args.domain = linux_to_bsd_domain(args->domain);
if (bsd_args.domain == -1)
return (EINVAL);
bsd_args.type = args->type;
bsd_args.protocol = args->protocol;
bsd_args.rsv = (int *)PTRIN(args->rsv);
return (socketpair(td, &bsd_args));
}
struct linux_send_args {
int s;
l_uintptr_t msg;
int len;
int flags;
};
static int
linux_send(struct thread *td, struct linux_send_args *args)
{
struct sendto_args /* {
int s;
caddr_t buf;
int len;
int flags;
caddr_t to;
int tolen;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.buf = (caddr_t)PTRIN(args->msg);
bsd_args.len = args->len;
bsd_args.flags = args->flags;
bsd_args.to = NULL;
bsd_args.tolen = 0;
return sendto(td, &bsd_args);
}
struct linux_recv_args {
int s;
l_uintptr_t msg;
int len;
int flags;
};
static int
linux_recv(struct thread *td, struct linux_recv_args *args)
{
struct recvfrom_args /* {
int s;
caddr_t buf;
int len;
int flags;
struct sockaddr *from;
socklen_t fromlenaddr;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.buf = (caddr_t)PTRIN(args->msg);
bsd_args.len = args->len;
bsd_args.flags = args->flags;
bsd_args.from = NULL;
bsd_args.fromlenaddr = 0;
return (recvfrom(td, &bsd_args));
}
static int
linux_sendto(struct thread *td, struct linux_sendto_args *args)
{
struct msghdr msg;
struct iovec aiov;
int error;
if (linux_check_hdrincl(td, args->s) == 0)
/* IP_HDRINCL set, tweak the packet before sending */
return (linux_sendto_hdrincl(td, args));
msg.msg_name = PTRIN(args->to);
msg.msg_namelen = args->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_flags = 0;
aiov.iov_base = PTRIN(args->msg);
aiov.iov_len = args->len;
error = linux_sendit(td, args->s, &msg, args->flags, UIO_USERSPACE);
return (error);
}
struct linux_recvfrom_args {
int s;
l_uintptr_t buf;
int len;
int flags;
l_uintptr_t from;
l_uintptr_t fromlen;
};
static int
linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args)
{
struct recvfrom_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
struct sockaddr * __restrict from;
socklen_t * __restrict fromlenaddr;
} */ bsd_args;
size_t len;
int error;
if ((error = copyin(PTRIN(args->fromlen), &len, sizeof(size_t))))
return (error);
bsd_args.s = args->s;
bsd_args.buf = PTRIN(args->buf);
bsd_args.len = args->len;
bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
/* XXX: */
bsd_args.from = (struct sockaddr * __restrict)PTRIN(args->from);
bsd_args.fromlenaddr = PTRIN(args->fromlen);/* XXX */
linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.from, len);
error = recvfrom(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.from);
if (error)
return (error);
if (args->from) {
error = linux_sa_put((struct osockaddr *)
PTRIN(args->from));
if (error)
return (error);
}
return (0);
}
struct linux_sendmsg_args {
int s;
l_uintptr_t msg;
int flags;
};
static int
linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args)
{
struct msghdr msg;
struct iovec *iov;
int error;
/* XXXTJR sendmsg is broken on amd64 */
error = copyin(PTRIN(args->msg), &msg, sizeof(msg));
if (error)
return (error);
/*
 * Some Linux applications (ping) define a non-NULL control data
 * pointer, but a msg_controllen of 0, which is not allowed in the
 * FreeBSD system call interface. NULL out the msg_control pointer
 * to handle this case. This behaviour should be revisited, but it
 * allows the Linux ping to work.
 */
if (msg.msg_control != NULL && msg.msg_controllen == 0)
msg.msg_control = NULL;
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
msg.msg_flags = 0;
error = linux_sendit(td, args->s, &msg, args->flags, UIO_USERSPACE);
free(iov, M_IOV);
return (error);
}
struct linux_recvmsg_args {
int s;
l_uintptr_t msg;
int flags;
};
static int
linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args)
{
struct recvmsg_args /* {
int s;
struct msghdr *msg;
int flags;
} */ bsd_args;
struct msghdr msg;
struct cmsghdr *cmsg;
int error;
/* XXXTJR recvmsg is broken on amd64 */
if ((error = copyin(PTRIN(args->msg), &msg, sizeof (msg))))
return (error);
bsd_args.s = args->s;
bsd_args.msg = PTRIN(args->msg);
bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
if (msg.msg_name) {
linux_to_bsd_sockaddr((struct sockaddr *)msg.msg_name,
msg.msg_namelen);
error = recvmsg(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)msg.msg_name);
} else
error = recvmsg(td, &bsd_args);
if (error)
return (error);
if (bsd_args.msg->msg_control != NULL &&
bsd_args.msg->msg_controllen > 0) {
cmsg = (struct cmsghdr*)bsd_args.msg->msg_control;
cmsg->cmsg_level = bsd_to_linux_sockopt_level(cmsg->cmsg_level);
}
error = copyin(PTRIN(args->msg), &msg, sizeof(msg));
if (error)
return (error);
if (msg.msg_name && msg.msg_namelen > 2)
error = linux_sa_put(msg.msg_name);
return (error);
}
struct linux_shutdown_args {
int s;
int how;
};
static int
linux_shutdown(struct thread *td, struct linux_shutdown_args *args)
{
struct shutdown_args /* {
int s;
int how;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.how = args->how;
return (shutdown(td, &bsd_args));
}
struct linux_setsockopt_args {
int s;
int level;
int optname;
l_uintptr_t optval;
int optlen;
};
static int
linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args)
{
struct setsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int valsize;
} */ bsd_args;
int error, name;
bsd_args.s = args->s;
bsd_args.level = linux_to_bsd_sockopt_level(args->level);
switch (bsd_args.level) {
case SOL_SOCKET:
name = linux_to_bsd_so_sockopt(args->optname);
break;
case IPPROTO_IP:
name = linux_to_bsd_ip_sockopt(args->optname);
break;
case IPPROTO_TCP:
/* Linux TCP option values match BSD's */
name = args->optname;
break;
default:
name = -1;
break;
}
if (name == -1)
return (ENOPROTOOPT);
bsd_args.name = name;
bsd_args.val = PTRIN(args->optval);
bsd_args.valsize = args->optlen;
if (name == IPV6_NEXTHOP) {
linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val,
bsd_args.valsize);
error = setsockopt(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
} else
error = setsockopt(td, &bsd_args);
return (error);
}
struct linux_getsockopt_args {
int s;
int level;
int optname;
l_uintptr_t optval;
l_uintptr_t optlen;
};
static int
linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args)
{
struct getsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int *avalsize;
} */ bsd_args;
int error, name;
bsd_args.s = args->s;
bsd_args.level = linux_to_bsd_sockopt_level(args->level);
switch (bsd_args.level) {
case SOL_SOCKET:
name = linux_to_bsd_so_sockopt(args->optname);
break;
case IPPROTO_IP:
name = linux_to_bsd_ip_sockopt(args->optname);
break;
case IPPROTO_TCP:
/* Linux TCP option values match BSD's */
name = args->optname;
break;
default:
name = -1;
break;
}
if (name == -1)
return (EINVAL);
bsd_args.name = name;
bsd_args.val = PTRIN(args->optval);
bsd_args.avalsize = PTRIN(args->optlen);
if (name == IPV6_NEXTHOP) {
error = getsockopt(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
} else
error = getsockopt(td, &bsd_args);
return (error);
}
int
linux_socketcall(struct thread *td, struct linux_socketcall_args *args)
{
void *arg = (void *)(intptr_t)args->args;
switch (args->what) {
case LINUX_SOCKET:
return (linux_socket(td, arg));
case LINUX_BIND:
return (linux_bind(td, arg));
case LINUX_CONNECT:
return (linux_connect(td, arg));
case LINUX_LISTEN:
return (linux_listen(td, arg));
case LINUX_ACCEPT:
return (linux_accept(td, arg));
case LINUX_GETSOCKNAME:
return (linux_getsockname(td, arg));
case LINUX_GETPEERNAME:
return (linux_getpeername(td, arg));
case LINUX_SOCKETPAIR:
return (linux_socketpair(td, arg));
case LINUX_SEND:
return (linux_send(td, arg));
case LINUX_RECV:
return (linux_recv(td, arg));
case LINUX_SENDTO:
return (linux_sendto(td, arg));
case LINUX_RECVFROM:
return (linux_recvfrom(td, arg));
case LINUX_SHUTDOWN:
return (linux_shutdown(td, arg));
case LINUX_SETSOCKOPT:
return (linux_setsockopt(td, arg));
case LINUX_GETSOCKOPT:
return (linux_getsockopt(td, arg));
case LINUX_SENDMSG:
return (linux_sendmsg(td, arg));
case LINUX_RECVMSG:
return (linux_recvmsg(td, arg));
}
uprintf("LINUX: 'socket' type=%d not implemented\n", args->what);
return (ENOSYS);
}
Index: head/sys/compat/svr4/svr4_sockio.c
===================================================================
--- head/sys/compat/svr4/svr4_sockio.c (revision 183549)
+++ head/sys/compat/svr4/svr4_sockio.c (revision 183550)
@@ -1,168 +1,169 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1995 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_ioctl.h>
#include <compat/svr4/svr4_sockio.h>
static int bsd_to_svr4_flags(int);
#define bsd_to_svr4_flag(a) \
if (bf & __CONCAT(I,a)) sf |= __CONCAT(SVR4_I,a)
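/*
 * A sketch of the macro expansion: __CONCAT pastes the I prefix on,
 * so bsd_to_svr4_flag(FF_UP) becomes
 *
 *	if (bf & IFF_UP) sf |= SVR4_IFF_UP;
 */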
static int
bsd_to_svr4_flags(bf)
int bf;
{
int sf = 0;
bsd_to_svr4_flag(FF_UP);
bsd_to_svr4_flag(FF_BROADCAST);
bsd_to_svr4_flag(FF_DEBUG);
bsd_to_svr4_flag(FF_LOOPBACK);
bsd_to_svr4_flag(FF_POINTOPOINT);
#if defined(IFF_NOTRAILERS)
bsd_to_svr4_flag(FF_NOTRAILERS);
#endif
if (bf & IFF_DRV_RUNNING)
sf |= SVR4_IFF_RUNNING;
bsd_to_svr4_flag(FF_NOARP);
bsd_to_svr4_flag(FF_PROMISC);
bsd_to_svr4_flag(FF_ALLMULTI);
bsd_to_svr4_flag(FF_MULTICAST);
return sf;
}
int
svr4_sock_ioctl(fp, td, retval, fd, cmd, data)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t data;
{
int error;
*retval = 0;
switch (cmd) {
case SVR4_SIOCGIFNUM:
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
int ifnum = 0;
/*
 * This does not return the number of physical
 * interfaces (if_index), but the number of interfaces
 * plus addresses, as ifconf() does, because callers
 * use this number to size the buffer for a subsequent
 * SVR4_SIOCGIFCONF. So we count the number of ifreq
 * entries that the next SVR4_SIOCGIFCONF will return.
 * Maybe a more correct fix would be to make
 * SVR4_SIOCGIFCONF return only one entry per physical
 * interface?
 */
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
if (TAILQ_EMPTY(&ifp->if_addrhead))
ifnum++;
else
TAILQ_FOREACH(ifa, &ifp->if_addrhead,
ifa_link)
ifnum++;
IFNET_RUNLOCK();
DPRINTF(("SIOCGIFNUM %d\n", ifnum));
return copyout(&ifnum, data, sizeof(ifnum));
}
case SVR4_SIOCGIFFLAGS:
{
struct ifreq br;
struct svr4_ifreq sr;
if ((error = copyin(data, &sr, sizeof(sr))) != 0)
return error;
(void) strlcpy(br.ifr_name, sr.svr4_ifr_name,
sizeof(br.ifr_name));
if ((error = fo_ioctl(fp, SIOCGIFFLAGS,
(caddr_t) &br, td->td_ucred,
td)) != 0) {
DPRINTF(("SIOCGIFFLAGS (%s) %s: error %d\n",
br.ifr_name, sr.svr4_ifr_name, error));
return error;
}
sr.svr4_ifr_flags = bsd_to_svr4_flags(br.ifr_flags);
DPRINTF(("SIOCGIFFLAGS %s = %x\n",
sr.svr4_ifr_name, sr.svr4_ifr_flags));
return copyout(&sr, data, sizeof(sr));
}
case SVR4_SIOCGIFCONF:
{
struct svr4_ifconf sc;
if ((error = copyin(data, &sc, sizeof(sc))) != 0)
return error;
DPRINTF(("ifreq %d svr4_ifreq %d ifc_len %d\n",
sizeof(struct ifreq), sizeof(struct svr4_ifreq),
sc.svr4_ifc_len));
if ((error = fo_ioctl(fp, OSIOCGIFCONF,
(caddr_t) &sc, td->td_ucred,
td)) != 0)
return error;
DPRINTF(("SIOCGIFCONF\n"));
return 0;
}
default:
DPRINTF(("Unknown svr4 sockio %lx\n", cmd));
return 0; /* ENOSYS really */
}
}
Index: head/sys/contrib/ipfilter/netinet/ip_auth.c
===================================================================
--- head/sys/contrib/ipfilter/netinet/ip_auth.c (revision 183549)
+++ head/sys/contrib/ipfilter/netinet/ip_auth.c (revision 183550)
@@ -1,1043 +1,1048 @@
/* $FreeBSD$ */
/*
* Copyright (C) 1998-2003 by Darren Reed & Guido van Rooij.
*
* See the IPFILTER.LICENCE file for details on licencing.
*/
#if defined(KERNEL) || defined(_KERNEL)
# undef KERNEL
# undef _KERNEL
# define KERNEL 1
# define _KERNEL 1
#endif
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/file.h>
#if !defined(_KERNEL)
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# define _KERNEL
# ifdef __OpenBSD__
struct file;
# endif
# include <sys/uio.h>
# undef _KERNEL
#endif
#if defined(_KERNEL) && (__FreeBSD_version >= 220000)
# include <sys/filio.h>
# include <sys/fcntl.h>
#else
# include <sys/ioctl.h>
#endif
#if !defined(linux)
# include <sys/protosw.h>
#endif
#include <sys/socket.h>
#if defined(_KERNEL)
# include <sys/systm.h>
# if !defined(__SVR4) && !defined(__svr4__) && !defined(linux)
# include <sys/mbuf.h>
# endif
#endif
#if defined(__SVR4) || defined(__svr4__)
# include <sys/filio.h>
# include <sys/byteorder.h>
# ifdef _KERNEL
# include <sys/dditypes.h>
# endif
# include <sys/stream.h>
# include <sys/kmem.h>
#endif
#if (defined(_BSDI_VERSION) && _BSDI_VERSION >= 199802) || \
(defined(__FreeBSD_version) &&(__FreeBSD_version >= 400000))
# include <sys/queue.h>
#endif
#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(bsdi)
# include <machine/cpu.h>
#endif
#if defined(_KERNEL) && defined(__NetBSD__) && (__NetBSD_Version__ >= 104000000)
# include <sys/proc.h>
#endif
#include <net/if.h>
#ifdef sun
# include <net/af.h>
#endif
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
+#if !defined(_KERNEL) && defined(__FreeBSD_version) && \
+ __FreeBSD_version >= 800049
+# define V_ip_do_randomid ip_do_randomid
+# define V_ip_id ip_id
+#endif
#if !defined(_KERNEL) && !defined(__osf__) && !defined(__sgi)
# define KERNEL
# define _KERNEL
# define NOT_KERNEL
#endif
#if !defined(linux)
# include <netinet/ip_var.h>
#endif
#ifdef NOT_KERNEL
# undef _KERNEL
# undef KERNEL
#endif
#include <netinet/tcp.h>
#if defined(IRIX) && (IRIX < 60516) /* IRIX < 6 */
extern struct ifqueue ipintrq; /* ip packet input queue */
#else
# if !defined(__hpux) && !defined(linux)
# if __FreeBSD_version >= 300000
# include <net/if_var.h>
# if __FreeBSD_version >= 500042
# define IF_QFULL _IF_QFULL
# define IF_DROP _IF_DROP
# endif /* __FreeBSD_version >= 500042 */
# endif
# include <netinet/in_var.h>
# include <netinet/tcp_fsm.h>
# endif
#endif
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>
#include "netinet/ip_compat.h"
#include <netinet/tcpip.h>
#include "netinet/ip_fil.h"
#include "netinet/ip_auth.h"
#if !defined(MENTAT) && !defined(linux)
# include <net/netisr.h>
# ifdef __FreeBSD__
# include <machine/cpufunc.h>
# endif
#endif
#if (__FreeBSD_version >= 300000)
# include <sys/malloc.h>
# if defined(_KERNEL) && !defined(IPFILTER_LKM)
# include <sys/libkern.h>
# include <sys/systm.h>
# endif
#endif
/* END OF INCLUDES */
#if !defined(lint)
static const char rcsid[] = "@(#)$FreeBSD$";
/* static const char rcsid[] = "@(#)$Id: ip_auth.c,v 2.73.2.24 2007/09/09 11:32:04 darrenr Exp $"; */
#endif
#if SOLARIS && defined(_KERNEL)
extern kcondvar_t ipfauthwait;
extern struct pollhead iplpollhead[IPL_LOGSIZE];
#endif /* SOLARIS */
#if defined(linux) && defined(_KERNEL)
wait_queue_head_t fr_authnext_linux;
#endif
int fr_authsize = FR_NUMAUTH;
int fr_authused = 0;
int fr_defaultauthage = 600;
int fr_auth_lock = 0;
int fr_auth_init = 0;
fr_authstat_t fr_authstats;
static frauth_t *fr_auth = NULL;
mb_t **fr_authpkts = NULL;
int fr_authstart = 0, fr_authend = 0, fr_authnext = 0;
frauthent_t *fae_list = NULL;
frentry_t *ipauth = NULL,
*fr_authlist = NULL;
void fr_authderef __P((frauthent_t **));
int fr_authgeniter __P((ipftoken_t *, ipfgeniter_t *));
int fr_authreply __P((char *));
int fr_authwait __P((char *));
/* ------------------------------------------------------------------------ */
/* Function: fr_authinit */
/* Returns: int - 0 == success, else error */
/* Parameters: None */
/* */
/* Allocate memory and initialise data structures used in handling auth */
/* rules. */
/* ------------------------------------------------------------------------ */
int fr_authinit()
{
KMALLOCS(fr_auth, frauth_t *, fr_authsize * sizeof(*fr_auth));
if (fr_auth != NULL)
bzero((char *)fr_auth, fr_authsize * sizeof(*fr_auth));
else
return -1;
KMALLOCS(fr_authpkts, mb_t **, fr_authsize * sizeof(*fr_authpkts));
if (fr_authpkts != NULL)
bzero((char *)fr_authpkts, fr_authsize * sizeof(*fr_authpkts));
else
return -2;
MUTEX_INIT(&ipf_authmx, "ipf auth log mutex");
RWLOCK_INIT(&ipf_auth, "ipf IP User-Auth rwlock");
#if SOLARIS && defined(_KERNEL)
cv_init(&ipfauthwait, "ipf auth condvar", CV_DRIVER, NULL);
#endif
#if defined(linux) && defined(_KERNEL)
init_waitqueue_head(&fr_authnext_linux);
#endif
fr_auth_init = 1;
return 0;
}
/* ------------------------------------------------------------------------ */
/* Function: fr_checkauth */
/* Returns: frentry_t* - pointer to ipf rule if match found, else NULL */
/* Parameters: fin(I) - pointer to packet information */
/* passp(O) - pointer to where to store the matching rule's pass flags */
/* */
/* Check if a packet has authorization. If the packet is found to match an */
/* authorization result that would itself cause a feedback loop (i.e. it */
/* would end up returning FR_AUTH again), return FR_BLOCK instead. */
/* ------------------------------------------------------------------------ */
frentry_t *fr_checkauth(fin, passp)
fr_info_t *fin;
u_32_t *passp;
{
frentry_t *fr;
frauth_t *fra;
u_32_t pass;
u_short id;
ip_t *ip;
int i;
if (fr_auth_lock || !fr_authused)
return NULL;
ip = fin->fin_ip;
id = ip->ip_id;
READ_ENTER(&ipf_auth);
for (i = fr_authstart; i != fr_authend; ) {
/*
* index becomes -2 only after an SIOCAUTHW. Check this in
* case the same packet gets sent again and it hasn't yet been
* auth'd.
*/
fra = fr_auth + i;
if ((fra->fra_index == -2) && (id == fra->fra_info.fin_id) &&
!bcmp((char *)fin, (char *)&fra->fra_info, FI_CSIZE)) {
/*
* Avoid feedback loop.
*/
if (!(pass = fra->fra_pass) || (FR_ISAUTH(pass)))
pass = FR_BLOCK;
/*
* Create a dummy rule for the stateful checking to
* use and return. Zero out any values we don't
* trust from userland!
*/
if ((pass & FR_KEEPSTATE) || ((pass & FR_KEEPFRAG) &&
(fin->fin_flx & FI_FRAG))) {
KMALLOC(fr, frentry_t *);
if (fr) {
bcopy((char *)fra->fra_info.fin_fr,
(char *)fr, sizeof(*fr));
fr->fr_grp = NULL;
fr->fr_ifa = fin->fin_ifp;
fr->fr_func = NULL;
fr->fr_ref = 1;
fr->fr_flags = pass;
fr->fr_ifas[1] = NULL;
fr->fr_ifas[2] = NULL;
fr->fr_ifas[3] = NULL;
}
} else
fr = fra->fra_info.fin_fr;
fin->fin_fr = fr;
RWLOCK_EXIT(&ipf_auth);
WRITE_ENTER(&ipf_auth);
/*
* fr_authlist is populated with the rules malloc'd
* above and only those.
*/
if ((fr != NULL) && (fr != fra->fra_info.fin_fr)) {
fr->fr_next = fr_authlist;
fr_authlist = fr;
}
fra->fra_index = -1;
fr_authused--;
if (i == fr_authstart) {
while (fra->fra_index == -1) {
i++;
fra++;
if (i == fr_authsize) {
i = 0;
fra = fr_auth;
}
fr_authstart = i;
if (i == fr_authend)
break;
}
if (fr_authstart == fr_authend) {
fr_authnext = 0;
fr_authstart = fr_authend = 0;
}
}
RWLOCK_EXIT(&ipf_auth);
if (passp != NULL)
*passp = pass;
ATOMIC_INC64(fr_authstats.fas_hits);
return fr;
}
i++;
if (i == fr_authsize)
i = 0;
}
fr_authstats.fas_miss++;
RWLOCK_EXIT(&ipf_auth);
ATOMIC_INC64(fr_authstats.fas_miss);
return NULL;
}
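/*
 * Sketch of the head-advance loop above, restated as a hypothetical
 * stand-alone helper (not part of IPFilter): consumed slots are marked
 * with fra_index == -1 and skipped, so the oldest live entry becomes
 * the new queue head; the scan stops when it reaches the tail.
 */
static int fr_auth_advance_demo(tab, start, end, size)
frauth_t *tab;
int start, end, size;
{
while ((start != end) && (tab[start].fra_index == -1))
start = (start + 1) % size;
return start;
}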
/* ------------------------------------------------------------------------ */
/* Function: fr_newauth */
/* Returns: int - 1 == success, 0 == did not put packet on auth queue */
/* Parameters: m(I) - pointer to mb_t with packet in it */
/* fin(I) - pointer to packet information */
/* */
/* Check if we have room in the auth array to hold details for another */
/* packet. If we do, store it and wake up any user programs which are */
/* waiting to hear about these events. */
/* ------------------------------------------------------------------------ */
int fr_newauth(m, fin)
mb_t *m;
fr_info_t *fin;
{
#if defined(_KERNEL) && defined(MENTAT)
qpktinfo_t *qpi = fin->fin_qpi;
#endif
frauth_t *fra;
#if !defined(sparc) && !defined(m68k)
ip_t *ip;
#endif
int i;
if (fr_auth_lock)
return 0;
WRITE_ENTER(&ipf_auth);
if (((fr_authend + 1) % fr_authsize) == fr_authstart) {
fr_authstats.fas_nospace++;
RWLOCK_EXIT(&ipf_auth);
return 0;
}
fr_authstats.fas_added++;
fr_authused++;
i = fr_authend++;
if (fr_authend == fr_authsize)
fr_authend = 0;
fra = fr_auth + i;
fra->fra_index = i;
RWLOCK_EXIT(&ipf_auth);
if (fin->fin_fr != NULL)
fra->fra_pass = fin->fin_fr->fr_flags;
else
fra->fra_pass = 0;
fra->fra_age = fr_defaultauthage;
bcopy((char *)fin, (char *)&fra->fra_info, sizeof(*fin));
#if !defined(sparc) && !defined(m68k)
/*
* No need to copyback here as we want to undo the changes, not keep
* them.
*/
ip = fin->fin_ip;
# if defined(MENTAT) && defined(_KERNEL)
if ((ip == (ip_t *)m->b_rptr) && (fin->fin_v == 4))
# endif
{
register u_short bo;
bo = ip->ip_len;
ip->ip_len = htons(bo);
bo = ip->ip_off;
ip->ip_off = htons(bo);
}
#endif
#if SOLARIS && defined(_KERNEL)
COPYIFNAME(fin->fin_v, fin->fin_ifp, fra->fra_info.fin_ifname);
m->b_rptr -= qpi->qpi_off;
fr_authpkts[i] = *(mblk_t **)fin->fin_mp;
# if !defined(_INET_IP_STACK_H)
fra->fra_q = qpi->qpi_q; /* The queue can disappear! */
# endif
fra->fra_m = *fin->fin_mp;
fra->fra_info.fin_mp = &fra->fra_m;
cv_signal(&ipfauthwait);
pollwakeup(&iplpollhead[IPL_LOGAUTH], POLLIN|POLLRDNORM);
#else
fr_authpkts[i] = m;
WAKEUP(&fr_authnext,0);
#endif
return 1;
}
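/*
 * The full test above uses the classic "one slot sacrificed" ring
 * convention; a hypothetical restatement (not part of IPFilter):
 */
static int fr_auth_ring_full_demo(start, end, size)
int start, end, size;
{
/* Full when one more append would collide with the head; empty when */
/* head and tail coincide, so one slot always stays unused. */
return ((end + 1) % size) == start;
}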
/* ------------------------------------------------------------------------ */
/* Function: fr_auth_ioctl */
/* Returns: int - 0 == success, else error */
/* Parameters: data(IO) - pointer to ioctl data */
/* cmd(I) - ioctl command */
/* mode(I) - mode flags associated with open descriptor */
/* uid(I) - uid associated with the application making the call */
/* ctx(I) - pointer for context */
/* */
/* This function handles all of the ioctls recognised by the auth component */
/* in IPFilter - i.e. ioctls called on an open fd for /dev/ipauth */
/* ------------------------------------------------------------------------ */
int fr_auth_ioctl(data, cmd, mode, uid, ctx)
caddr_t data;
ioctlcmd_t cmd;
int mode, uid;
void *ctx;
{
int error = 0, i;
SPL_INT(s);
switch (cmd)
{
case SIOCGENITER :
{
ipftoken_t *token;
ipfgeniter_t iter;
error = fr_inobj(data, &iter, IPFOBJ_GENITER);
if (error != 0)
break;
SPL_SCHED(s);
token = ipf_findtoken(IPFGENITER_AUTH, uid, ctx);
if (token != NULL)
error = fr_authgeniter(token, &iter);
else
error = ESRCH;
RWLOCK_EXIT(&ipf_tokens);
SPL_X(s);
break;
}
case SIOCADAFR :
case SIOCRMAFR :
if (!(mode & FWRITE))
error = EPERM;
else
error = frrequest(IPL_LOGAUTH, cmd, data,
fr_active, 1);
break;
case SIOCSTLCK :
if (!(mode & FWRITE)) {
error = EPERM;
break;
}
error = fr_lock(data, &fr_auth_lock);
break;
case SIOCATHST:
fr_authstats.fas_faelist = fae_list;
error = fr_outobj(data, &fr_authstats, IPFOBJ_AUTHSTAT);
break;
case SIOCIPFFL:
SPL_NET(s);
WRITE_ENTER(&ipf_auth);
i = fr_authflush();
RWLOCK_EXIT(&ipf_auth);
SPL_X(s);
error = BCOPYOUT((char *)&i, data, sizeof(i));
if (error != 0)
error = EFAULT;
break;
case SIOCAUTHW:
error = fr_authwait(data);
break;
case SIOCAUTHR:
error = fr_authreply(data);
break;
default :
error = EINVAL;
break;
}
return error;
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authunload */
/* Returns: None */
/* Parameters: None */
/* */
/* Free all network buffer memory used to keep saved packets. */
/* ------------------------------------------------------------------------ */
void fr_authunload()
{
register int i;
register frauthent_t *fae, **faep;
frentry_t *fr, **frp;
mb_t *m;
if (fr_auth != NULL) {
KFREES(fr_auth, fr_authsize * sizeof(*fr_auth));
fr_auth = NULL;
}
if (fr_authpkts != NULL) {
for (i = 0; i < fr_authsize; i++) {
m = fr_authpkts[i];
if (m != NULL) {
FREE_MB_T(m);
fr_authpkts[i] = NULL;
}
}
KFREES(fr_authpkts, fr_authsize * sizeof(*fr_authpkts));
fr_authpkts = NULL;
}
faep = &fae_list;
while ((fae = *faep) != NULL) {
*faep = fae->fae_next;
KFREE(fae);
}
ipauth = NULL;
if (fr_authlist != NULL) {
for (frp = &fr_authlist; ((fr = *frp) != NULL); ) {
if (fr->fr_ref == 1) {
*frp = fr->fr_next;
KFREE(fr);
} else
frp = &fr->fr_next;
}
}
if (fr_auth_init == 1) {
# if SOLARIS && defined(_KERNEL)
cv_destroy(&ipfauthwait);
# endif
MUTEX_DESTROY(&ipf_authmx);
RW_DESTROY(&ipf_auth);
fr_auth_init = 0;
}
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authexpire */
/* Returns: None */
/* Parameters: None */
/* */
/* Slowly expire held auth records. Timeouts are set in expectation of */
/* this being called twice per second. */
/* ------------------------------------------------------------------------ */
void fr_authexpire()
{
frauthent_t *fae, **faep;
frentry_t *fr, **frp;
frauth_t *fra;
mb_t *m;
int i;
SPL_INT(s);
if (fr_auth_lock)
return;
SPL_NET(s);
WRITE_ENTER(&ipf_auth);
for (i = 0, fra = fr_auth; i < fr_authsize; i++, fra++) {
fra->fra_age--;
if ((fra->fra_age == 0) && (m = fr_authpkts[i])) {
FREE_MB_T(m);
fr_authpkts[i] = NULL;
fr_auth[i].fra_index = -1;
fr_authstats.fas_expire++;
fr_authused--;
}
}
/*
* Expire pre-auth rules
*/
for (faep = &fae_list; ((fae = *faep) != NULL); ) {
fae->fae_age--;
if (fae->fae_age == 0) {
fr_authderef(&fae);
fr_authstats.fas_expire++;
} else
faep = &fae->fae_next;
}
if (fae_list != NULL)
ipauth = &fae_list->fae_fr;
else
ipauth = NULL;
for (frp = &fr_authlist; ((fr = *frp) != NULL); ) {
if (fr->fr_ref == 1) {
*frp = fr->fr_next;
KFREE(fr);
} else
frp = &fr->fr_next;
}
RWLOCK_EXIT(&ipf_auth);
SPL_X(s);
}
/* ------------------------------------------------------------------------ */
/* Function: fr_preauthcmd */
/* Returns: int - 0 == success, else error */
/* Parameters: cmd(I) - ioctl command for rule */
/* fr(I) - pointer to ipf rule */
/* frptr(I) - pointer to caller's 'fr' */
/* */
/* Add (SIOCADAFR) or remove (SIOCRMAFR) a rule on the pre-auth list. */
/* ------------------------------------------------------------------------ */
int fr_preauthcmd(cmd, fr, frptr)
ioctlcmd_t cmd;
frentry_t *fr, **frptr;
{
frauthent_t *fae, **faep;
int error = 0;
SPL_INT(s);
if ((cmd != SIOCADAFR) && (cmd != SIOCRMAFR))
return EIO;
for (faep = &fae_list; ((fae = *faep) != NULL); ) {
if (&fae->fae_fr == fr)
break;
else
faep = &fae->fae_next;
}
if (cmd == (ioctlcmd_t)SIOCRMAFR) {
if (fr == NULL || frptr == NULL)
error = EINVAL;
else if (fae == NULL)
error = ESRCH;
else {
SPL_NET(s);
WRITE_ENTER(&ipf_auth);
*faep = fae->fae_next;
if (ipauth == &fae->fae_fr)
ipauth = fae_list ? &fae_list->fae_fr : NULL;
RWLOCK_EXIT(&ipf_auth);
SPL_X(s);
KFREE(fae);
}
} else if (fr != NULL && frptr != NULL) {
KMALLOC(fae, frauthent_t *);
if (fae != NULL) {
bcopy((char *)fr, (char *)&fae->fae_fr,
sizeof(*fr));
SPL_NET(s);
WRITE_ENTER(&ipf_auth);
fae->fae_age = fr_defaultauthage;
fae->fae_fr.fr_hits = 0;
fae->fae_fr.fr_next = *frptr;
fae->fae_ref = 1;
*frptr = &fae->fae_fr;
fae->fae_next = *faep;
*faep = fae;
ipauth = &fae_list->fae_fr;
RWLOCK_EXIT(&ipf_auth);
SPL_X(s);
} else
error = ENOMEM;
} else
error = EINVAL;
return error;
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authflush */
/* Returns: int - number of auth entries flushed */
/* Parameters: None */
/* Locks: WRITE(ipf_auth) */
/* */
/* This function flushes the fr_authpkts array of any packet data with */
/* references still there. */
/* It is expected that the caller has already acquired the correct locks or */
/* set the priority level correctly for this to block out other code paths */
/* into these data structures. */
/* ------------------------------------------------------------------------ */
int fr_authflush()
{
register int i, num_flushed;
mb_t *m;
if (fr_auth_lock)
return -1;
num_flushed = 0;
for (i = 0 ; i < fr_authsize; i++) {
m = fr_authpkts[i];
if (m != NULL) {
FREE_MB_T(m);
fr_authpkts[i] = NULL;
fr_auth[i].fra_index = -1;
/* perhaps add & use a flush counter instead. */
fr_authstats.fas_expire++;
fr_authused--;
num_flushed++;
}
}
fr_authstart = 0;
fr_authend = 0;
fr_authnext = 0;
return num_flushed;
}
/* ------------------------------------------------------------------------ */
/* Function: fr_auth_waiting */
/* Returns: int - 0 == no packets waiting, 1 == packets waiting. */
/* Parameters: None */
/* */
/* Simple truth check to see if there are any packets waiting in the auth */
/* queue. */
/* ------------------------------------------------------------------------ */
int fr_auth_waiting()
{
return (fr_authused != 0);
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authgeniter */
/* Returns: int - 0 == success, else error */
/* Parameters: token(I) - pointer to ipftoken structure */
/* itp(I) - pointer to ipfgeniter structure */
/* */
/* Copy out the next entry from the list of pre-auth rules, using the */
/* token to remember the current position between calls. */
/* ------------------------------------------------------------------------ */
int fr_authgeniter(token, itp)
ipftoken_t *token;
ipfgeniter_t *itp;
{
frauthent_t *fae, *next, zero;
int error;
if (itp->igi_data == NULL)
return EFAULT;
if (itp->igi_type != IPFGENITER_AUTH)
return EINVAL;
fae = token->ipt_data;
READ_ENTER(&ipf_auth);
if (fae == NULL) {
next = fae_list;
} else {
next = fae->fae_next;
}
if (next != NULL) {
/*
* If we find an auth entry to use, bump its reference count
* so that it can be used as the starting point when we come back.
*/
ATOMIC_INC(next->fae_ref);
if (next->fae_next == NULL) {
ipf_freetoken(token);
token = NULL;
} else {
token->ipt_data = next;
}
} else {
bzero(&zero, sizeof(zero));
next = &zero;
}
RWLOCK_EXIT(&ipf_auth);
/*
* If we had a prior pointer to an auth entry, release it.
*/
if (fae != NULL) {
WRITE_ENTER(&ipf_auth);
fr_authderef(&fae);
RWLOCK_EXIT(&ipf_auth);
}
/*
* This should arguably be via fr_outobj() so that the auth
* structure can (if required) be massaged going out.
*/
error = COPYOUT(next, itp->igi_data, sizeof(*next));
if (error != 0)
error = EFAULT;
return error;
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authderef */
/* Returns: None */
/* Parameters: faep(IO) - pointer to caller's frauthent_t pointer */
/* Locks: WRITE(ipf_auth) */
/* */
/* This function unconditionally sets the pointer in the caller to NULL, */
/* to make it clear that it should no longer use that pointer, and drops */
/* the reference count on the structure by 1. If it reaches 0, free it up. */
/* ------------------------------------------------------------------------ */
void fr_authderef(faep)
frauthent_t **faep;
{
frauthent_t *fae;
fae = *faep;
*faep = NULL;
fae->fae_ref--;
if (fae->fae_ref == 0) {
KFREE(fae);
}
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authwait */
/* Returns: int - 0 == success, else error */
/* Parameters: data(I) - pointer to data from ioctl call */
/* */
/* This function is called when an application is waiting for a packet to */
/* match an "auth" rule by issuing an SIOCAUTHW ioctl. If there is already */
/* a packet waiting on the queue then we will return that _one_ immediately.*/
/* If there are no packets present in the queue (fr_authpkts) then we go to */
/* sleep. */
/* ------------------------------------------------------------------------ */
int fr_authwait(data)
char *data;
{
frauth_t auth, *au = &auth;
int error, len, i;
mb_t *m;
char *t;
#if defined(_KERNEL) && !defined(MENTAT) && !defined(linux) && \
(!defined(__FreeBSD_version) || (__FreeBSD_version < 501000))
SPL_INT(s);
#endif
fr_authioctlloop:
error = fr_inobj(data, au, IPFOBJ_FRAUTH);
if (error != 0)
return error;
/*
* XXX Locks are held below over calls to copyout...a better
* solution needs to be found so this isn't necessary. What we are
* guarding against here is that an error in the copyout steps should
* not cause the packet to "disappear" from the queue.
*/
READ_ENTER(&ipf_auth);
/*
* If fr_authused is non-zero it will be because there is a packet
* waiting to be dealt with in the fr_authpkts array. We
* copy as much of that out to user space as requested.
*/
if (fr_authused > 0) {
while (fr_authpkts[fr_authnext] == NULL) {
fr_authnext++;
if (fr_authnext == fr_authsize)
fr_authnext = 0;
}
error = fr_outobj(data, &fr_auth[fr_authnext], IPFOBJ_FRAUTH);
if (error != 0) {
RWLOCK_EXIT(&ipf_auth);
return error;
}
if (auth.fra_len != 0 && auth.fra_buf != NULL) {
/*
* Copy packet contents out to user space if
* requested. Bail on an error.
*/
m = fr_authpkts[fr_authnext];
len = MSGDSIZE(m);
if (len > auth.fra_len)
len = auth.fra_len;
auth.fra_len = len;
for (t = auth.fra_buf; m && (len > 0); ) {
i = MIN(M_LEN(m), len);
error = copyoutptr(MTOD(m, char *), &t, i);
len -= i;
t += i;
if (error != 0) {
RWLOCK_EXIT(&ipf_auth);
return error;
}
m = m->m_next;
}
}
RWLOCK_EXIT(&ipf_auth);
SPL_NET(s);
WRITE_ENTER(&ipf_auth);
fr_authnext++;
if (fr_authnext == fr_authsize)
fr_authnext = 0;
RWLOCK_EXIT(&ipf_auth);
SPL_X(s);
return 0;
}
RWLOCK_EXIT(&ipf_auth);
MUTEX_ENTER(&ipf_authmx);
#ifdef _KERNEL
# if SOLARIS
error = 0;
if (!cv_wait_sig(&ipfauthwait, &ipf_authmx.ipf_lk))
error = EINTR;
# else /* SOLARIS */
# ifdef __hpux
{
lock_t *l;
l = get_sleep_lock(&fr_authnext);
error = sleep(&fr_authnext, PZERO+1);
spinunlock(l);
}
# else
# ifdef __osf__
error = mpsleep(&fr_authnext, PSUSP|PCATCH, "fr_authnext", 0,
&ipf_authmx, MS_LOCK_SIMPLE);
# else
error = SLEEP(&fr_authnext, "fr_authnext");
# endif /* __osf__ */
# endif /* __hpux */
# endif /* SOLARIS */
#endif
MUTEX_EXIT(&ipf_authmx);
if (error == 0)
goto fr_authioctlloop;
return error;
}
/* ------------------------------------------------------------------------ */
/* Function: fr_authreply */
/* Returns: int - 0 == success, else error */
/* Parameters: data(I) - pointer to data from ioctl call */
/* */
/* This function is called by an application when it wants to return a */
/* decision on a packet using the SIOCAUTHR ioctl. This is after it has */
/* received information using an SIOCAUTHW. The decision is returned in */
/* the form of flags, the same as those used in each rule. */
/* ------------------------------------------------------------------------ */
int fr_authreply(data)
char *data;
{
frauth_t auth, *au = &auth, *fra;
int error, i;
mb_t *m;
SPL_INT(s);
error = fr_inobj(data, &auth, IPFOBJ_FRAUTH);
if (error != 0)
return error;
SPL_NET(s);
WRITE_ENTER(&ipf_auth);
i = au->fra_index;
fra = fr_auth + i;
error = 0;
/*
* Check the validity of the information being returned with two simple
* checks. First, the auth index value should be within the size of
* the array and, second, the packet id being returned should match.
*/
if ((i < 0) || (i >= fr_authsize) ||
(fra->fra_info.fin_id != au->fra_info.fin_id)) {
RWLOCK_EXIT(&ipf_auth);
SPL_X(s);
return ESRCH;
}
m = fr_authpkts[i];
fra->fra_index = -2;
fra->fra_pass = au->fra_pass;
fr_authpkts[i] = NULL;
RWLOCK_EXIT(&ipf_auth);
/*
* Re-insert the packet back into the packet stream flowing through
* the kernel in a manner that will mean IPFilter sees the packet
* again. This is not the same as is done with fastroute,
* deliberately, as we want to resume the normal packet processing
* path for it.
*/
#ifdef _KERNEL
if ((m != NULL) && (au->fra_info.fin_out != 0)) {
error = ipf_inject(&fra->fra_info, m);
if (error != 0) {
error = ENOBUFS;
fr_authstats.fas_sendfail++;
} else {
fr_authstats.fas_sendok++;
}
} else if (m) {
error = ipf_inject(&fra->fra_info, m);
if (error != 0) {
error = ENOBUFS;
fr_authstats.fas_quefail++;
} else {
fr_authstats.fas_queok++;
}
} else {
error = EINVAL;
}
/*
* If we experience an error which will result in the packet
* not being processed, make sure we advance to the next one.
*/
if (error == ENOBUFS) {
WRITE_ENTER(&ipf_auth);
fr_authused--;
fra->fra_index = -1;
fra->fra_pass = 0;
if (i == fr_authstart) {
while (fra->fra_index == -1) {
i++;
if (i == fr_authsize)
i = 0;
fr_authstart = i;
if (i == fr_authend)
break;
}
if (fr_authstart == fr_authend) {
fr_authnext = 0;
fr_authstart = fr_authend = 0;
}
}
RWLOCK_EXIT(&ipf_auth);
}
#endif /* _KERNEL */
SPL_X(s);
return 0;
}
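/*
 * Taken together, SIOCAUTHW and SIOCAUTHR give userland a simple verdict
 * protocol over /dev/ipauth. A schematic client loop (sketch only: a
 * real client must marshal the frauth_t through the ipfobj_t wrapper
 * that fr_inobj()/fr_outobj() expect, and decide() is a hypothetical
 * policy function):
 *
 * fd = open("/dev/ipauth", O_RDWR);
 * for (;;) {
 * ioctl(fd, SIOCAUTHW, &obj); // sleep until a packet is held
 * auth.fra_pass = decide(&auth); // e.g. FR_PASS or FR_BLOCK
 * ioctl(fd, SIOCAUTHR, &obj); // return verdict; packet is reinjected
 * }
 */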
Index: head/sys/contrib/pf/net/pf.c
===================================================================
--- head/sys/contrib/pf/net/pf.c (revision 183549)
+++ head/sys/contrib/pf/net/pf.c (revision 183550)
@@ -1,7613 +1,7631 @@
/* $OpenBSD: pf.c,v 1.527 2007/02/22 15:23:23 pyr Exp $ */
/* add: $OpenBSD: pf.c,v 1.559 2007/09/18 18:45:59 markus Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
* Copyright (c) 2002,2003 Henning Brauer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Effort sponsored in part by the Defense Advanced Research Projects
* Agency (DARPA) and Air Force Research Laboratory, Air Force
* Materiel Command, USAF, under agreement number F30602-01-2-0537.
*
*/
#ifdef __FreeBSD__
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#endif
#ifdef __FreeBSD__
#include "opt_mac.h"
#include "opt_bpf.h"
#include "opt_pf.h"
#ifdef DEV_BPF
#define NBPFILTER DEV_BPF
#else
#define NBPFILTER 0
#endif
#ifdef DEV_PFLOG
#define NPFLOG DEV_PFLOG
#else
#define NPFLOG 0
#endif
#ifdef DEV_PFSYNC
#define NPFSYNC DEV_PFSYNC
#else
#define NPFSYNC 0
#endif
#else
#include "bpfilter.h"
#include "pflog.h"
#include "pfsync.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/filio.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/time.h>
#ifdef __FreeBSD__
#include <sys/sysctl.h>
#include <sys/endian.h>
#else
#include <sys/pool.h>
#endif
#include <sys/proc.h>
#ifdef __FreeBSD__
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/vimage.h>
#else
#include <sys/rwlock.h>
#endif
#include <net/if.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/route.h>
#ifndef __FreeBSD__
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp_var.h>
#include <netinet/icmp_var.h>
#include <netinet/if_ether.h>
#ifndef __FreeBSD__
#include <dev/rndvar.h>
#endif
#include <net/pfvar.h>
#include <net/if_pflog.h>
#if NPFSYNC > 0
#include <net/if_pfsync.h>
#endif /* NPFSYNC > 0 */
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/in_pcb.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#ifdef __FreeBSD__
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#endif /* INET6 */
#ifdef __FreeBSD__
#include <machine/in_cksum.h>
#include <sys/limits.h>
#include <sys/ucred.h>
#include <security/mac/mac_framework.h>
extern int ip_optcopy(struct ip *, struct ip *);
extern int debug_pfugidhack;
#endif
#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x
/*
* Global variables
*/
struct pf_altqqueue pf_altqs[2];
struct pf_palist pf_pabuf;
struct pf_altqqueue *pf_altqs_active;
struct pf_altqqueue *pf_altqs_inactive;
struct pf_status pf_status;
u_int32_t ticket_altqs_active;
u_int32_t ticket_altqs_inactive;
int altqs_inactive_open;
u_int32_t ticket_pabuf;
struct pf_anchor_stackframe {
struct pf_ruleset *rs;
struct pf_rule *r;
struct pf_anchor_node *parent;
struct pf_anchor *child;
} pf_anchor_stack[64];
#ifdef __FreeBSD__
uma_zone_t pf_src_tree_pl, pf_rule_pl;
uma_zone_t pf_state_pl, pf_altq_pl, pf_pooladdr_pl;
#else
struct pool pf_src_tree_pl, pf_rule_pl;
struct pool pf_state_pl, pf_altq_pl, pf_pooladdr_pl;
#endif
void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t);
void pf_init_threshold(struct pf_threshold *, u_int32_t,
u_int32_t);
void pf_add_threshold(struct pf_threshold *);
int pf_check_threshold(struct pf_threshold *);
void pf_change_ap(struct pf_addr *, u_int16_t *,
u_int16_t *, u_int16_t *, struct pf_addr *,
u_int16_t, u_int8_t, sa_family_t);
int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
struct tcphdr *, struct pf_state_peer *);
#ifdef INET6
void pf_change_a6(struct pf_addr *, u_int16_t *,
struct pf_addr *, u_int8_t);
#endif /* INET6 */
void pf_change_icmp(struct pf_addr *, u_int16_t *,
struct pf_addr *, struct pf_addr *, u_int16_t,
u_int16_t *, u_int16_t *, u_int16_t *,
u_int16_t *, u_int8_t, sa_family_t);
#ifdef __FreeBSD__
void pf_send_tcp(struct mbuf *,
const struct pf_rule *, sa_family_t,
#else
void pf_send_tcp(const struct pf_rule *, sa_family_t,
#endif
const struct pf_addr *, const struct pf_addr *,
u_int16_t, u_int16_t, u_int32_t, u_int32_t,
u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
u_int16_t, struct ether_header *, struct ifnet *);
void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
sa_family_t, struct pf_rule *);
struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *,
int, int, struct pfi_kif *,
struct pf_addr *, u_int16_t, struct pf_addr *,
u_int16_t, int);
struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *,
int, int, struct pfi_kif *, struct pf_src_node **,
struct pf_addr *, u_int16_t,
struct pf_addr *, u_int16_t,
struct pf_addr *, u_int16_t *);
int pf_test_tcp(struct pf_rule **, struct pf_state **,
int, struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, struct pf_rule **,
#ifdef __FreeBSD__
struct pf_ruleset **, struct ifqueue *,
struct inpcb *);
#else
struct pf_ruleset **, struct ifqueue *);
#endif
int pf_test_udp(struct pf_rule **, struct pf_state **,
int, struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, struct pf_rule **,
#ifdef __FreeBSD__
struct pf_ruleset **, struct ifqueue *,
struct inpcb *);
#else
struct pf_ruleset **, struct ifqueue *);
#endif
int pf_test_icmp(struct pf_rule **, struct pf_state **,
int, struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, struct pf_rule **,
struct pf_ruleset **, struct ifqueue *);
int pf_test_other(struct pf_rule **, struct pf_state **,
int, struct pfi_kif *, struct mbuf *, int, void *,
struct pf_pdesc *, struct pf_rule **,
struct pf_ruleset **, struct ifqueue *);
int pf_test_fragment(struct pf_rule **, int,
struct pfi_kif *, struct mbuf *, void *,
struct pf_pdesc *, struct pf_rule **,
struct pf_ruleset **);
int pf_test_state_tcp(struct pf_state **, int,
struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, u_short *);
int pf_test_state_udp(struct pf_state **, int,
struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *);
int pf_test_state_icmp(struct pf_state **, int,
struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, u_short *);
int pf_test_state_other(struct pf_state **, int,
struct pfi_kif *, struct pf_pdesc *);
int pf_match_tag(struct mbuf *, struct pf_rule *,
struct pf_mtag *, int *);
int pf_step_out_of_anchor(int *, struct pf_ruleset **,
int, struct pf_rule **, struct pf_rule **,
int *);
void pf_hash(struct pf_addr *, struct pf_addr *,
struct pf_poolhashkey *, sa_family_t);
int pf_map_addr(u_int8_t, struct pf_rule *,
struct pf_addr *, struct pf_addr *,
struct pf_addr *, struct pf_src_node **);
int pf_get_sport(sa_family_t, u_int8_t, struct pf_rule *,
struct pf_addr *, struct pf_addr *, u_int16_t,
struct pf_addr *, u_int16_t*, u_int16_t, u_int16_t,
struct pf_src_node **);
void pf_route(struct mbuf **, struct pf_rule *, int,
struct ifnet *, struct pf_state *,
struct pf_pdesc *);
void pf_route6(struct mbuf **, struct pf_rule *, int,
struct ifnet *, struct pf_state *,
struct pf_pdesc *);
#ifdef __FreeBSD__
/* XXX: import */
#else
int pf_socket_lookup(int, struct pf_pdesc *);
#endif
u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t,
sa_family_t);
u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t,
sa_family_t);
u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t,
u_int16_t);
void pf_set_rt_ifp(struct pf_state *,
struct pf_addr *);
int pf_check_proto_cksum(struct mbuf *, int, int,
u_int8_t, sa_family_t);
int pf_addr_wrap_neq(struct pf_addr_wrap *,
struct pf_addr_wrap *);
struct pf_state *pf_find_state_recurse(struct pfi_kif *,
struct pf_state_cmp *, u_int8_t);
int pf_src_connlimit(struct pf_state **);
int pf_check_congestion(struct ifqueue *);
#ifdef __FreeBSD__
int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
extern int pf_end_threads;
struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX];
#else
extern struct pool pfr_ktable_pl;
extern struct pool pfr_kentry_pl;
struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX] = {
{ &pf_state_pl, PFSTATE_HIWAT },
{ &pf_src_tree_pl, PFSNODE_HIWAT },
{ &pf_frent_pl, PFFRAG_FRENT_HIWAT },
{ &pfr_ktable_pl, PFR_KTABLE_HIWAT },
{ &pfr_kentry_pl, PFR_KENTRY_HIWAT }
};
#endif
#define STATE_LOOKUP() \
do { \
if (direction == PF_IN) \
*state = pf_find_state_recurse( \
kif, &key, PF_EXT_GWY); \
else \
*state = pf_find_state_recurse( \
kif, &key, PF_LAN_EXT); \
if (*state == NULL || (*state)->timeout == PFTM_PURGE) \
return (PF_DROP); \
if (direction == PF_OUT && \
(((*state)->rule.ptr->rt == PF_ROUTETO && \
(*state)->rule.ptr->direction == PF_OUT) || \
((*state)->rule.ptr->rt == PF_REPLYTO && \
(*state)->rule.ptr->direction == PF_IN)) && \
(*state)->rt_kif != NULL && \
(*state)->rt_kif != kif) \
return (PF_PASS); \
} while (0)
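/*
 * Direction note for STATE_LOOKUP above: an inbound packet is observed
 * with its external and gateway (post-NAT) address pair, so it is looked
 * up in the ext_gwy tree; an outbound packet still carries the internal
 * lan/ext pair and is looked up in the lan_ext tree.
 */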
#define STATE_TRANSLATE(s) \
(s)->lan.addr.addr32[0] != (s)->gwy.addr.addr32[0] || \
((s)->af == AF_INET6 && \
((s)->lan.addr.addr32[1] != (s)->gwy.addr.addr32[1] || \
(s)->lan.addr.addr32[2] != (s)->gwy.addr.addr32[2] || \
(s)->lan.addr.addr32[3] != (s)->gwy.addr.addr32[3])) || \
(s)->lan.port != (s)->gwy.port
#define BOUND_IFACE(r, k) \
((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all
#define STATE_INC_COUNTERS(s) \
do { \
s->rule.ptr->states++; \
if (s->anchor.ptr != NULL) \
s->anchor.ptr->states++; \
if (s->nat_rule.ptr != NULL) \
s->nat_rule.ptr->states++; \
} while (0)
#define STATE_DEC_COUNTERS(s) \
do { \
if (s->nat_rule.ptr != NULL) \
s->nat_rule.ptr->states--; \
if (s->anchor.ptr != NULL) \
s->anchor.ptr->states--; \
s->rule.ptr->states--; \
} while (0)
struct pf_src_tree tree_src_tracking;
struct pf_state_tree_id tree_id;
struct pf_state_queue state_list;
#ifdef __FreeBSD__
static int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
static int pf_state_compare_lan_ext(struct pf_state *, struct pf_state *);
static int pf_state_compare_ext_gwy(struct pf_state *, struct pf_state *);
static int pf_state_compare_id(struct pf_state *, struct pf_state *);
#endif
RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
RB_GENERATE(pf_state_tree_lan_ext, pf_state,
u.s.entry_lan_ext, pf_state_compare_lan_ext);
RB_GENERATE(pf_state_tree_ext_gwy, pf_state,
u.s.entry_ext_gwy, pf_state_compare_ext_gwy);
RB_GENERATE(pf_state_tree_id, pf_state,
u.s.entry_id, pf_state_compare_id);
#ifdef __FreeBSD__
static int
#else
static __inline int
#endif
pf_src_compare(struct pf_src_node *a, struct pf_src_node *b)
{
int diff;
if (a->rule.ptr > b->rule.ptr)
return (1);
if (a->rule.ptr < b->rule.ptr)
return (-1);
if ((diff = a->af - b->af) != 0)
return (diff);
switch (a->af) {
#ifdef INET
case AF_INET:
if (a->addr.addr32[0] > b->addr.addr32[0])
return (1);
if (a->addr.addr32[0] < b->addr.addr32[0])
return (-1);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (a->addr.addr32[3] > b->addr.addr32[3])
return (1);
if (a->addr.addr32[3] < b->addr.addr32[3])
return (-1);
if (a->addr.addr32[2] > b->addr.addr32[2])
return (1);
if (a->addr.addr32[2] < b->addr.addr32[2])
return (-1);
if (a->addr.addr32[1] > b->addr.addr32[1])
return (1);
if (a->addr.addr32[1] < b->addr.addr32[1])
return (-1);
if (a->addr.addr32[0] > b->addr.addr32[0])
return (1);
if (a->addr.addr32[0] < b->addr.addr32[0])
return (-1);
break;
#endif /* INET6 */
}
return (0);
}
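/*
 * The comparators in this file only need to induce a consistent total
 * order for the red-black trees, not a numerically meaningful one, which
 * is why the IPv6 words above are compared from addr32[3] down to
 * addr32[0] and in network byte order without any byte swapping.
 */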
#ifdef __FreeBSD__
static int
#else
static __inline int
#endif
pf_state_compare_lan_ext(struct pf_state *a, struct pf_state *b)
{
int diff;
if ((diff = a->proto - b->proto) != 0)
return (diff);
if ((diff = a->af - b->af) != 0)
return (diff);
switch (a->af) {
#ifdef INET
case AF_INET:
if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0])
return (1);
if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0])
return (-1);
if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0])
return (1);
if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0])
return (-1);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (a->lan.addr.addr32[3] > b->lan.addr.addr32[3])
return (1);
if (a->lan.addr.addr32[3] < b->lan.addr.addr32[3])
return (-1);
if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3])
return (1);
if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3])
return (-1);
if (a->lan.addr.addr32[2] > b->lan.addr.addr32[2])
return (1);
if (a->lan.addr.addr32[2] < b->lan.addr.addr32[2])
return (-1);
if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2])
return (1);
if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2])
return (-1);
if (a->lan.addr.addr32[1] > b->lan.addr.addr32[1])
return (1);
if (a->lan.addr.addr32[1] < b->lan.addr.addr32[1])
return (-1);
if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1])
return (1);
if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1])
return (-1);
if (a->lan.addr.addr32[0] > b->lan.addr.addr32[0])
return (1);
if (a->lan.addr.addr32[0] < b->lan.addr.addr32[0])
return (-1);
if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0])
return (1);
if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0])
return (-1);
break;
#endif /* INET6 */
}
if ((diff = a->lan.port - b->lan.port) != 0)
return (diff);
if ((diff = a->ext.port - b->ext.port) != 0)
return (diff);
return (0);
}
#ifdef __FreeBSD__
static int
#else
static __inline int
#endif
pf_state_compare_ext_gwy(struct pf_state *a, struct pf_state *b)
{
int diff;
if ((diff = a->proto - b->proto) != 0)
return (diff);
if ((diff = a->af - b->af) != 0)
return (diff);
switch (a->af) {
#ifdef INET
case AF_INET:
if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0])
return (1);
if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0])
return (-1);
if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0])
return (1);
if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0])
return (-1);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (a->ext.addr.addr32[3] > b->ext.addr.addr32[3])
return (1);
if (a->ext.addr.addr32[3] < b->ext.addr.addr32[3])
return (-1);
if (a->gwy.addr.addr32[3] > b->gwy.addr.addr32[3])
return (1);
if (a->gwy.addr.addr32[3] < b->gwy.addr.addr32[3])
return (-1);
if (a->ext.addr.addr32[2] > b->ext.addr.addr32[2])
return (1);
if (a->ext.addr.addr32[2] < b->ext.addr.addr32[2])
return (-1);
if (a->gwy.addr.addr32[2] > b->gwy.addr.addr32[2])
return (1);
if (a->gwy.addr.addr32[2] < b->gwy.addr.addr32[2])
return (-1);
if (a->ext.addr.addr32[1] > b->ext.addr.addr32[1])
return (1);
if (a->ext.addr.addr32[1] < b->ext.addr.addr32[1])
return (-1);
if (a->gwy.addr.addr32[1] > b->gwy.addr.addr32[1])
return (1);
if (a->gwy.addr.addr32[1] < b->gwy.addr.addr32[1])
return (-1);
if (a->ext.addr.addr32[0] > b->ext.addr.addr32[0])
return (1);
if (a->ext.addr.addr32[0] < b->ext.addr.addr32[0])
return (-1);
if (a->gwy.addr.addr32[0] > b->gwy.addr.addr32[0])
return (1);
if (a->gwy.addr.addr32[0] < b->gwy.addr.addr32[0])
return (-1);
break;
#endif /* INET6 */
}
if ((diff = a->ext.port - b->ext.port) != 0)
return (diff);
if ((diff = a->gwy.port - b->gwy.port) != 0)
return (diff);
return (0);
}
#ifdef __FreeBSD__
static int
#else
static __inline int
#endif
pf_state_compare_id(struct pf_state *a, struct pf_state *b)
{
if (a->id > b->id)
return (1);
if (a->id < b->id)
return (-1);
if (a->creatorid > b->creatorid)
return (1);
if (a->creatorid < b->creatorid)
return (-1);
return (0);
}
#ifdef INET6
void
pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
{
switch (af) {
#ifdef INET
case AF_INET:
dst->addr32[0] = src->addr32[0];
break;
#endif /* INET */
case AF_INET6:
dst->addr32[0] = src->addr32[0];
dst->addr32[1] = src->addr32[1];
dst->addr32[2] = src->addr32[2];
dst->addr32[3] = src->addr32[3];
break;
}
}
#endif /* INET6 */
struct pf_state *
pf_find_state_byid(struct pf_state_cmp *key)
{
pf_status.fcounters[FCNT_STATE_SEARCH]++;
return (RB_FIND(pf_state_tree_id, &tree_id, (struct pf_state *)key));
}
struct pf_state *
pf_find_state_recurse(struct pfi_kif *kif, struct pf_state_cmp *key, u_int8_t tree)
{
struct pf_state *s;
pf_status.fcounters[FCNT_STATE_SEARCH]++;
switch (tree) {
case PF_LAN_EXT:
if ((s = RB_FIND(pf_state_tree_lan_ext, &kif->pfik_lan_ext,
(struct pf_state *)key)) != NULL)
return (s);
if ((s = RB_FIND(pf_state_tree_lan_ext, &pfi_all->pfik_lan_ext,
(struct pf_state *)key)) != NULL)
return (s);
return (NULL);
case PF_EXT_GWY:
if ((s = RB_FIND(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy,
(struct pf_state *)key)) != NULL)
return (s);
if ((s = RB_FIND(pf_state_tree_ext_gwy, &pfi_all->pfik_ext_gwy,
(struct pf_state *)key)) != NULL)
return (s);
return (NULL);
default:
panic("pf_find_state_recurse");
}
}
struct pf_state *
pf_find_state_all(struct pf_state_cmp *key, u_int8_t tree, int *more)
{
struct pf_state *s, *ss = NULL;
struct pfi_kif *kif;
pf_status.fcounters[FCNT_STATE_SEARCH]++;
switch (tree) {
case PF_LAN_EXT:
TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) {
s = RB_FIND(pf_state_tree_lan_ext,
&kif->pfik_lan_ext, (struct pf_state *)key);
if (s == NULL)
continue;
if (more == NULL)
return (s);
ss = s;
(*more)++;
}
return (ss);
case PF_EXT_GWY:
TAILQ_FOREACH(kif, &pfi_statehead, pfik_w_states) {
s = RB_FIND(pf_state_tree_ext_gwy,
&kif->pfik_ext_gwy, (struct pf_state *)key);
if (s == NULL)
continue;
if (more == NULL)
return (s);
ss = s;
(*more)++;
}
return (ss);
default:
panic("pf_find_state_all");
}
}
void
pf_init_threshold(struct pf_threshold *threshold,
u_int32_t limit, u_int32_t seconds)
{
threshold->limit = limit * PF_THRESHOLD_MULT;
threshold->seconds = seconds;
threshold->count = 0;
threshold->last = time_second;
}
void
pf_add_threshold(struct pf_threshold *threshold)
{
u_int32_t t = time_second, diff = t - threshold->last;
if (diff >= threshold->seconds)
threshold->count = 0;
else
threshold->count -= threshold->count * diff /
threshold->seconds;
threshold->count += PF_THRESHOLD_MULT;
threshold->last = t;
}
int
pf_check_threshold(struct pf_threshold *threshold)
{
return (threshold->count > threshold->limit);
}
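/*
 * The threshold bookkeeping above is fixed-point: pf_init_threshold()
 * stores the limit scaled by PF_THRESHOLD_MULT and pf_add_threshold()
 * ages the count linearly before adding one scaled connection. A
 * hypothetical restatement of the decay step (assuming the usual
 * PF_THRESHOLD_MULT of 1000):
 */
static u_int32_t
pf_threshold_decay_demo(u_int32_t count, u_int32_t seconds, u_int32_t diff)
{
/* e.g. a count of 5000 with diff = 5 of seconds = 10 decays to 2500 */
if (diff >= seconds)
return (0);
return (count - count * diff / seconds);
}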
int
pf_src_connlimit(struct pf_state **state)
{
struct pf_state *s;
int bad = 0;
(*state)->src_node->conn++;
(*state)->src.tcp_est = 1;
pf_add_threshold(&(*state)->src_node->conn_rate);
if ((*state)->rule.ptr->max_src_conn &&
(*state)->rule.ptr->max_src_conn <
(*state)->src_node->conn) {
pf_status.lcounters[LCNT_SRCCONN]++;
bad++;
}
if ((*state)->rule.ptr->max_src_conn_rate.limit &&
pf_check_threshold(&(*state)->src_node->conn_rate)) {
pf_status.lcounters[LCNT_SRCCONNRATE]++;
bad++;
}
if (!bad)
return (0);
if ((*state)->rule.ptr->overload_tbl) {
struct pfr_addr p;
u_int32_t killed = 0;
pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf_src_connlimit: blocking address ");
pf_print_host(&(*state)->src_node->addr, 0,
(*state)->af);
}
bzero(&p, sizeof(p));
p.pfra_af = (*state)->af;
switch ((*state)->af) {
#ifdef INET
case AF_INET:
p.pfra_net = 32;
p.pfra_ip4addr = (*state)->src_node->addr.v4;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
p.pfra_net = 128;
p.pfra_ip6addr = (*state)->src_node->addr.v6;
break;
#endif /* INET6 */
}
pfr_insert_kentry((*state)->rule.ptr->overload_tbl,
&p, time_second);
/* kill existing states if that's required. */
if ((*state)->rule.ptr->flush) {
pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
RB_FOREACH(s, pf_state_tree_id, &tree_id) {
/*
* Kill states from this source. (Only those
* from the same rule if PF_FLUSH_GLOBAL is not
* set)
*/
if (s->af == (*state)->af &&
(((*state)->direction == PF_OUT &&
PF_AEQ(&(*state)->src_node->addr,
&s->lan.addr, s->af)) ||
((*state)->direction == PF_IN &&
PF_AEQ(&(*state)->src_node->addr,
&s->ext.addr, s->af))) &&
((*state)->rule.ptr->flush &
PF_FLUSH_GLOBAL ||
(*state)->rule.ptr == s->rule.ptr)) {
s->timeout = PFTM_PURGE;
s->src.state = s->dst.state =
TCPS_CLOSED;
killed++;
}
}
if (pf_status.debug >= PF_DEBUG_MISC)
printf(", %u states killed", killed);
}
if (pf_status.debug >= PF_DEBUG_MISC)
printf("\n");
}
/* kill this state */
(*state)->timeout = PFTM_PURGE;
(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
return (1);
}
int
pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
struct pf_addr *src, sa_family_t af)
{
struct pf_src_node k;
if (*sn == NULL) {
k.af = af;
PF_ACPY(&k.addr, src, af);
if (rule->rule_flag & PFRULE_RULESRCTRACK ||
rule->rpool.opts & PF_POOL_STICKYADDR)
k.rule.ptr = rule;
else
k.rule.ptr = NULL;
pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
*sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k);
}
if (*sn == NULL) {
if (!rule->max_src_nodes ||
rule->src_nodes < rule->max_src_nodes)
(*sn) = pool_get(&pf_src_tree_pl, PR_NOWAIT);
else
pf_status.lcounters[LCNT_SRCNODES]++;
if ((*sn) == NULL)
return (-1);
bzero(*sn, sizeof(struct pf_src_node));
pf_init_threshold(&(*sn)->conn_rate,
rule->max_src_conn_rate.limit,
rule->max_src_conn_rate.seconds);
(*sn)->af = af;
if (rule->rule_flag & PFRULE_RULESRCTRACK ||
rule->rpool.opts & PF_POOL_STICKYADDR)
(*sn)->rule.ptr = rule;
else
(*sn)->rule.ptr = NULL;
PF_ACPY(&(*sn)->addr, src, af);
if (RB_INSERT(pf_src_tree,
&tree_src_tracking, *sn) != NULL) {
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: src_tree insert failed: ");
pf_print_host(&(*sn)->addr, 0, af);
printf("\n");
}
pool_put(&pf_src_tree_pl, *sn);
return (-1);
}
(*sn)->creation = time_second;
(*sn)->ruletype = rule->action;
if ((*sn)->rule.ptr != NULL)
(*sn)->rule.ptr->src_nodes++;
pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
pf_status.src_nodes++;
} else {
if (rule->max_src_states &&
(*sn)->states >= rule->max_src_states) {
pf_status.lcounters[LCNT_SRCSTATES]++;
return (-1);
}
}
return (0);
}
int
pf_insert_state(struct pfi_kif *kif, struct pf_state *state)
{
/* Thou MUST NOT insert multiple duplicate keys */
state->u.s.kif = kif;
if (RB_INSERT(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state)) {
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: state insert failed: tree_lan_ext");
printf(" lan: ");
pf_print_host(&state->lan.addr, state->lan.port,
state->af);
printf(" gwy: ");
pf_print_host(&state->gwy.addr, state->gwy.port,
state->af);
printf(" ext: ");
pf_print_host(&state->ext.addr, state->ext.port,
state->af);
if (state->sync_flags & PFSTATE_FROMSYNC)
printf(" (from sync)");
printf("\n");
}
return (-1);
}
if (RB_INSERT(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state)) {
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: state insert failed: tree_ext_gwy");
printf(" lan: ");
pf_print_host(&state->lan.addr, state->lan.port,
state->af);
printf(" gwy: ");
pf_print_host(&state->gwy.addr, state->gwy.port,
state->af);
printf(" ext: ");
pf_print_host(&state->ext.addr, state->ext.port,
state->af);
if (state->sync_flags & PFSTATE_FROMSYNC)
printf(" (from sync)");
printf("\n");
}
RB_REMOVE(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state);
return (-1);
}
if (state->id == 0 && state->creatorid == 0) {
state->id = htobe64(pf_status.stateid++);
state->creatorid = pf_status.hostid;
}
if (RB_INSERT(pf_state_tree_id, &tree_id, state) != NULL) {
if (pf_status.debug >= PF_DEBUG_MISC) {
#ifdef __FreeBSD__
printf("pf: state insert failed: "
"id: %016llx creatorid: %08x",
(long long)be64toh(state->id),
ntohl(state->creatorid));
#else
printf("pf: state insert failed: "
"id: %016llx creatorid: %08x",
betoh64(state->id), ntohl(state->creatorid));
#endif
if (state->sync_flags & PFSTATE_FROMSYNC)
printf(" (from sync)");
printf("\n");
}
RB_REMOVE(pf_state_tree_lan_ext, &kif->pfik_lan_ext, state);
RB_REMOVE(pf_state_tree_ext_gwy, &kif->pfik_ext_gwy, state);
return (-1);
}
TAILQ_INSERT_TAIL(&state_list, state, u.s.entry_list);
pf_status.fcounters[FCNT_STATE_INSERT]++;
pf_status.states++;
pfi_kif_ref(kif, PFI_KIF_REF_STATE);
#if NPFSYNC
pfsync_insert_state(state);
#endif
return (0);
}
void
pf_purge_thread(void *v)
{
int nloops = 0, s;
for (;;) {
tsleep(pf_purge_thread, PWAIT, "pftm", 1 * hz);
#ifdef __FreeBSD__
sx_slock(&pf_consistency_lock);
PF_LOCK();
if (pf_end_threads) {
pf_purge_expired_states(pf_status.states);
pf_purge_expired_fragments();
pf_purge_expired_src_nodes(0);
pf_end_threads++;
sx_sunlock(&pf_consistency_lock);
PF_UNLOCK();
wakeup(pf_purge_thread);
kproc_exit(0);
}
#endif
s = splsoftnet();
/* process a fraction of the state table every second */
pf_purge_expired_states(1 + (pf_status.states
/ pf_default_rule.timeout[PFTM_INTERVAL]));
/* purge other expired types every PFTM_INTERVAL seconds */
if (++nloops >= pf_default_rule.timeout[PFTM_INTERVAL]) {
pf_purge_expired_fragments();
pf_purge_expired_src_nodes(0);
nloops = 0;
}
splx(s);
#ifdef __FreeBSD__
PF_UNLOCK();
sx_sunlock(&pf_consistency_lock);
#endif
}
}
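/*
 * Pacing example for the loop above: with PFTM_INTERVAL at its default
 * of 10 seconds and 50000 states, each one-second pass checks
 * 1 + 50000 / 10 = 5001 entries, so the whole state table is visited
 * roughly once per interval while fragments and source nodes are purged
 * exactly once per interval.
 */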
u_int32_t
pf_state_expires(const struct pf_state *state)
{
u_int32_t timeout;
u_int32_t start;
u_int32_t end;
u_int32_t states;
/* handle all PFTM_* > PFTM_MAX here */
if (state->timeout == PFTM_PURGE)
return (time_second);
if (state->timeout == PFTM_UNTIL_PACKET)
return (0);
#ifdef __FreeBSD__
KASSERT(state->timeout != PFTM_UNLINKED,
("pf_state_expires: timeout == PFTM_UNLINKED"));
KASSERT((state->timeout < PFTM_MAX),
("pf_state_expires: timeout > PFTM_MAX"));
#else
KASSERT(state->timeout != PFTM_UNLINKED);
KASSERT(state->timeout < PFTM_MAX);
#endif
timeout = state->rule.ptr->timeout[state->timeout];
if (!timeout)
timeout = pf_default_rule.timeout[state->timeout];
start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
if (start) {
end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
states = state->rule.ptr->states;
} else {
start = pf_default_rule.timeout[PFTM_ADAPTIVE_START];
end = pf_default_rule.timeout[PFTM_ADAPTIVE_END];
states = pf_status.states;
}
if (end && states > start && start < end) {
if (states < end)
return (state->expire + timeout * (end - states) /
(end - start));
else
return (time_second);
}
return (state->expire + timeout);
}
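/*
 * A hypothetical restatement of the adaptive scaling above: the base
 * timeout shrinks linearly from 100% at "start" tracked states down to
 * zero at "end". E.g. timeout 3600, start 6000, end 12000 and 9000
 * states gives 3600 * (12000 - 9000) / (12000 - 6000) = 1800 seconds.
 */
static u_int32_t
pf_adaptive_timeout_demo(u_int32_t timeout, u_int32_t start, u_int32_t end,
    u_int32_t states)
{
if (end == 0 || states <= start || start >= end)
return (timeout); /* outside the adaptive region */
if (states >= end)
return (0); /* fully adapted: expire immediately */
return (timeout * (end - states) / (end - start));
}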
void
pf_purge_expired_src_nodes(int waslocked)
{
struct pf_src_node *cur, *next;
int locked = waslocked;
for (cur = RB_MIN(pf_src_tree, &tree_src_tracking); cur; cur = next) {
next = RB_NEXT(pf_src_tree, &tree_src_tracking, cur);
if (cur->states <= 0 && cur->expire <= time_second) {
if (! locked) {
#ifdef __FreeBSD__
if (!sx_try_upgrade(&pf_consistency_lock)) {
PF_UNLOCK();
sx_sunlock(&pf_consistency_lock);
sx_xlock(&pf_consistency_lock);
PF_LOCK();
}
#else
rw_enter_write(&pf_consistency_lock);
#endif
next = RB_NEXT(pf_src_tree,
&tree_src_tracking, cur);
locked = 1;
}
if (cur->rule.ptr != NULL) {
cur->rule.ptr->src_nodes--;
if (cur->rule.ptr->states <= 0 &&
cur->rule.ptr->max_src_nodes <= 0)
pf_rm_rule(NULL, cur->rule.ptr);
}
RB_REMOVE(pf_src_tree, &tree_src_tracking, cur);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, cur);
}
}
if (locked && !waslocked)
#ifdef __FreeBSD__
sx_downgrade(&pf_consistency_lock);
#else
rw_exit_write(&pf_consistency_lock);
#endif
}
void
pf_src_tree_remove_state(struct pf_state *s)
{
u_int32_t timeout;
if (s->src_node != NULL) {
if (s->proto == IPPROTO_TCP) {
if (s->src.tcp_est)
--s->src_node->conn;
}
if (--s->src_node->states <= 0) {
timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
if (!timeout)
timeout =
pf_default_rule.timeout[PFTM_SRC_NODE];
s->src_node->expire = time_second + timeout;
}
}
if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
if (--s->nat_src_node->states <= 0) {
timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
if (!timeout)
timeout =
pf_default_rule.timeout[PFTM_SRC_NODE];
s->nat_src_node->expire = time_second + timeout;
}
}
s->src_node = s->nat_src_node = NULL;
}
/* callers should be at splsoftnet */
void
pf_unlink_state(struct pf_state *cur)
{
#ifdef __FreeBSD__
if (cur->local_flags & PFSTATE_EXPIRING)
return;
cur->local_flags |= PFSTATE_EXPIRING;
#endif
if (cur->src.state == PF_TCPS_PROXY_DST) {
#ifdef __FreeBSD__
pf_send_tcp(NULL, cur->rule.ptr, cur->af,
#else
pf_send_tcp(cur->rule.ptr, cur->af,
#endif
&cur->ext.addr, &cur->lan.addr,
cur->ext.port, cur->lan.port,
cur->src.seqhi, cur->src.seqlo + 1,
TH_RST|TH_ACK, 0, 0, 0, 1, cur->tag, NULL, NULL);
}
RB_REMOVE(pf_state_tree_ext_gwy,
&cur->u.s.kif->pfik_ext_gwy, cur);
RB_REMOVE(pf_state_tree_lan_ext,
&cur->u.s.kif->pfik_lan_ext, cur);
RB_REMOVE(pf_state_tree_id, &tree_id, cur);
#if NPFSYNC
if (cur->creatorid == pf_status.hostid)
pfsync_delete_state(cur);
#endif
cur->timeout = PFTM_UNLINKED;
pf_src_tree_remove_state(cur);
}
/* callers should be at splsoftnet and hold the
* write_lock on pf_consistency_lock */
void
pf_free_state(struct pf_state *cur)
{
#if NPFSYNC
if (pfsyncif != NULL &&
(pfsyncif->sc_bulk_send_next == cur ||
pfsyncif->sc_bulk_terminator == cur))
return;
#endif
#ifdef __FreeBSD__
KASSERT(cur->timeout == PFTM_UNLINKED,
("pf_free_state: cur->timeout != PFTM_UNLINKED"));
#else
KASSERT(cur->timeout == PFTM_UNLINKED);
#endif
if (--cur->rule.ptr->states <= 0 &&
cur->rule.ptr->src_nodes <= 0)
pf_rm_rule(NULL, cur->rule.ptr);
if (cur->nat_rule.ptr != NULL)
if (--cur->nat_rule.ptr->states <= 0 &&
cur->nat_rule.ptr->src_nodes <= 0)
pf_rm_rule(NULL, cur->nat_rule.ptr);
if (cur->anchor.ptr != NULL)
if (--cur->anchor.ptr->states <= 0)
pf_rm_rule(NULL, cur->anchor.ptr);
pf_normalize_tcp_cleanup(cur);
pfi_kif_unref(cur->u.s.kif, PFI_KIF_REF_STATE);
TAILQ_REMOVE(&state_list, cur, u.s.entry_list);
if (cur->tag)
pf_tag_unref(cur->tag);
pool_put(&pf_state_pl, cur);
pf_status.fcounters[FCNT_STATE_REMOVALS]++;
pf_status.states--;
}
void
pf_purge_expired_states(u_int32_t maxcheck)
{
static struct pf_state *cur = NULL;
struct pf_state *next;
int locked = 0;
while (maxcheck--) {
/* wrap to start of list when we hit the end */
if (cur == NULL) {
cur = TAILQ_FIRST(&state_list);
if (cur == NULL)
break; /* list empty */
}
/* get next state, as cur may get deleted */
next = TAILQ_NEXT(cur, u.s.entry_list);
if (cur->timeout == PFTM_UNLINKED) {
/* free unlinked state */
if (! locked) {
#ifdef __FreeBSD__
if (!sx_try_upgrade(&pf_consistency_lock)) {
PF_UNLOCK();
sx_sunlock(&pf_consistency_lock);
sx_xlock(&pf_consistency_lock);
PF_LOCK();
}
#else
rw_enter_write(&pf_consistency_lock);
#endif
locked = 1;
}
pf_free_state(cur);
} else if (pf_state_expires(cur) <= time_second) {
/* unlink and free expired state */
pf_unlink_state(cur);
if (! locked) {
#ifdef __FreeBSD__
if (!sx_try_upgrade(&pf_consistency_lock)) {
PF_UNLOCK();
sx_sunlock(&pf_consistency_lock);
sx_xlock(&pf_consistency_lock);
PF_LOCK();
}
#else
rw_enter_write(&pf_consistency_lock);
#endif
locked = 1;
}
pf_free_state(cur);
}
cur = next;
}
if (locked)
#ifdef __FreeBSD__
sx_downgrade(&pf_consistency_lock);
#else
rw_exit_write(&pf_consistency_lock);
#endif
}
int
pf_tbladdr_setup(struct pf_ruleset *rs, struct pf_addr_wrap *aw)
{
if (aw->type != PF_ADDR_TABLE)
return (0);
if ((aw->p.tbl = pfr_attach_table(rs, aw->v.tblname)) == NULL)
return (1);
return (0);
}
void
pf_tbladdr_remove(struct pf_addr_wrap *aw)
{
if (aw->type != PF_ADDR_TABLE || aw->p.tbl == NULL)
return;
pfr_detach_table(aw->p.tbl);
aw->p.tbl = NULL;
}
void
pf_tbladdr_copyout(struct pf_addr_wrap *aw)
{
struct pfr_ktable *kt = aw->p.tbl;
if (aw->type != PF_ADDR_TABLE || kt == NULL)
return;
if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
kt = kt->pfrkt_root;
aw->p.tbl = NULL;
aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
kt->pfrkt_cnt : -1;
}
void
pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
{
switch (af) {
#ifdef INET
case AF_INET: {
u_int32_t a = ntohl(addr->addr32[0]);
printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
(a>>8)&255, a&255);
if (p) {
p = ntohs(p);
printf(":%u", p);
}
break;
}
#endif /* INET */
#ifdef INET6
case AF_INET6: {
u_int16_t b;
u_int8_t i, curstart = 255, curend = 0,
maxstart = 0, maxend = 0;
for (i = 0; i < 8; i++) {
if (!addr->addr16[i]) {
if (curstart == 255)
curstart = i;
else
curend = i;
} else {
if (curstart) {
if ((curend - curstart) >
(maxend - maxstart)) {
maxstart = curstart;
maxend = curend;
curstart = 255;
}
}
}
}
for (i = 0; i < 8; i++) {
if (i >= maxstart && i <= maxend) {
if (maxend != 7) {
if (i == maxstart)
printf(":");
} else {
if (i == maxend)
printf(":");
}
} else {
b = ntohs(addr->addr16[i]);
printf("%x", b);
if (i < 7)
printf(":");
}
}
if (p) {
p = ntohs(p);
printf("[%u]", p);
}
break;
}
#endif /* INET6 */
}
}
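/*
 * Intended effect of the zero-run folding above: the longest run of
 * all-zero 16-bit groups collapses to "::", so e.g. 2001:db8:0:0:0:0:0:1
 * prints as 2001:db8::1, and a port, when present, is appended as
 * "[port]" so it cannot be mistaken for a hex group.
 */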
void
pf_print_state(struct pf_state *s)
{
switch (s->proto) {
case IPPROTO_TCP:
printf("TCP ");
break;
case IPPROTO_UDP:
printf("UDP ");
break;
case IPPROTO_ICMP:
printf("ICMP ");
break;
case IPPROTO_ICMPV6:
printf("ICMPV6 ");
break;
default:
printf("%u ", s->proto);
break;
}
pf_print_host(&s->lan.addr, s->lan.port, s->af);
printf(" ");
pf_print_host(&s->gwy.addr, s->gwy.port, s->af);
printf(" ");
pf_print_host(&s->ext.addr, s->ext.port, s->af);
printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo,
s->src.seqhi, s->src.max_win, s->src.seqdiff);
if (s->src.wscale && s->dst.wscale)
printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK);
printf("]");
printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo,
s->dst.seqhi, s->dst.max_win, s->dst.seqdiff);
if (s->src.wscale && s->dst.wscale)
printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK);
printf("]");
printf(" %u:%u", s->src.state, s->dst.state);
}
void
pf_print_flags(u_int8_t f)
{
if (f)
printf(" ");
if (f & TH_FIN)
printf("F");
if (f & TH_SYN)
printf("S");
if (f & TH_RST)
printf("R");
if (f & TH_PUSH)
printf("P");
if (f & TH_ACK)
printf("A");
if (f & TH_URG)
printf("U");
if (f & TH_ECE)
printf("E");
if (f & TH_CWR)
printf("W");
}
#define PF_SET_SKIP_STEPS(i) \
do { \
while (head[i] != cur) { \
head[i]->skip[i].ptr = cur; \
head[i] = TAILQ_NEXT(head[i], entries); \
} \
} while (0)
void
pf_calc_skip_steps(struct pf_rulequeue *rules)
{
struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
int i;
cur = TAILQ_FIRST(rules);
prev = cur;
for (i = 0; i < PF_SKIP_COUNT; ++i)
head[i] = cur;
while (cur != NULL) {
if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
PF_SET_SKIP_STEPS(PF_SKIP_IFP);
if (cur->direction != prev->direction)
PF_SET_SKIP_STEPS(PF_SKIP_DIR);
if (cur->af != prev->af)
PF_SET_SKIP_STEPS(PF_SKIP_AF);
if (cur->proto != prev->proto)
PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
if (cur->src.neg != prev->src.neg ||
pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
if (cur->src.port[0] != prev->src.port[0] ||
cur->src.port[1] != prev->src.port[1] ||
cur->src.port_op != prev->src.port_op)
PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
if (cur->dst.neg != prev->dst.neg ||
pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
if (cur->dst.port[0] != prev->dst.port[0] ||
cur->dst.port[1] != prev->dst.port[1] ||
cur->dst.port_op != prev->dst.port_op)
PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
prev = cur;
cur = TAILQ_NEXT(cur, entries);
}
for (i = 0; i < PF_SKIP_COUNT; ++i)
PF_SET_SKIP_STEPS(i);
}
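/*
 * What the skip steps buy at evaluation time: when a packet fails one
 * criterion of a rule, the evaluator can jump via skip[] straight to the
 * first later rule where that field differs, instead of re-testing every
 * intervening rule known to share the same value. E.g. in a ruleset of
 * r1 proto tcp, r2 proto tcp, r3 proto tcp, r4 proto udp, a UDP packet
 * failing r1's protocol check jumps directly to r4.
 */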
int
pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
{
if (aw1->type != aw2->type)
return (1);
switch (aw1->type) {
case PF_ADDR_ADDRMASK:
if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
return (1);
if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
return (1);
return (0);
case PF_ADDR_DYNIFTL:
return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
case PF_ADDR_NOROUTE:
case PF_ADDR_URPFFAILED:
return (0);
case PF_ADDR_TABLE:
return (aw1->p.tbl != aw2->p.tbl);
case PF_ADDR_RTLABEL:
return (aw1->v.rtlabel != aw2->v.rtlabel);
default:
printf("invalid address type: %d\n", aw1->type);
return (1);
}
}
u_int16_t
pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
{
u_int32_t l;
if (udp && !cksum)
return (0x0000);
l = cksum + old - new;
l = (l >> 16) + (l & 65535);
l = l & 65535;
if (udp && !l)
return (0xFFFF);
return (l);
}
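/*
 * Worked instance of the one's-complement fixup above (in the spirit of
 * RFC 1624): substituting "new" for "old" in a summed field shifts the
 * checksum by (old - new), and any carry is folded back into 16 bits.
 * The values below are hypothetical demo inputs.
 */
static u_int16_t
pf_cksum_fixup_demo(void)
{
u_int16_t cksum = 0x1234, old = 0x00ff, new = 0x0f0f;
u_int32_t l = (u_int32_t)cksum + old - new; /* 0x1234 + 0xff - 0xf0f */
l = (l >> 16) + (l & 65535); /* fold any carry back in */
return (l & 65535); /* == pf_cksum_fixup(0x1234, 0x00ff, 0x0f0f, 0) == 0x0424 */
}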
void
pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
{
struct pf_addr ao;
u_int16_t po = *p;
PF_ACPY(&ao, a, af);
PF_ACPY(a, an, af);
*p = pn;
switch (af) {
#ifdef INET
case AF_INET:
*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
ao.addr16[0], an->addr16[0], 0),
ao.addr16[1], an->addr16[1], 0);
*p = pn;
*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
ao.addr16[0], an->addr16[0], u),
ao.addr16[1], an->addr16[1], u),
po, pn, u);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
ao.addr16[0], an->addr16[0], u),
ao.addr16[1], an->addr16[1], u),
ao.addr16[2], an->addr16[2], u),
ao.addr16[3], an->addr16[3], u),
ao.addr16[4], an->addr16[4], u),
ao.addr16[5], an->addr16[5], u),
ao.addr16[6], an->addr16[6], u),
ao.addr16[7], an->addr16[7], u),
po, pn, u);
break;
#endif /* INET6 */
}
}
/* Changes a u_int32_t. Uses a void * so there are no alignment restrictions. */
void
pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
{
u_int32_t ao;
memcpy(&ao, a, sizeof(ao));
memcpy(a, &an, sizeof(u_int32_t));
*c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
ao % 65536, an % 65536, u);
}
#ifdef INET6
void
pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
{
struct pf_addr ao;
PF_ACPY(&ao, a, AF_INET6);
PF_ACPY(a, an, AF_INET6);
*c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(*c,
ao.addr16[0], an->addr16[0], u),
ao.addr16[1], an->addr16[1], u),
ao.addr16[2], an->addr16[2], u),
ao.addr16[3], an->addr16[3], u),
ao.addr16[4], an->addr16[4], u),
ao.addr16[5], an->addr16[5], u),
ao.addr16[6], an->addr16[6], u),
ao.addr16[7], an->addr16[7], u);
}
#endif /* INET6 */
void
pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
{
struct pf_addr oia, ooa;
PF_ACPY(&oia, ia, af);
PF_ACPY(&ooa, oa, af);
/* Change inner protocol port, fix inner protocol checksum. */
if (ip != NULL) {
u_int16_t oip = *ip;
u_int32_t opc = 0; /* make the compiler happy */
if (pc != NULL)
opc = *pc;
*ip = np;
if (pc != NULL)
*pc = pf_cksum_fixup(*pc, oip, *ip, u);
*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
if (pc != NULL)
*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
}
/* Change inner ip address, fix inner ip and icmp checksums. */
PF_ACPY(ia, na, af);
switch (af) {
#ifdef INET
case AF_INET: {
u_int32_t oh2c = *h2c;
*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
oia.addr16[0], ia->addr16[0], 0),
oia.addr16[1], ia->addr16[1], 0);
*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
oia.addr16[0], ia->addr16[0], 0),
oia.addr16[1], ia->addr16[1], 0);
*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
break;
}
#endif /* INET */
#ifdef INET6
case AF_INET6:
*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(*ic,
oia.addr16[0], ia->addr16[0], u),
oia.addr16[1], ia->addr16[1], u),
oia.addr16[2], ia->addr16[2], u),
oia.addr16[3], ia->addr16[3], u),
oia.addr16[4], ia->addr16[4], u),
oia.addr16[5], ia->addr16[5], u),
oia.addr16[6], ia->addr16[6], u),
oia.addr16[7], ia->addr16[7], u);
break;
#endif /* INET6 */
}
/* Change outer ip address, fix outer ip or icmpv6 checksum. */
PF_ACPY(oa, na, af);
switch (af) {
#ifdef INET
case AF_INET:
*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
ooa.addr16[0], oa->addr16[0], 0),
ooa.addr16[1], oa->addr16[1], 0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
pf_cksum_fixup(pf_cksum_fixup(*ic,
ooa.addr16[0], oa->addr16[0], u),
ooa.addr16[1], oa->addr16[1], u),
ooa.addr16[2], oa->addr16[2], u),
ooa.addr16[3], oa->addr16[3], u),
ooa.addr16[4], oa->addr16[4], u),
ooa.addr16[5], oa->addr16[5], u),
ooa.addr16[6], oa->addr16[6], u),
ooa.addr16[7], oa->addr16[7], u);
break;
#endif /* INET6 */
}
}
/*
* Need to modulate the sequence numbers in the TCP SACK option
* (credits to Krzysztof Pfaff for report and patch)
*/
int
pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
struct tcphdr *th, struct pf_state_peer *dst)
{
int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
#ifdef __FreeBSD__
u_int8_t opts[TCP_MAXOLEN], *opt = opts;
#else
u_int8_t opts[MAX_TCPOPTLEN], *opt = opts;
#endif
int copyback = 0, i, olen;
struct sackblk sack;
#define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
if (hlen < TCPOLEN_SACKLEN ||
!pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
return (0);
while (hlen >= TCPOLEN_SACKLEN) {
olen = opt[1];
switch (*opt) {
case TCPOPT_EOL: /* FALLTHROUGH */
case TCPOPT_NOP:
opt++;
hlen--;
break;
case TCPOPT_SACK:
if (olen > hlen)
olen = hlen;
if (olen >= TCPOLEN_SACKLEN) {
for (i = 2; i + TCPOLEN_SACK <= olen;
i += TCPOLEN_SACK) {
memcpy(&sack, &opt[i], sizeof(sack));
pf_change_a(&sack.start, &th->th_sum,
htonl(ntohl(sack.start) -
dst->seqdiff), 0);
pf_change_a(&sack.end, &th->th_sum,
htonl(ntohl(sack.end) -
dst->seqdiff), 0);
memcpy(&opt[i], &sack, sizeof(sack));
}
copyback = 1;
}
/* FALLTHROUGH */
default:
if (olen < 2)
olen = 2;
hlen -= olen;
opt += olen;
}
}
if (copyback)
#ifdef __FreeBSD__
m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
#else
m_copyback(m, off + sizeof(*th), thoptlen, opts);
#endif
return (copyback);
}
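/*
 * [Editor's note] pf_modulate_sack() walks the TCP options and shifts every
 * sequence number inside SACK blocks by the same seqdiff applied to th_seq,
 * so a sequence-modulated connection's SACKs still refer to sequence space
 * the peer understands. A user-space sketch of the option walk over a plain
 * byte buffer (no mbufs; the sketch omits the checksum fixup that
 * pf_change_a() performs on each rewrite):
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* ntohl()/htonl() */

#define TOY_TCPOPT_EOL	0
#define TOY_TCPOPT_NOP	1
#define TOY_TCPOPT_SACK	5

/* Subtract seqdiff from every 32-bit sequence edge in SACK blocks. */
static void
modulate_sack(uint8_t *opt, int len, uint32_t seqdiff)
{
	while (len >= 2) {
		int olen = opt[1], i, j;

		if (opt[0] == TOY_TCPOPT_EOL || opt[0] == TOY_TCPOPT_NOP) {
			opt++;
			len--;
			continue;
		}
		if (olen < 2 || olen > len)
			break;			/* malformed; stop */
		if (opt[0] == TOY_TCPOPT_SACK)
			for (i = 2; i + 8 <= olen; i += 8)
				for (j = 0; j < 2; j++) {
					uint32_t edge;

					memcpy(&edge, opt + i + 4 * j, 4);
					edge = htonl(ntohl(edge) - seqdiff);
					memcpy(opt + i + 4 * j, &edge, 4);
				}
		opt += olen;
		len -= olen;
	}
}

int
main(void)
{
	/* Two NOPs, then one SACK block: kind 5, len 10, edges 1000/2000. */
	uint8_t opts[12] = { TOY_TCPOPT_NOP, TOY_TCPOPT_NOP,
	    TOY_TCPOPT_SACK, 10 };
	uint32_t start = htonl(1000), end = htonl(2000);

	memcpy(opts + 4, &start, 4);
	memcpy(opts + 8, &end, 4);
	modulate_sack(opts, sizeof(opts), 100);
	memcpy(&start, opts + 4, 4);
	memcpy(&end, opts + 8, 4);
	printf("%u-%u\n", ntohl(start), ntohl(end));	/* 900-1900 */
	return (0);
}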
void
#ifdef __FreeBSD__
pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
#else
pf_send_tcp(const struct pf_rule *r, sa_family_t af,
#endif
const struct pf_addr *saddr, const struct pf_addr *daddr,
u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
u_int16_t rtag, struct ether_header *eh, struct ifnet *ifp)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *m;
int len, tlen;
#ifdef INET
struct ip *h;
#endif /* INET */
#ifdef INET6
struct ip6_hdr *h6;
#endif /* INET6 */
struct tcphdr *th;
char *opt;
struct pf_mtag *pf_mtag;
#ifdef __FreeBSD__
KASSERT(
#ifdef INET
af == AF_INET
#else
0
#endif
||
#ifdef INET6
af == AF_INET6
#else
0
#endif
, ("Unsupported AF %d", af));
len = 0;
th = NULL;
#ifdef INET
h = NULL;
#endif
#ifdef INET6
h6 = NULL;
#endif
#endif
/* maximum segment size tcp option */
tlen = sizeof(struct tcphdr);
if (mss)
tlen += 4;
switch (af) {
#ifdef INET
case AF_INET:
len = sizeof(struct ip) + tlen;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
len = sizeof(struct ip6_hdr) + tlen;
break;
#endif /* INET6 */
}
/* create outgoing mbuf */
m = m_gethdr(M_DONTWAIT, MT_HEADER);
if (m == NULL)
return;
#ifdef __FreeBSD__
#ifdef MAC
if (replyto)
mac_netinet_firewall_reply(replyto, m);
else
mac_netinet_firewall_send(m);
#else
(void)replyto;
#endif
#endif
if ((pf_mtag = pf_get_mtag(m)) == NULL) {
m_freem(m);
return;
}
if (tag)
#ifdef __FreeBSD__
m->m_flags |= M_SKIP_FIREWALL;
#else
pf_mtag->flags |= PF_TAG_GENERATED;
#endif
pf_mtag->tag = rtag;
if (r != NULL && r->rtableid >= 0)
#ifdef __FreeBSD__
{
M_SETFIB(m, r->rtableid);
#endif
pf_mtag->rtableid = r->rtableid;
#ifdef __FreeBSD__
}
#endif
#ifdef ALTQ
if (r != NULL && r->qid) {
pf_mtag->qid = r->qid;
/* add hints for ecn */
pf_mtag->af = af;
pf_mtag->hdr = mtod(m, struct ip *);
}
#endif /* ALTQ */
m->m_data += max_linkhdr;
m->m_pkthdr.len = m->m_len = len;
m->m_pkthdr.rcvif = NULL;
bzero(m->m_data, len);
switch (af) {
#ifdef INET
case AF_INET:
h = mtod(m, struct ip *);
/* IP header fields included in the TCP checksum */
h->ip_p = IPPROTO_TCP;
h->ip_len = htons(tlen);
h->ip_src.s_addr = saddr->v4.s_addr;
h->ip_dst.s_addr = daddr->v4.s_addr;
th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
h6 = mtod(m, struct ip6_hdr *);
/* IP header fields included in the TCP checksum */
h6->ip6_nxt = IPPROTO_TCP;
h6->ip6_plen = htons(tlen);
memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
break;
#endif /* INET6 */
}
/* TCP header */
th->th_sport = sport;
th->th_dport = dport;
th->th_seq = htonl(seq);
th->th_ack = htonl(ack);
th->th_off = tlen >> 2;
th->th_flags = flags;
th->th_win = htons(win);
if (mss) {
opt = (char *)(th + 1);
opt[0] = TCPOPT_MAXSEG;
opt[1] = 4;
HTONS(mss);
bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
}
switch (af) {
#ifdef INET
case AF_INET:
/* TCP checksum */
th->th_sum = in_cksum(m, len);
/* Finish the IP header */
h->ip_v = 4;
h->ip_hl = sizeof(*h) >> 2;
h->ip_tos = IPTOS_LOWDELAY;
#ifdef __FreeBSD__
h->ip_off = V_path_mtu_discovery ? IP_DF : 0;
h->ip_len = len;
#else
h->ip_off = htons(ip_mtudisc ? IP_DF : 0);
h->ip_len = htons(len);
#endif
h->ip_ttl = ttl ? ttl : V_ip_defttl;
h->ip_sum = 0;
if (eh == NULL) {
#ifdef __FreeBSD__
PF_UNLOCK();
ip_output(m, (void *)NULL, (void *)NULL, 0,
(void *)NULL, (void *)NULL);
PF_LOCK();
#else /* ! __FreeBSD__ */
ip_output(m, (void *)NULL, (void *)NULL, 0,
(void *)NULL, (void *)NULL);
#endif
} else {
struct route ro;
struct rtentry rt;
struct ether_header *e = (void *)ro.ro_dst.sa_data;
if (ifp == NULL) {
m_freem(m);
return;
}
rt.rt_ifp = ifp;
ro.ro_rt = &rt;
ro.ro_dst.sa_len = sizeof(ro.ro_dst);
ro.ro_dst.sa_family = pseudo_AF_HDRCMPLT;
bcopy(eh->ether_dhost, e->ether_shost, ETHER_ADDR_LEN);
bcopy(eh->ether_shost, e->ether_dhost, ETHER_ADDR_LEN);
e->ether_type = eh->ether_type;
#ifdef __FreeBSD__
PF_UNLOCK();
/* XXX_IMPORT: later */
ip_output(m, (void *)NULL, &ro, 0,
(void *)NULL, (void *)NULL);
PF_LOCK();
#else /* ! __FreeBSD__ */
ip_output(m, (void *)NULL, &ro, IP_ROUTETOETHER,
(void *)NULL, (void *)NULL);
#endif
}
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
/* TCP checksum */
th->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr), tlen);
h6->ip6_vfc |= IPV6_VERSION;
h6->ip6_hlim = IPV6_DEFHLIM;
#ifdef __FreeBSD__
PF_UNLOCK();
ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
PF_LOCK();
#else
ip6_output(m, NULL, NULL, 0, NULL, NULL);
#endif
break;
#endif /* INET6 */
}
}
void
pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
struct pf_rule *r)
{
struct pf_mtag *pf_mtag;
struct mbuf *m0;
#ifdef __FreeBSD__
struct ip *ip;
#endif
#ifdef __FreeBSD__
m0 = m_copypacket(m, M_DONTWAIT);
if (m0 == NULL)
return;
#else
m0 = m_copy(m, 0, M_COPYALL);
#endif
if ((pf_mtag = pf_get_mtag(m0)) == NULL)
return;
#ifdef __FreeBSD__
/* XXX: revisit */
m0->m_flags |= M_SKIP_FIREWALL;
#else
pf_mtag->flags |= PF_TAG_GENERATED;
#endif
if (r->rtableid >= 0)
#ifdef __FreeBSD__
{
M_SETFIB(m0, r->rtableid);
#endif
pf_mtag->rtableid = r->rtableid;
#ifdef __FreeBSD__
}
#endif
#ifdef ALTQ
if (r->qid) {
pf_mtag->qid = r->qid;
/* add hints for ecn */
pf_mtag->af = af;
pf_mtag->hdr = mtod(m0, struct ip *);
}
#endif /* ALTQ */
switch (af) {
#ifdef INET
case AF_INET:
#ifdef __FreeBSD__
/* icmp_error() expects host byte ordering */
ip = mtod(m0, struct ip *);
NTOHS(ip->ip_len);
NTOHS(ip->ip_off);
PF_UNLOCK();
icmp_error(m0, type, code, 0, 0);
PF_LOCK();
#else
icmp_error(m0, type, code, 0, 0);
#endif
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
icmp6_error(m0, type, code, 0);
#ifdef __FreeBSD__
PF_LOCK();
#endif
break;
#endif /* INET6 */
}
}
/*
* Compare the addresses a and b under mask m. With n == 0, return 1 if
* the masked addresses are equal; with n != 0, return 1 if they differ.
* Return 0 otherwise.
*/
int
pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
struct pf_addr *b, sa_family_t af)
{
int match = 0;
switch (af) {
#ifdef INET
case AF_INET:
if ((a->addr32[0] & m->addr32[0]) ==
(b->addr32[0] & m->addr32[0]))
match++;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (((a->addr32[0] & m->addr32[0]) ==
(b->addr32[0] & m->addr32[0])) &&
((a->addr32[1] & m->addr32[1]) ==
(b->addr32[1] & m->addr32[1])) &&
((a->addr32[2] & m->addr32[2]) ==
(b->addr32[2] & m->addr32[2])) &&
((a->addr32[3] & m->addr32[3]) ==
(b->addr32[3] & m->addr32[3])))
match++;
break;
#endif /* INET6 */
}
if (match) {
if (n)
return (0);
else
return (1);
} else {
if (n)
return (1);
else
return (0);
}
}
int
pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
{
switch (op) {
case PF_OP_IRG:
return ((p > a1) && (p < a2));
case PF_OP_XRG:
return ((p < a1) || (p > a2));
case PF_OP_RRG:
return ((p >= a1) && (p <= a2));
case PF_OP_EQ:
return (p == a1);
case PF_OP_NE:
return (p != a1);
case PF_OP_LT:
return (p < a1);
case PF_OP_LE:
return (p <= a1);
case PF_OP_GT:
return (p > a1);
case PF_OP_GE:
return (p >= a1);
}
return (0); /* never reached */
}
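/*
 * [Editor's note] pf_match() implements pf's comparison operators over an
 * optional range (a1, a2) and a value p; pf.conf's "><" and "<>" map to the
 * exclusive inside/outside range operators, ":" to the inclusive one.
 * pf_match_port() below merely byte-swaps its arguments first so the
 * comparison happens in host order. A tiny standalone illustration
 * (toy enum mirroring the PF_OP_* values, hypothetical ports):
 */
#include <stdio.h>
#include <stdint.h>

enum { OP_IRG, OP_XRG, OP_RRG, OP_EQ, OP_NE, OP_LT, OP_LE, OP_GT, OP_GE };

/* Same shape as pf_match(): range ops use both bounds, unary ops use a1. */
static int
match(int op, uint32_t a1, uint32_t a2, uint32_t p)
{
	switch (op) {
	case OP_IRG: return (p > a1 && p < a2);		/* inside, exclusive */
	case OP_XRG: return (p < a1 || p > a2);		/* outside, exclusive */
	case OP_RRG: return (p >= a1 && p <= a2);	/* inside, inclusive */
	case OP_EQ:  return (p == a1);
	case OP_NE:  return (p != a1);
	case OP_LT:  return (p < a1);
	case OP_LE:  return (p <= a1);
	case OP_GT:  return (p > a1);
	case OP_GE:  return (p >= a1);
	}
	return (0);
}

int
main(void)
{
	/* "port 1000 >< 2000" (IRG) admits 1001..1999 only. */
	printf("%d %d %d\n", match(OP_IRG, 1000, 2000, 1000),
	    match(OP_IRG, 1000, 2000, 1500),
	    match(OP_IRG, 1000, 2000, 2000));	/* 0 1 0 */
	return (0);
}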
int
pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
{
NTOHS(a1);
NTOHS(a2);
NTOHS(p);
return (pf_match(op, a1, a2, p));
}
int
pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
{
if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
return (0);
return (pf_match(op, a1, a2, u));
}
int
pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
{
if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
return (0);
return (pf_match(op, a1, a2, g));
}
#ifndef __FreeBSD__
struct pf_mtag *
pf_find_mtag(struct mbuf *m)
{
struct m_tag *mtag;
if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL)
return (NULL);
return ((struct pf_mtag *)(mtag + 1));
}
struct pf_mtag *
pf_get_mtag(struct mbuf *m)
{
struct m_tag *mtag;
if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) {
mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag),
M_NOWAIT);
if (mtag == NULL)
return (NULL);
bzero(mtag + 1, sizeof(struct pf_mtag));
m_tag_prepend(m, mtag);
}
return ((struct pf_mtag *)(mtag + 1));
}
#endif
int
pf_match_tag(struct mbuf *m, struct pf_rule *r, struct pf_mtag *pf_mtag,
int *tag)
{
if (*tag == -1)
*tag = pf_mtag->tag;
return ((!r->match_tag_not && r->match_tag == *tag) ||
(r->match_tag_not && r->match_tag != *tag));
}
int
pf_tag_packet(struct mbuf *m, struct pf_mtag *pf_mtag, int tag, int rtableid)
{
if (tag <= 0 && rtableid < 0)
return (0);
if (pf_mtag == NULL)
if ((pf_mtag = pf_get_mtag(m)) == NULL)
return (1);
if (tag > 0)
pf_mtag->tag = tag;
if (rtableid >= 0)
#ifdef __FreeBSD__
{
M_SETFIB(m, rtableid);
#endif
pf_mtag->rtableid = rtableid;
#ifdef __FreeBSD__
}
#endif
return (0);
}
static void
pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
struct pf_rule **r, struct pf_rule **a, int *match)
{
struct pf_anchor_stackframe *f;
(*r)->anchor->match = 0;
if (match)
*match = 0;
if (*depth >= sizeof(pf_anchor_stack) /
sizeof(pf_anchor_stack[0])) {
printf("pf_step_into_anchor: stack overflow\n");
*r = TAILQ_NEXT(*r, entries);
return;
} else if (*depth == 0 && a != NULL)
*a = *r;
f = pf_anchor_stack + (*depth)++;
f->rs = *rs;
f->r = *r;
if ((*r)->anchor_wildcard) {
f->parent = &(*r)->anchor->children;
if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
NULL) {
*r = NULL;
return;
}
*rs = &f->child->ruleset;
} else {
f->parent = NULL;
f->child = NULL;
*rs = &(*r)->anchor->ruleset;
}
*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
}
int
pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
struct pf_rule **r, struct pf_rule **a, int *match)
{
struct pf_anchor_stackframe *f;
int quick = 0;
do {
if (*depth <= 0)
break;
f = pf_anchor_stack + *depth - 1;
if (f->parent != NULL && f->child != NULL) {
if (f->child->match ||
(match != NULL && *match)) {
f->r->anchor->match = 1;
*match = 0;
}
f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
if (f->child != NULL) {
*rs = &f->child->ruleset;
*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
if (*r == NULL)
continue;
else
break;
}
}
(*depth)--;
if (*depth == 0 && a != NULL)
*a = NULL;
*rs = f->rs;
if (f->r->anchor->match || (match != NULL && *match))
quick = f->r->quick;
*r = TAILQ_NEXT(f->r, entries);
} while (*r == NULL);
return (quick);
}
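/*
 * [Editor's note] pf_step_into_anchor()/pf_step_out_of_anchor() flatten
 * what is conceptually a depth-first walk of nested anchor rulesets into
 * the fixed-size pf_anchor_stack, keeping kernel stack use bounded;
 * wildcard anchors additionally iterate their children via
 * RB_MIN()/RB_NEXT(). A recursive user-space sketch of the equivalent
 * traversal (toy types; quick/match propagation omitted):
 */
#include <stdio.h>

struct rule;
struct ruleset {
	struct rule *rules;
	int nrules;
};
struct rule {
	const char *name;
	struct ruleset *anchor;	/* NULL for a plain rule */
};

/*
 * Depth-first rule walk; pf does the same iteratively with
 * pf_anchor_stack so that kernel stack depth stays bounded.
 */
static void
walk(struct ruleset *rs, int depth)
{
	int i;

	for (i = 0; i < rs->nrules; i++) {
		struct rule *r = &rs->rules[i];

		if (r->anchor == NULL)
			printf("%*seval %s\n", depth * 2, "", r->name);
		else
			walk(r->anchor, depth + 1);
	}
}

int
main(void)
{
	struct rule inner_rules[] = { { "child-1", NULL } };
	struct ruleset inner = { inner_rules, 1 };
	struct rule outer_rules[] =
	    { { "top-1", NULL }, { "anchor", &inner }, { "top-2", NULL } };
	struct ruleset outer = { outer_rules, 3 };

	walk(&outer, 0);
	return (0);
}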
#ifdef INET6
void
pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
{
switch (af) {
#ifdef INET
case AF_INET:
naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
((rmask->addr32[0] ^ 0xffffffff) & saddr->addr32[0]);
break;
#endif /* INET */
case AF_INET6:
naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
((rmask->addr32[0] ^ 0xffffffff) & saddr->addr32[0]);
naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
((rmask->addr32[1] ^ 0xffffffff) & saddr->addr32[1]);
naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
((rmask->addr32[2] ^ 0xffffffff) & saddr->addr32[2]);
naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
((rmask->addr32[3] ^ 0xffffffff) & saddr->addr32[3]);
break;
}
}
void
pf_addr_inc(struct pf_addr *addr, sa_family_t af)
{
switch (af) {
#ifdef INET
case AF_INET:
addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
break;
#endif /* INET */
case AF_INET6:
if (addr->addr32[3] == 0xffffffff) {
addr->addr32[3] = 0;
if (addr->addr32[2] == 0xffffffff) {
addr->addr32[2] = 0;
if (addr->addr32[1] == 0xffffffff) {
addr->addr32[1] = 0;
addr->addr32[0] =
htonl(ntohl(addr->addr32[0]) + 1);
} else
addr->addr32[1] =
htonl(ntohl(addr->addr32[1]) + 1);
} else
addr->addr32[2] =
htonl(ntohl(addr->addr32[2]) + 1);
} else
addr->addr32[3] =
htonl(ntohl(addr->addr32[3]) + 1);
break;
}
}
#endif /* INET6 */
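/*
 * [Editor's note] pf_addr_inc() treats an IPv6 address as a 128-bit
 * big-endian integer: increment the lowest 32-bit word and, on wrap to
 * zero, carry into the next. The nested wrap checks above are equivalent
 * to this compact loop (user-space sketch):
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/*
 * Increment a 128-bit big-endian address held as 4 network-order words;
 * comparing against 0 is byte-order safe, so the carry test needs no swap.
 */
static void
addr_inc(uint32_t w[4])
{
	int i;

	for (i = 3; i >= 0; i--) {
		w[i] = htonl(ntohl(w[i]) + 1);
		if (w[i] != 0)		/* no wrap, carry stops here */
			break;
	}
}

int
main(void)
{
	uint32_t a[4] = { htonl(0x20010db8), 0, htonl(0xffffffff),
	    htonl(0xffffffff) };

	addr_inc(a);
	printf("%08x %08x %08x %08x\n", ntohl(a[0]), ntohl(a[1]),
	    ntohl(a[2]), ntohl(a[3]));
	/* prints: 20010db8 00000001 00000000 00000000 */
	return (0);
}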
#define mix(a,b,c) \
do { \
a -= b; a -= c; a ^= (c >> 13); \
b -= c; b -= a; b ^= (a << 8); \
c -= a; c -= b; c ^= (b >> 13); \
a -= b; a -= c; a ^= (c >> 12); \
b -= c; b -= a; b ^= (a << 16); \
c -= a; c -= b; c ^= (b >> 5); \
a -= b; a -= c; a ^= (c >> 3); \
b -= c; b -= a; b ^= (a << 10); \
c -= a; c -= b; c ^= (b >> 15); \
} while (0)
/*
* hash function based on bridge_hash in if_bridge.c
*/
void
pf_hash(struct pf_addr *inaddr, struct pf_addr *hash,
struct pf_poolhashkey *key, sa_family_t af)
{
u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0];
switch (af) {
#ifdef INET
case AF_INET:
a += inaddr->addr32[0];
b += key->key32[1];
mix(a, b, c);
hash->addr32[0] = c + key->key32[2];
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
a += inaddr->addr32[0];
b += inaddr->addr32[2];
mix(a, b, c);
hash->addr32[0] = c;
a += inaddr->addr32[1];
b += inaddr->addr32[3];
c += key->key32[1];
mix(a, b, c);
hash->addr32[1] = c;
a += inaddr->addr32[2];
b += inaddr->addr32[1];
c += key->key32[2];
mix(a, b, c);
hash->addr32[2] = c;
a += inaddr->addr32[3];
b += inaddr->addr32[0];
c += key->key32[3];
mix(a, b, c);
hash->addr32[3] = c;
break;
#endif /* INET6 */
}
}
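/*
 * [Editor's note] mix() is Bob Jenkins' 96-bit integer mixing step (the
 * same construction as the bridge_hash() referenced above); pf_hash()
 * seeds it with the golden-ratio constant 0x9e3779b9 and the pool hash
 * key to derive PF_POOL_SRCHASH digests. A minimal standalone version of
 * the AF_INET arm:
 */
#include <stdio.h>
#include <stdint.h>

/* Bob Jenkins' 96-bit mix, as in pf_hash() and if_bridge's bridge_hash. */
#define mix(a, b, c)						\
	do {							\
		a -= b; a -= c; a ^= (c >> 13);			\
		b -= c; b -= a; b ^= (a << 8);			\
		c -= a; c -= b; c ^= (b >> 13);			\
		a -= b; a -= c; a ^= (c >> 12);			\
		b -= c; b -= a; b ^= (a << 16);			\
		c -= a; c -= b; c ^= (b >> 5);			\
		a -= b; a -= c; a ^= (c >> 3);			\
		b -= c; b -= a; b ^= (a << 10);			\
		c -= a; c -= b; c ^= (b >> 15);			\
	} while (0)

/* Hash one IPv4 address with a 3-word key, mirroring the AF_INET case. */
static uint32_t
hash_v4(uint32_t addr, const uint32_t key[3])
{
	uint32_t a = 0x9e3779b9 + addr, b = 0x9e3779b9 + key[1], c = key[0];

	mix(a, b, c);
	return (c + key[2]);
}

int
main(void)
{
	const uint32_t key[3] = { 0xdeadbeef, 0xcafef00d, 0x01234567 };

	printf("%08x\n", hash_v4(0xc0a80101, key));	/* 192.168.1.1 */
	return (0);
}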
int
pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn)
{
unsigned char hash[16];
struct pf_pool *rpool = &r->rpool;
struct pf_addr *raddr = &rpool->cur->addr.v.a.addr;
struct pf_addr *rmask = &rpool->cur->addr.v.a.mask;
struct pf_pooladdr *acur = rpool->cur;
struct pf_src_node k;
if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
(r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
k.af = af;
PF_ACPY(&k.addr, saddr, af);
if (r->rule_flag & PFRULE_RULESRCTRACK ||
r->rpool.opts & PF_POOL_STICKYADDR)
k.rule.ptr = r;
else
k.rule.ptr = NULL;
pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
*sn = RB_FIND(pf_src_tree, &tree_src_tracking, &k);
if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) {
PF_ACPY(naddr, &(*sn)->raddr, af);
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf_map_addr: src tracking maps ");
pf_print_host(&k.addr, 0, af);
printf(" to ");
pf_print_host(naddr, 0, af);
printf("\n");
}
return (0);
}
}
if (rpool->cur->addr.type == PF_ADDR_NOROUTE)
return (1);
if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
switch (af) {
#ifdef INET
case AF_INET:
if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 &&
(rpool->opts & PF_POOL_TYPEMASK) !=
PF_POOL_ROUNDROBIN)
return (1);
raddr = &rpool->cur->addr.p.dyn->pfid_addr4;
rmask = &rpool->cur->addr.p.dyn->pfid_mask4;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 &&
(rpool->opts & PF_POOL_TYPEMASK) !=
PF_POOL_ROUNDROBIN)
return (1);
raddr = &rpool->cur->addr.p.dyn->pfid_addr6;
rmask = &rpool->cur->addr.p.dyn->pfid_mask6;
break;
#endif /* INET6 */
}
} else if (rpool->cur->addr.type == PF_ADDR_TABLE) {
if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN)
return (1); /* unsupported */
} else {
raddr = &rpool->cur->addr.v.a.addr;
rmask = &rpool->cur->addr.v.a.mask;
}
switch (rpool->opts & PF_POOL_TYPEMASK) {
case PF_POOL_NONE:
PF_ACPY(naddr, raddr, af);
break;
case PF_POOL_BITMASK:
PF_POOLMASK(naddr, raddr, rmask, saddr, af);
break;
case PF_POOL_RANDOM:
if (init_addr != NULL && PF_AZERO(init_addr, af)) {
switch (af) {
#ifdef INET
case AF_INET:
rpool->counter.addr32[0] = htonl(arc4random());
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (rmask->addr32[3] != 0xffffffff)
rpool->counter.addr32[3] =
htonl(arc4random());
else
break;
if (rmask->addr32[2] != 0xffffffff)
rpool->counter.addr32[2] =
htonl(arc4random());
else
break;
if (rmask->addr32[1] != 0xffffffff)
rpool->counter.addr32[1] =
htonl(arc4random());
else
break;
if (rmask->addr32[0] != 0xffffffff)
rpool->counter.addr32[0] =
htonl(arc4random());
break;
#endif /* INET6 */
}
PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
PF_ACPY(init_addr, naddr, af);
} else {
PF_AINC(&rpool->counter, af);
PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af);
}
break;
case PF_POOL_SRCHASH:
pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af);
PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af);
break;
case PF_POOL_ROUNDROBIN:
if (rpool->cur->addr.type == PF_ADDR_TABLE) {
if (!pfr_pool_get(rpool->cur->addr.p.tbl,
&rpool->tblidx, &rpool->counter,
&raddr, &rmask, af))
goto get_addr;
} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
&rpool->tblidx, &rpool->counter,
&raddr, &rmask, af))
goto get_addr;
} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
goto get_addr;
try_next:
if ((rpool->cur = TAILQ_NEXT(rpool->cur, entries)) == NULL)
rpool->cur = TAILQ_FIRST(&rpool->list);
if (rpool->cur->addr.type == PF_ADDR_TABLE) {
rpool->tblidx = -1;
if (pfr_pool_get(rpool->cur->addr.p.tbl,
&rpool->tblidx, &rpool->counter,
&raddr, &rmask, af)) {
/* table contains no address of type 'af' */
if (rpool->cur != acur)
goto try_next;
return (1);
}
} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
rpool->tblidx = -1;
if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
&rpool->tblidx, &rpool->counter,
&raddr, &rmask, af)) {
/* table contains no address of type 'af' */
if (rpool->cur != acur)
goto try_next;
return (1);
}
} else {
raddr = &rpool->cur->addr.v.a.addr;
rmask = &rpool->cur->addr.v.a.mask;
PF_ACPY(&rpool->counter, raddr, af);
}
get_addr:
PF_ACPY(naddr, &rpool->counter, af);
if (init_addr != NULL && PF_AZERO(init_addr, af))
PF_ACPY(init_addr, naddr, af);
PF_AINC(&rpool->counter, af);
break;
}
if (*sn != NULL)
PF_ACPY(&(*sn)->raddr, naddr, af);
if (pf_status.debug >= PF_DEBUG_MISC &&
(rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) {
printf("pf_map_addr: selected address ");
pf_print_host(naddr, 0, af);
printf("\n");
}
return (0);
}
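/*
 * [Editor's note] Of the pool types handled above, PF_POOL_BITMASK is pure
 * bit algebra: the translated address takes the pool address's bits under
 * the mask and the original address's bits outside it,
 * naddr = (raddr & rmask) | (saddr & ~rmask), which is what PF_POOLMASK
 * computes word by word. Sketch for one IPv4 word (hypothetical values):
 */
#include <stdio.h>
#include <stdint.h>

/* PF_POOL_BITMASK combine for one 32-bit word, as in pf_poolmask(). */
static uint32_t
poolmask(uint32_t raddr, uint32_t rmask, uint32_t saddr)
{
	return ((raddr & rmask) | (~rmask & saddr));
}

int
main(void)
{
	/* Map any source into 10.0.0.0/24, keeping its low octet. */
	uint32_t naddr = poolmask(0x0a000000, 0xffffff00, 0xc0a80147);

	printf("%u.%u.%u.%u\n", naddr >> 24, (naddr >> 16) & 0xff,
	    (naddr >> 8) & 0xff, naddr & 0xff);	/* 10.0.0.71 */
	return (0);
}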
int
pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r,
struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t dport,
struct pf_addr *naddr, u_int16_t *nport, u_int16_t low, u_int16_t high,
struct pf_src_node **sn)
{
struct pf_state_cmp key;
struct pf_addr init_addr;
u_int16_t cut;
bzero(&init_addr, sizeof(init_addr));
if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
return (1);
if (proto == IPPROTO_ICMP) {
low = 1;
high = 65535;
}
do {
key.af = af;
key.proto = proto;
PF_ACPY(&key.ext.addr, daddr, key.af);
PF_ACPY(&key.gwy.addr, naddr, key.af);
key.ext.port = dport;
/*
* Port search: start at a random cut and step linearly,
* similar to the port loop in in_pcbbind().
*/
if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
proto == IPPROTO_ICMP)) {
key.gwy.port = dport;
if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL)
return (0);
} else if (low == 0 && high == 0) {
key.gwy.port = *nport;
if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL)
return (0);
} else if (low == high) {
key.gwy.port = htons(low);
if (pf_find_state_all(&key, PF_EXT_GWY, NULL) == NULL) {
*nport = htons(low);
return (0);
}
} else {
u_int16_t tmp;
if (low > high) {
tmp = low;
low = high;
high = tmp;
}
/* low < high */
cut = htonl(arc4random()) % (1 + high - low) + low;
/* low <= cut <= high */
for (tmp = cut; tmp <= high; ++tmp) {
key.gwy.port = htons(tmp);
if (pf_find_state_all(&key, PF_EXT_GWY, NULL) ==
NULL) {
*nport = htons(tmp);
return (0);
}
}
for (tmp = cut - 1; tmp >= low; --tmp) {
key.gwy.port = htons(tmp);
if (pf_find_state_all(&key, PF_EXT_GWY, NULL) ==
NULL) {
*nport = htons(tmp);
return (0);
}
}
}
switch (r->rpool.opts & PF_POOL_TYPEMASK) {
case PF_POOL_RANDOM:
case PF_POOL_ROUNDROBIN:
if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn))
return (1);
break;
case PF_POOL_NONE:
case PF_POOL_SRCHASH:
case PF_POOL_BITMASK:
default:
return (1);
}
} while (!PF_AEQ(&init_addr, naddr, af));
return (1); /* none available */
}
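/*
 * [Editor's note] pf_get_sport() picks a random cut in [low, high], probes
 * upward from cut to high and then downward from cut - 1 to low, and takes
 * the first proxy port with no state-table collision; for random and
 * round-robin pools an exhausted range moves on to the next pool address
 * and tries again. A user-space sketch of the probe order, with a
 * hypothetical in_use() standing in for the pf_find_state_all() lookup
 * and rand() for arc4random():
 */
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the state-table collision check (hypothetical). */
static int
in_use(unsigned port)
{
	return (port >= 2000 && port <= 2004);	/* pretend these collide */
}

/* Random-start linear probe, mirroring pf_get_sport()'s two loops. */
static int
pick_port(unsigned low, unsigned high, unsigned *port)
{
	unsigned cut = low + rand() % (high - low + 1), tmp;

	for (tmp = cut; tmp <= high; tmp++)
		if (!in_use(tmp)) {
			*port = tmp;
			return (0);
		}
	/* tmp > 0 guards unsigned wrap; the kernel relies on low >= 1. */
	for (tmp = cut - 1; tmp >= low && tmp > 0; tmp--)
		if (!in_use(tmp)) {
			*port = tmp;
			return (0);
		}
	return (1);		/* none available */
}

int
main(void)
{
	unsigned port;

	if (pick_port(2000, 2010, &port) == 0)
		printf("allocated port %u\n", port);
	return (0);
}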
struct pf_rule *
pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport,
struct pf_addr *daddr, u_int16_t dport, int rs_num)
{
struct pf_rule *r, *rm = NULL;
struct pf_ruleset *ruleset = NULL;
int tag = -1;
int rtableid = -1;
int asd = 0;
r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr);
while (r && rm == NULL) {
struct pf_rule_addr *src = NULL, *dst = NULL;
struct pf_addr_wrap *xdst = NULL;
if (r->action == PF_BINAT && direction == PF_IN) {
src = &r->dst;
if (r->rpool.cur != NULL)
xdst = &r->rpool.cur->addr;
} else {
src = &r->src;
dst = &r->dst;
}
r->evaluations++;
if (pfi_kif_match(r->kif, kif) == r->ifnot)
r = r->skip[PF_SKIP_IFP].ptr;
else if (r->direction && r->direction != direction)
r = r->skip[PF_SKIP_DIR].ptr;
else if (r->af && r->af != pd->af)
r = r->skip[PF_SKIP_AF].ptr;
else if (r->proto && r->proto != pd->proto)
r = r->skip[PF_SKIP_PROTO].ptr;
else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
src->neg, kif))
r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
PF_SKIP_DST_ADDR].ptr;
else if (src->port_op && !pf_match_port(src->port_op,
src->port[0], src->port[1], sport))
r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT :
PF_SKIP_DST_PORT].ptr;
else if (dst != NULL &&
PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL))
r = r->skip[PF_SKIP_DST_ADDR].ptr;
else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af,
0, NULL))
r = TAILQ_NEXT(r, entries);
else if (dst != NULL && dst->port_op &&
!pf_match_port(dst->port_op, dst->port[0],
dst->port[1], dport))
r = r->skip[PF_SKIP_DST_PORT].ptr;
else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
r = TAILQ_NEXT(r, entries);
else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto !=
IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m,
off, pd->hdr.tcp), r->os_fingerprint)))
r = TAILQ_NEXT(r, entries);
else {
if (r->tag)
tag = r->tag;
if (r->rtableid >= 0)
rtableid = r->rtableid;
if (r->anchor == NULL) {
rm = r;
} else
pf_step_into_anchor(&asd, &ruleset, rs_num,
&r, NULL, NULL);
}
if (r == NULL)
pf_step_out_of_anchor(&asd, &ruleset, rs_num, &r,
NULL, NULL);
}
if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid))
return (NULL);
if (rm != NULL && (rm->action == PF_NONAT ||
rm->action == PF_NORDR || rm->action == PF_NOBINAT))
return (NULL);
return (rm);
}
struct pf_rule *
pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction,
struct pfi_kif *kif, struct pf_src_node **sn,
struct pf_addr *saddr, u_int16_t sport,
struct pf_addr *daddr, u_int16_t dport,
struct pf_addr *naddr, u_int16_t *nport)
{
struct pf_rule *r = NULL;
if (direction == PF_OUT) {
r = pf_match_translation(pd, m, off, direction, kif, saddr,
sport, daddr, dport, PF_RULESET_BINAT);
if (r == NULL)
r = pf_match_translation(pd, m, off, direction, kif,
saddr, sport, daddr, dport, PF_RULESET_NAT);
} else {
r = pf_match_translation(pd, m, off, direction, kif, saddr,
sport, daddr, dport, PF_RULESET_RDR);
if (r == NULL)
r = pf_match_translation(pd, m, off, direction, kif,
saddr, sport, daddr, dport, PF_RULESET_BINAT);
}
if (r != NULL) {
switch (r->action) {
case PF_NONAT:
case PF_NOBINAT:
case PF_NORDR:
return (NULL);
case PF_NAT:
if (pf_get_sport(pd->af, pd->proto, r, saddr,
daddr, dport, naddr, nport, r->rpool.proxy_port[0],
r->rpool.proxy_port[1], sn)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: NAT proxy port allocation "
"(%u-%u) failed\n",
r->rpool.proxy_port[0],
r->rpool.proxy_port[1]));
return (NULL);
}
break;
case PF_BINAT:
switch (direction) {
case PF_OUT:
if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){
switch (pd->af) {
#ifdef INET
case AF_INET:
if (r->rpool.cur->addr.p.dyn->
pfid_acnt4 < 1)
return (NULL);
PF_POOLMASK(naddr,
&r->rpool.cur->addr.p.dyn->
pfid_addr4,
&r->rpool.cur->addr.p.dyn->
pfid_mask4,
saddr, AF_INET);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (r->rpool.cur->addr.p.dyn->
pfid_acnt6 < 1)
return (NULL);
PF_POOLMASK(naddr,
&r->rpool.cur->addr.p.dyn->
pfid_addr6,
&r->rpool.cur->addr.p.dyn->
pfid_mask6,
saddr, AF_INET6);
break;
#endif /* INET6 */
}
} else
PF_POOLMASK(naddr,
&r->rpool.cur->addr.v.a.addr,
&r->rpool.cur->addr.v.a.mask,
saddr, pd->af);
break;
case PF_IN:
if (r->src.addr.type == PF_ADDR_DYNIFTL) {
switch (pd->af) {
#ifdef INET
case AF_INET:
if (r->src.addr.p.dyn->
pfid_acnt4 < 1)
return (NULL);
PF_POOLMASK(naddr,
&r->src.addr.p.dyn->
pfid_addr4,
&r->src.addr.p.dyn->
pfid_mask4,
daddr, AF_INET);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (r->src.addr.p.dyn->
pfid_acnt6 < 1)
return (NULL);
PF_POOLMASK(naddr,
&r->src.addr.p.dyn->
pfid_addr6,
&r->src.addr.p.dyn->
pfid_mask6,
daddr, AF_INET6);
break;
#endif /* INET6 */
}
} else
PF_POOLMASK(naddr,
&r->src.addr.v.a.addr,
&r->src.addr.v.a.mask, daddr,
pd->af);
break;
}
break;
case PF_RDR: {
if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn))
return (NULL);
if ((r->rpool.opts & PF_POOL_TYPEMASK) ==
PF_POOL_BITMASK)
PF_POOLMASK(naddr, naddr,
&r->rpool.cur->addr.v.a.mask, daddr,
pd->af);
if (r->rpool.proxy_port[1]) {
u_int32_t tmp_nport;
tmp_nport = ((ntohs(dport) -
ntohs(r->dst.port[0])) %
(r->rpool.proxy_port[1] -
r->rpool.proxy_port[0] + 1)) +
r->rpool.proxy_port[0];
/* wrap around if necessary */
if (tmp_nport > 65535)
tmp_nport -= 65535;
*nport = htons((u_int16_t)tmp_nport);
} else if (r->rpool.proxy_port[0])
*nport = htons(r->rpool.proxy_port[0]);
break;
}
default:
return (NULL);
}
}
return (r);
}
int
#ifdef __FreeBSD__
pf_socket_lookup(int direction, struct pf_pdesc *pd, struct inpcb *inp_arg)
#else
pf_socket_lookup(int direction, struct pf_pdesc *pd)
#endif
{
+ INIT_VNET_INET(curvnet);
struct pf_addr *saddr, *daddr;
u_int16_t sport, dport;
#ifdef __FreeBSD__
struct inpcbinfo *pi;
#else
struct inpcbtable *tb;
#endif
struct inpcb *inp;
if (pd == NULL)
return (-1);
pd->lookup.uid = UID_MAX;
pd->lookup.gid = GID_MAX;
pd->lookup.pid = NO_PID; /* XXX: revisit */
#ifdef __FreeBSD__
if (inp_arg != NULL) {
INP_LOCK_ASSERT(inp_arg);
if (inp_arg->inp_socket) {
pd->lookup.uid = inp_arg->inp_socket->so_cred->cr_uid;
pd->lookup.gid =
inp_arg->inp_socket->so_cred->cr_groups[0];
return (1);
} else
return (-1);
}
#endif
switch (pd->proto) {
case IPPROTO_TCP:
if (pd->hdr.tcp == NULL)
return (-1);
sport = pd->hdr.tcp->th_sport;
dport = pd->hdr.tcp->th_dport;
#ifdef __FreeBSD__
pi = &V_tcbinfo;
#else
tb = &tcbtable;
#endif
break;
case IPPROTO_UDP:
if (pd->hdr.udp == NULL)
return (-1);
sport = pd->hdr.udp->uh_sport;
dport = pd->hdr.udp->uh_dport;
#ifdef __FreeBSD__
pi = &V_udbinfo;
#else
tb = &udbtable;
#endif
break;
default:
return (-1);
}
if (direction == PF_IN) {
saddr = pd->src;
daddr = pd->dst;
} else {
u_int16_t p;
p = sport;
sport = dport;
dport = p;
saddr = pd->dst;
daddr = pd->src;
}
switch (pd->af) {
#ifdef INET
case AF_INET:
#ifdef __FreeBSD__
INP_INFO_RLOCK(pi); /* XXX LOR */
inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4,
dport, 0, NULL);
if (inp == NULL) {
inp = in_pcblookup_hash(pi, saddr->v4, sport,
daddr->v4, dport, INPLOOKUP_WILDCARD, NULL);
if (inp == NULL) {
INP_INFO_RUNLOCK(pi);
return (-1);
}
}
#else
inp = in_pcbhashlookup(tb, saddr->v4, sport, daddr->v4, dport);
if (inp == NULL) {
inp = in_pcblookup_listen(tb, daddr->v4, dport, 0);
if (inp == NULL)
return (-1);
}
#endif
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
#ifdef __FreeBSD__
INP_INFO_RLOCK(pi);
inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
&daddr->v6, dport, 0, NULL);
if (inp == NULL) {
inp = in6_pcblookup_hash(pi, &saddr->v6, sport,
&daddr->v6, dport, INPLOOKUP_WILDCARD, NULL);
if (inp == NULL) {
INP_INFO_RUNLOCK(pi);
return (-1);
}
}
#else
inp = in6_pcbhashlookup(tb, &saddr->v6, sport, &daddr->v6,
dport);
if (inp == NULL) {
inp = in6_pcblookup_listen(tb, &daddr->v6, dport, 0);
if (inp == NULL)
return (-1);
}
#endif
break;
#endif /* INET6 */
default:
return (-1);
}
#ifdef __FreeBSD__
INP_RLOCK(inp);
INP_INFO_RUNLOCK(pi);
if ((inp->inp_socket == NULL) || (inp->inp_socket->so_cred == NULL)) {
INP_RUNLOCK(inp);
return (-1);
}
pd->lookup.uid = inp->inp_socket->so_cred->cr_uid;
pd->lookup.gid = inp->inp_socket->so_cred->cr_groups[0];
INP_RUNLOCK(inp);
#else
pd->lookup.uid = inp->inp_socket->so_euid;
pd->lookup.gid = inp->inp_socket->so_egid;
pd->lookup.pid = inp->inp_socket->so_cpid;
#endif
return (1);
}
u_int8_t
pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
{
int hlen;
u_int8_t hdr[60];
u_int8_t *opt, optlen;
u_int8_t wscale = 0;
hlen = th_off << 2; /* hlen <= sizeof(hdr) */
if (hlen <= sizeof(struct tcphdr))
return (0);
if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
return (0);
opt = hdr + sizeof(struct tcphdr);
hlen -= sizeof(struct tcphdr);
while (hlen >= 3) {
switch (*opt) {
case TCPOPT_EOL:
case TCPOPT_NOP:
++opt;
--hlen;
break;
case TCPOPT_WINDOW:
wscale = opt[2];
if (wscale > TCP_MAX_WINSHIFT)
wscale = TCP_MAX_WINSHIFT;
wscale |= PF_WSCALE_FLAG;
/* FALLTHROUGH */
default:
optlen = opt[1];
if (optlen < 2)
optlen = 2;
hlen -= optlen;
opt += optlen;
break;
}
}
return (wscale);
}
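/*
 * [Editor's note] pf_get_wscale() returns more than the shift count:
 * PF_WSCALE_FLAG records that a window-scale option was present at all,
 * which matters because a negotiated shift of 0 is valid and would
 * otherwise be indistinguishable from "no option". Sketch of the encoding
 * (toy macros; the 0x80/0x0f values are believed to match pfvar.h):
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_WSCALE_FLAG		0x80	/* option was present */
#define TOY_WSCALE_MASK		0x0f	/* shift count */
#define TOY_TCP_MAX_WINSHIFT	14

/*
 * Encode as pf_get_wscale() does; the flag bit distinguishes a negotiated
 * shift of 0 from "no window-scale option at all".
 */
static uint8_t
encode_wscale(int present, uint8_t shift)
{
	if (!present)
		return (0);
	if (shift > TOY_TCP_MAX_WINSHIFT)
		shift = TOY_TCP_MAX_WINSHIFT;
	return (shift | TOY_WSCALE_FLAG);
}

int
main(void)
{
	uint8_t w = encode_wscale(1, 7);

	printf("present=%d shift=%d\n", (w & TOY_WSCALE_FLAG) != 0,
	    w & TOY_WSCALE_MASK);	/* present=1 shift=7 */
	return (0);
}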
u_int16_t
pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
{
+ INIT_VNET_INET(curvnet);
int hlen;
u_int8_t hdr[60];
u_int8_t *opt, optlen;
u_int16_t mss = V_tcp_mssdflt;
hlen = th_off << 2; /* hlen <= sizeof(hdr) */
if (hlen <= sizeof(struct tcphdr))
return (0);
if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
return (0);
opt = hdr + sizeof(struct tcphdr);
hlen -= sizeof(struct tcphdr);
while (hlen >= TCPOLEN_MAXSEG) {
switch (*opt) {
case TCPOPT_EOL:
case TCPOPT_NOP:
++opt;
--hlen;
break;
case TCPOPT_MAXSEG:
bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
NTOHS(mss);
/* FALLTHROUGH */
default:
optlen = opt[1];
if (optlen < 2)
optlen = 2;
hlen -= optlen;
opt += optlen;
break;
}
}
return (mss);
}
u_int16_t
pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer)
{
#ifdef INET
+ INIT_VNET_INET(curvnet);
struct sockaddr_in *dst;
struct route ro;
#endif /* INET */
#ifdef INET6
struct sockaddr_in6 *dst6;
struct route_in6 ro6;
#endif /* INET6 */
struct rtentry *rt = NULL;
int hlen = 0; /* make the compiler happy */
u_int16_t mss = V_tcp_mssdflt;
switch (af) {
#ifdef INET
case AF_INET:
hlen = sizeof(struct ip);
bzero(&ro, sizeof(ro));
dst = (struct sockaddr_in *)&ro.ro_dst;
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = addr->v4;
#ifdef __FreeBSD__
#ifdef RTF_PRCLONING
rtalloc_ign(&ro, (RTF_CLONING | RTF_PRCLONING));
#else /* !RTF_PRCLONING */
in_rtalloc_ign(&ro, RTF_CLONING, 0);
#endif
#else /* ! __FreeBSD__ */
rtalloc_noclone(&ro, NO_CLONING);
#endif
rt = ro.ro_rt;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
bzero(&ro6, sizeof(ro6));
dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
dst6->sin6_family = AF_INET6;
dst6->sin6_len = sizeof(*dst6);
dst6->sin6_addr = addr->v6;
#ifdef __FreeBSD__
#ifdef RTF_PRCLONING
rtalloc_ign((struct route *)&ro6,
(RTF_CLONING | RTF_PRCLONING));
#else /* !RTF_PRCLONING */
rtalloc_ign((struct route *)&ro6, RTF_CLONING);
#endif
#else /* ! __FreeBSD__ */
rtalloc_noclone((struct route *)&ro6, NO_CLONING);
#endif
rt = ro6.ro_rt;
break;
#endif /* INET6 */
}
if (rt && rt->rt_ifp) {
mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
mss = max(V_tcp_mssdflt, mss);
RTFREE(rt);
}
mss = min(mss, offer);
mss = max(mss, 64); /* sanity - at least max opt space */
return (mss);
}
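/*
 * [Editor's note] pf_calc_mss() derives the MSS to advertise from the
 * route's interface MTU: mss = max(tcp_mssdflt, if_mtu - hlen -
 * sizeof(struct tcphdr)), clamped to the peer's offer and floored at 64
 * bytes. Worked numbers for an IPv4 route over a 1500-byte MTU
 * (user-space sketch; 536 assumed as the net.inet.tcp.mssdflt default):
 */
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
	int if_mtu = 1500, ip_hlen = 20, tcp_hlen = 20;
	int tcp_mssdflt = 536;		/* assumed sysctl default */
	int offer = 1380;		/* MSS offered in the peer's SYN */
	int mss;

	mss = MAX(tcp_mssdflt, if_mtu - ip_hlen - tcp_hlen);	/* 1460 */
	mss = MIN(mss, offer);					/* 1380 */
	mss = MAX(mss, 64);	/* sanity floor, as in pf_calc_mss() */
	printf("mss=%d\n", mss);
	return (0);
}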
void
pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
{
struct pf_rule *r = s->rule.ptr;
s->rt_kif = NULL;
if (!r->rt || r->rt == PF_FASTROUTE)
return;
switch (s->af) {
#ifdef INET
case AF_INET:
pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL,
&s->nat_src_node);
s->rt_kif = r->rpool.cur->kif;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL,
&s->nat_src_node);
s->rt_kif = r->rpool.cur->kif;
break;
#endif /* INET6 */
}
}
int
pf_test_tcp(struct pf_rule **rm, struct pf_state **sm, int direction,
struct pfi_kif *kif, struct mbuf *m, int off, void *h,
#ifdef __FreeBSD__
struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
struct ifqueue *ifq, struct inpcb *inp)
#else
struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
struct ifqueue *ifq)
#endif
{
+ INIT_VNET_INET(curvnet);
struct pf_rule *nr = NULL;
struct pf_addr *saddr = pd->src, *daddr = pd->dst;
struct tcphdr *th = pd->hdr.tcp;
u_int16_t bport, nport = 0;
sa_family_t af = pd->af;
struct pf_rule *r, *a = NULL;
struct pf_ruleset *ruleset = NULL;
struct pf_src_node *nsn = NULL;
u_short reason;
int rewrite = 0;
int tag = -1, rtableid = -1;
u_int16_t mss = V_tcp_mssdflt;
int asd = 0;
int match = 0;
if (pf_check_congestion(ifq)) {
REASON_SET(&reason, PFRES_CONGEST);
return (PF_DROP);
}
#ifdef __FreeBSD__
if (inp != NULL)
pd->lookup.done = pf_socket_lookup(direction, pd, inp);
else if (debug_pfugidhack) {
PF_UNLOCK();
DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
pd->lookup.done = pf_socket_lookup(direction, pd, inp);
PF_LOCK();
}
#endif
r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
if (direction == PF_OUT) {
bport = nport = th->th_sport;
/* check outgoing packet for BINAT/NAT */
if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn,
saddr, th->th_sport, daddr, th->th_dport,
&pd->naddr, &nport)) != NULL) {
PF_ACPY(&pd->baddr, saddr, af);
pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
&th->th_sum, &pd->naddr, nport, 0, af);
rewrite++;
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
} else {
bport = nport = th->th_dport;
/* check incoming packet for BINAT/RDR */
if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn,
saddr, th->th_sport, daddr, th->th_dport,
&pd->naddr, &nport)) != NULL) {
PF_ACPY(&pd->baddr, daddr, af);
pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
&th->th_sum, &pd->naddr, nport, 0, af);
rewrite++;
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
}
while (r != NULL) {
r->evaluations++;
if (pfi_kif_match(r->kif, kif) == r->ifnot)
r = r->skip[PF_SKIP_IFP].ptr;
else if (r->direction && r->direction != direction)
r = r->skip[PF_SKIP_DIR].ptr;
else if (r->af && r->af != af)
r = r->skip[PF_SKIP_AF].ptr;
else if (r->proto && r->proto != IPPROTO_TCP)
r = r->skip[PF_SKIP_PROTO].ptr;
else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
r->src.neg, kif))
r = r->skip[PF_SKIP_SRC_ADDR].ptr;
else if (r->src.port_op && !pf_match_port(r->src.port_op,
r->src.port[0], r->src.port[1], th->th_sport))
r = r->skip[PF_SKIP_SRC_PORT].ptr;
else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
r->dst.neg, NULL))
r = r->skip[PF_SKIP_DST_ADDR].ptr;
else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
r->dst.port[0], r->dst.port[1], th->th_dport))
r = r->skip[PF_SKIP_DST_PORT].ptr;
else if (r->tos && !(r->tos == pd->tos))
r = TAILQ_NEXT(r, entries);
else if (r->rule_flag & PFRULE_FRAGMENT)
r = TAILQ_NEXT(r, entries);
else if ((r->flagset & th->th_flags) != r->flags)
r = TAILQ_NEXT(r, entries);
else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
#ifdef __FreeBSD__
pf_socket_lookup(direction, pd, inp), 1)) &&
#else
pf_socket_lookup(direction, pd), 1)) &&
#endif
!pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
pd->lookup.uid))
r = TAILQ_NEXT(r, entries);
else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
#ifdef __FreeBSD__
pf_socket_lookup(direction, pd, inp), 1)) &&
#else
pf_socket_lookup(direction, pd), 1)) &&
#endif
!pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
pd->lookup.gid))
r = TAILQ_NEXT(r, entries);
else if (r->prob && r->prob <= arc4random())
r = TAILQ_NEXT(r, entries);
else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
r = TAILQ_NEXT(r, entries);
else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint))
r = TAILQ_NEXT(r, entries);
else {
if (r->tag)
tag = r->tag;
if (r->rtableid >= 0)
rtableid = r->rtableid;
if (r->anchor == NULL) {
match = 1;
*rm = r;
*am = a;
*rsm = ruleset;
if ((*rm)->quick)
break;
r = TAILQ_NEXT(r, entries);
} else
pf_step_into_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match);
}
if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match))
break;
}
r = *rm;
a = *am;
ruleset = *rsm;
REASON_SET(&reason, PFRES_MATCH);
if (r->log || (nr != NULL && nr->natpass && nr->log)) {
if (rewrite)
#ifdef __FreeBSD__
m_copyback(m, off, sizeof(*th), (caddr_t)th);
#else
m_copyback(m, off, sizeof(*th), th);
#endif
PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
a, ruleset, pd);
}
if ((r->action == PF_DROP) &&
((r->rule_flag & PFRULE_RETURNRST) ||
(r->rule_flag & PFRULE_RETURNICMP) ||
(r->rule_flag & PFRULE_RETURN))) {
/* undo NAT changes, if they have taken place */
if (nr != NULL) {
if (direction == PF_OUT) {
pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
&th->th_sum, &pd->baddr, bport, 0, af);
rewrite++;
} else {
pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
&th->th_sum, &pd->baddr, bport, 0, af);
rewrite++;
}
}
if (((r->rule_flag & PFRULE_RETURNRST) ||
(r->rule_flag & PFRULE_RETURN)) &&
!(th->th_flags & TH_RST)) {
u_int32_t ack = ntohl(th->th_seq) + pd->p_len;
if (th->th_flags & TH_SYN)
ack++;
if (th->th_flags & TH_FIN)
ack++;
#ifdef __FreeBSD__
pf_send_tcp(m, r, af, pd->dst,
#else
pf_send_tcp(r, af, pd->dst,
#endif
pd->src, th->th_dport, th->th_sport,
ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
r->return_ttl, 1, 0, pd->eh, kif->pfik_ifp);
} else if ((af == AF_INET) && r->return_icmp)
pf_send_icmp(m, r->return_icmp >> 8,
r->return_icmp & 255, af, r);
else if ((af == AF_INET6) && r->return_icmp6)
pf_send_icmp(m, r->return_icmp6 >> 8,
r->return_icmp6 & 255, af, r);
}
if (r->action == PF_DROP)
return (PF_DROP);
if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
REASON_SET(&reason, PFRES_MEMORY);
return (PF_DROP);
}
if (r->keep_state || nr != NULL ||
(pd->flags & PFDESC_TCP_NORM)) {
/* create new state */
u_int16_t len;
struct pf_state *s = NULL;
struct pf_src_node *sn = NULL;
len = pd->tot_len - off - (th->th_off << 2);
/* check maximums */
if (r->max_states && (r->states >= r->max_states)) {
pf_status.lcounters[LCNT_STATES]++;
REASON_SET(&reason, PFRES_MAXSTATES);
goto cleanup;
}
/* src node for filter rule */
if ((r->rule_flag & PFRULE_SRCTRACK ||
r->rpool.opts & PF_POOL_STICKYADDR) &&
pf_insert_src_node(&sn, r, saddr, af) != 0) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
/* src node for translation rule */
if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
((direction == PF_OUT &&
pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
(pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
s = pool_get(&pf_state_pl, PR_NOWAIT);
if (s == NULL) {
REASON_SET(&reason, PFRES_MEMORY);
cleanup:
if (sn != NULL && sn->states == 0 && sn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, sn);
}
if (nsn != sn && nsn != NULL && nsn->states == 0 &&
nsn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, nsn);
}
return (PF_DROP);
}
bzero(s, sizeof(*s));
s->rule.ptr = r;
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
s->proto = IPPROTO_TCP;
s->direction = direction;
s->af = af;
if (direction == PF_OUT) {
PF_ACPY(&s->gwy.addr, saddr, af);
s->gwy.port = th->th_sport; /* sport */
PF_ACPY(&s->ext.addr, daddr, af);
s->ext.port = th->th_dport;
if (nr != NULL) {
PF_ACPY(&s->lan.addr, &pd->baddr, af);
s->lan.port = bport;
} else {
PF_ACPY(&s->lan.addr, &s->gwy.addr, af);
s->lan.port = s->gwy.port;
}
} else {
PF_ACPY(&s->lan.addr, daddr, af);
s->lan.port = th->th_dport;
PF_ACPY(&s->ext.addr, saddr, af);
s->ext.port = th->th_sport;
if (nr != NULL) {
PF_ACPY(&s->gwy.addr, &pd->baddr, af);
s->gwy.port = bport;
} else {
PF_ACPY(&s->gwy.addr, &s->lan.addr, af);
s->gwy.port = s->lan.port;
}
}
s->src.seqlo = ntohl(th->th_seq);
s->src.seqhi = s->src.seqlo + len + 1;
if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
r->keep_state == PF_STATE_MODULATE) {
/* Generate sequence number modulator */
#ifdef __FreeBSD__
while ((s->src.seqdiff =
pf_new_isn(s) - s->src.seqlo) == 0)
;
#else
while ((s->src.seqdiff =
tcp_rndiss_next() - s->src.seqlo) == 0)
;
#endif
pf_change_a(&th->th_seq, &th->th_sum,
htonl(s->src.seqlo + s->src.seqdiff), 0);
rewrite = 1;
} else
s->src.seqdiff = 0;
if (th->th_flags & TH_SYN) {
s->src.seqhi++;
s->src.wscale = pf_get_wscale(m, off, th->th_off, af);
}
s->src.max_win = MAX(ntohs(th->th_win), 1);
if (s->src.wscale & PF_WSCALE_MASK) {
/* Remove scale factor from initial window */
int win = s->src.max_win;
win += 1 << (s->src.wscale & PF_WSCALE_MASK);
s->src.max_win = (win - 1) >>
(s->src.wscale & PF_WSCALE_MASK);
}
if (th->th_flags & TH_FIN)
s->src.seqhi++;
s->dst.seqhi = 1;
s->dst.max_win = 1;
s->src.state = TCPS_SYN_SENT;
s->dst.state = TCPS_CLOSED;
s->creation = time_second;
s->expire = time_second;
s->timeout = PFTM_TCP_FIRST_PACKET;
pf_set_rt_ifp(s, saddr);
if (sn != NULL) {
s->src_node = sn;
s->src_node->states++;
}
if (nsn != NULL) {
PF_ACPY(&nsn->raddr, &pd->naddr, af);
s->nat_src_node = nsn;
s->nat_src_node->states++;
}
if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
off, pd, th, &s->src, &s->dst)) {
REASON_SET(&reason, PFRES_MEMORY);
pf_src_tree_remove_state(s);
STATE_DEC_COUNTERS(s);
pool_put(&pf_state_pl, s);
return (PF_DROP);
}
if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
&s->src, &s->dst, &rewrite)) {
/* This really shouldn't happen!!! */
DPFPRINTF(PF_DEBUG_URGENT,
("pf_normalize_tcp_stateful failed on first pkt"));
pf_normalize_tcp_cleanup(s);
pf_src_tree_remove_state(s);
STATE_DEC_COUNTERS(s);
pool_put(&pf_state_pl, s);
return (PF_DROP);
}
if (pf_insert_state(BOUND_IFACE(r, kif), s)) {
pf_normalize_tcp_cleanup(s);
REASON_SET(&reason, PFRES_STATEINS);
pf_src_tree_remove_state(s);
STATE_DEC_COUNTERS(s);
pool_put(&pf_state_pl, s);
return (PF_DROP);
} else
*sm = s;
if (tag > 0) {
pf_tag_ref(tag);
s->tag = tag;
}
if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
r->keep_state == PF_STATE_SYNPROXY) {
s->src.state = PF_TCPS_PROXY_SRC;
if (nr != NULL) {
if (direction == PF_OUT) {
pf_change_ap(saddr, &th->th_sport,
pd->ip_sum, &th->th_sum, &pd->baddr,
bport, 0, af);
} else {
pf_change_ap(daddr, &th->th_dport,
pd->ip_sum, &th->th_sum, &pd->baddr,
bport, 0, af);
}
}
s->src.seqhi = htonl(arc4random());
/* Find mss option */
mss = pf_get_mss(m, off, th->th_off, af);
mss = pf_calc_mss(saddr, af, mss);
mss = pf_calc_mss(daddr, af, mss);
s->src.mss = mss;
#ifdef __FreeBSD__
pf_send_tcp(NULL, r, af, daddr, saddr, th->th_dport,
#else
pf_send_tcp(r, af, daddr, saddr, th->th_dport,
#endif
th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL, NULL);
REASON_SET(&reason, PFRES_SYNPROXY);
return (PF_SYNPROXY_DROP);
}
}
/* copy back packet headers if we performed NAT operations */
if (rewrite)
m_copyback(m, off, sizeof(*th), (caddr_t)th);
return (PF_PASS);
}
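/*
 * [Editor's note] When pf_test_tcp() creates state from a SYN it stores
 * the peer's maximum window with the scale factor removed: the pair
 * "win += 1 << ws; max_win = (win - 1) >> ws" computes ceil(win / 2^ws),
 * rounding up so the stored bound never undershoots the real window.
 * Sketch of the same rounding:
 */
#include <stdio.h>

/*
 * De-scale an advertised window as pf_test_tcp() does when creating
 * state from a SYN.
 */
static int
descale(int win, int wscale)
{
	win += 1 << wscale;		/* these two lines compute  */
	return ((win - 1) >> wscale);	/* ceil(win / 2^wscale)     */
}

int
main(void)
{
	printf("%d %d\n", descale(65535, 7), descale(1, 7));	/* 512 1 */
	return (0);
}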
int
pf_test_udp(struct pf_rule **rm, struct pf_state **sm, int direction,
struct pfi_kif *kif, struct mbuf *m, int off, void *h,
#ifdef __FreeBSD__
struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
struct ifqueue *ifq, struct inpcb *inp)
#else
struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
struct ifqueue *ifq)
#endif
{
struct pf_rule *nr = NULL;
struct pf_addr *saddr = pd->src, *daddr = pd->dst;
struct udphdr *uh = pd->hdr.udp;
u_int16_t bport, nport = 0;
sa_family_t af = pd->af;
struct pf_rule *r, *a = NULL;
struct pf_ruleset *ruleset = NULL;
struct pf_src_node *nsn = NULL;
u_short reason;
int rewrite = 0;
int tag = -1, rtableid = -1;
int asd = 0;
int match = 0;
if (pf_check_congestion(ifq)) {
REASON_SET(&reason, PFRES_CONGEST);
return (PF_DROP);
}
#ifdef __FreeBSD__
if (inp != NULL)
pd->lookup.done = pf_socket_lookup(direction, pd, inp);
else if (debug_pfugidhack) {
PF_UNLOCK();
DPFPRINTF(PF_DEBUG_MISC, ("pf: unlocked lookup\n"));
pd->lookup.done = pf_socket_lookup(direction, pd, inp);
PF_LOCK();
}
#endif
r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
if (direction == PF_OUT) {
bport = nport = uh->uh_sport;
/* check outgoing packet for BINAT/NAT */
if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn,
saddr, uh->uh_sport, daddr, uh->uh_dport,
&pd->naddr, &nport)) != NULL) {
PF_ACPY(&pd->baddr, saddr, af);
pf_change_ap(saddr, &uh->uh_sport, pd->ip_sum,
&uh->uh_sum, &pd->naddr, nport, 1, af);
rewrite++;
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
} else {
bport = nport = uh->uh_dport;
/* check incoming packet for BINAT/RDR */
if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn,
saddr, uh->uh_sport, daddr, uh->uh_dport, &pd->naddr,
&nport)) != NULL) {
PF_ACPY(&pd->baddr, daddr, af);
pf_change_ap(daddr, &uh->uh_dport, pd->ip_sum,
&uh->uh_sum, &pd->naddr, nport, 1, af);
rewrite++;
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
}
while (r != NULL) {
r->evaluations++;
if (pfi_kif_match(r->kif, kif) == r->ifnot)
r = r->skip[PF_SKIP_IFP].ptr;
else if (r->direction && r->direction != direction)
r = r->skip[PF_SKIP_DIR].ptr;
else if (r->af && r->af != af)
r = r->skip[PF_SKIP_AF].ptr;
else if (r->proto && r->proto != IPPROTO_UDP)
r = r->skip[PF_SKIP_PROTO].ptr;
else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
r->src.neg, kif))
r = r->skip[PF_SKIP_SRC_ADDR].ptr;
else if (r->src.port_op && !pf_match_port(r->src.port_op,
r->src.port[0], r->src.port[1], uh->uh_sport))
r = r->skip[PF_SKIP_SRC_PORT].ptr;
else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
r->dst.neg, NULL))
r = r->skip[PF_SKIP_DST_ADDR].ptr;
else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
r->dst.port[0], r->dst.port[1], uh->uh_dport))
r = r->skip[PF_SKIP_DST_PORT].ptr;
else if (r->tos && !(r->tos == pd->tos))
r = TAILQ_NEXT(r, entries);
else if (r->rule_flag & PFRULE_FRAGMENT)
r = TAILQ_NEXT(r, entries);
else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
#ifdef __FreeBSD__
pf_socket_lookup(direction, pd, inp), 1)) &&
#else
pf_socket_lookup(direction, pd), 1)) &&
#endif
!pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
pd->lookup.uid))
r = TAILQ_NEXT(r, entries);
else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
#ifdef __FreeBSD__
pf_socket_lookup(direction, pd, inp), 1)) &&
#else
pf_socket_lookup(direction, pd), 1)) &&
#endif
!pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
pd->lookup.gid))
r = TAILQ_NEXT(r, entries);
else if (r->prob && r->prob <= arc4random())
r = TAILQ_NEXT(r, entries);
else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
r = TAILQ_NEXT(r, entries);
else if (r->os_fingerprint != PF_OSFP_ANY)
r = TAILQ_NEXT(r, entries);
else {
if (r->tag)
tag = r->tag;
if (r->rtableid >= 0)
rtableid = r->rtableid;
if (r->anchor == NULL) {
match = 1;
*rm = r;
*am = a;
*rsm = ruleset;
if ((*rm)->quick)
break;
r = TAILQ_NEXT(r, entries);
} else
pf_step_into_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match);
}
if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match))
break;
}
r = *rm;
a = *am;
ruleset = *rsm;
REASON_SET(&reason, PFRES_MATCH);
if (r->log || (nr != NULL && nr->natpass && nr->log)) {
if (rewrite)
#ifdef __FreeBSD__
m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
#else
m_copyback(m, off, sizeof(*uh), uh);
#endif
PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
a, ruleset, pd);
}
if ((r->action == PF_DROP) &&
((r->rule_flag & PFRULE_RETURNICMP) ||
(r->rule_flag & PFRULE_RETURN))) {
/* undo NAT changes, if they have taken place */
if (nr != NULL) {
if (direction == PF_OUT) {
pf_change_ap(saddr, &uh->uh_sport, pd->ip_sum,
&uh->uh_sum, &pd->baddr, bport, 1, af);
rewrite++;
} else {
pf_change_ap(daddr, &uh->uh_dport, pd->ip_sum,
&uh->uh_sum, &pd->baddr, bport, 1, af);
rewrite++;
}
}
if ((af == AF_INET) && r->return_icmp)
pf_send_icmp(m, r->return_icmp >> 8,
r->return_icmp & 255, af, r);
else if ((af == AF_INET6) && r->return_icmp6)
pf_send_icmp(m, r->return_icmp6 >> 8,
r->return_icmp6 & 255, af, r);
}
if (r->action == PF_DROP)
return (PF_DROP);
if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
REASON_SET(&reason, PFRES_MEMORY);
return (PF_DROP);
}
if (r->keep_state || nr != NULL) {
/* create new state */
struct pf_state *s = NULL;
struct pf_src_node *sn = NULL;
/* check maximums */
if (r->max_states && (r->states >= r->max_states)) {
pf_status.lcounters[LCNT_STATES]++;
REASON_SET(&reason, PFRES_MAXSTATES);
goto cleanup;
}
/* src node for filter rule */
if ((r->rule_flag & PFRULE_SRCTRACK ||
r->rpool.opts & PF_POOL_STICKYADDR) &&
pf_insert_src_node(&sn, r, saddr, af) != 0) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
/* src node for translation rule */
if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
((direction == PF_OUT &&
pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
(pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
s = pool_get(&pf_state_pl, PR_NOWAIT);
if (s == NULL) {
REASON_SET(&reason, PFRES_MEMORY);
cleanup:
if (sn != NULL && sn->states == 0 && sn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, sn);
}
if (nsn != sn && nsn != NULL && nsn->states == 0 &&
nsn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, nsn);
}
return (PF_DROP);
}
bzero(s, sizeof(*s));
s->rule.ptr = r;
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
s->proto = IPPROTO_UDP;
s->direction = direction;
s->af = af;
if (direction == PF_OUT) {
PF_ACPY(&s->gwy.addr, saddr, af);
s->gwy.port = uh->uh_sport;
PF_ACPY(&s->ext.addr, daddr, af);
s->ext.port = uh->uh_dport;
if (nr != NULL) {
PF_ACPY(&s->lan.addr, &pd->baddr, af);
s->lan.port = bport;
} else {
PF_ACPY(&s->lan.addr, &s->gwy.addr, af);
s->lan.port = s->gwy.port;
}
} else {
PF_ACPY(&s->lan.addr, daddr, af);
s->lan.port = uh->uh_dport;
PF_ACPY(&s->ext.addr, saddr, af);
s->ext.port = uh->uh_sport;
if (nr != NULL) {
PF_ACPY(&s->gwy.addr, &pd->baddr, af);
s->gwy.port = bport;
} else {
PF_ACPY(&s->gwy.addr, &s->lan.addr, af);
s->gwy.port = s->lan.port;
}
}
s->src.state = PFUDPS_SINGLE;
s->dst.state = PFUDPS_NO_TRAFFIC;
s->creation = time_second;
s->expire = time_second;
s->timeout = PFTM_UDP_FIRST_PACKET;
pf_set_rt_ifp(s, saddr);
if (sn != NULL) {
s->src_node = sn;
s->src_node->states++;
}
if (nsn != NULL) {
PF_ACPY(&nsn->raddr, &pd->naddr, af);
s->nat_src_node = nsn;
s->nat_src_node->states++;
}
if (pf_insert_state(BOUND_IFACE(r, kif), s)) {
REASON_SET(&reason, PFRES_STATEINS);
pf_src_tree_remove_state(s);
STATE_DEC_COUNTERS(s);
pool_put(&pf_state_pl, s);
return (PF_DROP);
} else
*sm = s;
if (tag > 0) {
pf_tag_ref(tag);
s->tag = tag;
}
}
/* copy back packet headers if we performed NAT operations */
if (rewrite)
m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
return (PF_PASS);
}
int
pf_test_icmp(struct pf_rule **rm, struct pf_state **sm, int direction,
struct pfi_kif *kif, struct mbuf *m, int off, void *h,
struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm,
struct ifqueue *ifq)
{
struct pf_rule *nr = NULL;
struct pf_addr *saddr = pd->src, *daddr = pd->dst;
struct pf_rule *r, *a = NULL;
struct pf_ruleset *ruleset = NULL;
struct pf_src_node *nsn = NULL;
u_short reason;
u_int16_t icmpid = 0, bport, nport = 0;
sa_family_t af = pd->af;
u_int8_t icmptype = 0; /* make the compiler happy */
u_int8_t icmpcode = 0; /* make the compiler happy */
int state_icmp = 0;
int tag = -1, rtableid = -1;
#ifdef INET6
int rewrite = 0;
#endif /* INET6 */
int asd = 0;
int match = 0;
if (pf_check_congestion(ifq)) {
REASON_SET(&reason, PFRES_CONGEST);
return (PF_DROP);
}
switch (pd->proto) {
#ifdef INET
case IPPROTO_ICMP:
icmptype = pd->hdr.icmp->icmp_type;
icmpcode = pd->hdr.icmp->icmp_code;
icmpid = pd->hdr.icmp->icmp_id;
if (icmptype == ICMP_UNREACH ||
icmptype == ICMP_SOURCEQUENCH ||
icmptype == ICMP_REDIRECT ||
icmptype == ICMP_TIMXCEED ||
icmptype == ICMP_PARAMPROB)
state_icmp++;
break;
#endif /* INET */
#ifdef INET6
case IPPROTO_ICMPV6:
icmptype = pd->hdr.icmp6->icmp6_type;
icmpcode = pd->hdr.icmp6->icmp6_code;
icmpid = pd->hdr.icmp6->icmp6_id;
if (icmptype == ICMP6_DST_UNREACH ||
icmptype == ICMP6_PACKET_TOO_BIG ||
icmptype == ICMP6_TIME_EXCEEDED ||
icmptype == ICMP6_PARAM_PROB)
state_icmp++;
break;
#endif /* INET6 */
}
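/*
 * state_icmp is set for ICMP error messages; these must match the
 * state of the packet they quote (see pf_test_state_icmp()) and
 * never create a state of their own below.  Only ICMP queries
 * (e.g. echo requests) are given state here.
 */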
r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
if (direction == PF_OUT) {
bport = nport = icmpid;
/* check outgoing packet for BINAT/NAT */
if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn,
saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) !=
NULL) {
PF_ACPY(&pd->baddr, saddr, af);
switch (af) {
#ifdef INET
case AF_INET:
pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
pd->naddr.v4.s_addr, 0);
pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
pd->hdr.icmp->icmp_cksum, icmpid, nport, 0);
pd->hdr.icmp->icmp_id = nport;
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
&pd->naddr, 0);
rewrite++;
break;
#endif /* INET6 */
}
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
} else {
bport = nport = icmpid;
/* check incoming packet for BINAT/RDR */
if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn,
saddr, icmpid, daddr, icmpid, &pd->naddr, &nport)) !=
NULL) {
PF_ACPY(&pd->baddr, daddr, af);
switch (af) {
#ifdef INET
case AF_INET:
pf_change_a(&daddr->v4.s_addr,
pd->ip_sum, pd->naddr.v4.s_addr, 0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
&pd->naddr, 0);
rewrite++;
break;
#endif /* INET6 */
}
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
}
while (r != NULL) {
r->evaluations++;
if (pfi_kif_match(r->kif, kif) == r->ifnot)
r = r->skip[PF_SKIP_IFP].ptr;
else if (r->direction && r->direction != direction)
r = r->skip[PF_SKIP_DIR].ptr;
else if (r->af && r->af != af)
r = r->skip[PF_SKIP_AF].ptr;
else if (r->proto && r->proto != pd->proto)
r = r->skip[PF_SKIP_PROTO].ptr;
else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
r->src.neg, kif))
r = r->skip[PF_SKIP_SRC_ADDR].ptr;
else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
r->dst.neg, NULL))
r = r->skip[PF_SKIP_DST_ADDR].ptr;
else if (r->type && r->type != icmptype + 1)
r = TAILQ_NEXT(r, entries);
else if (r->code && r->code != icmpcode + 1)
r = TAILQ_NEXT(r, entries);
else if (r->tos && !(r->tos == pd->tos))
r = TAILQ_NEXT(r, entries);
else if (r->rule_flag & PFRULE_FRAGMENT)
r = TAILQ_NEXT(r, entries);
else if (r->prob && r->prob <= arc4random())
r = TAILQ_NEXT(r, entries);
else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
r = TAILQ_NEXT(r, entries);
else if (r->os_fingerprint != PF_OSFP_ANY)
r = TAILQ_NEXT(r, entries);
else {
if (r->tag)
tag = r->tag;
if (r->rtableid >= 0)
rtableid = r->rtableid;
if (r->anchor == NULL) {
match = 1;
*rm = r;
*am = a;
*rsm = ruleset;
if ((*rm)->quick)
break;
r = TAILQ_NEXT(r, entries);
} else
pf_step_into_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match);
}
if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match))
break;
}
r = *rm;
a = *am;
ruleset = *rsm;
REASON_SET(&reason, PFRES_MATCH);
if (r->log || (nr != NULL && nr->natpass && nr->log)) {
#ifdef INET6
if (rewrite)
m_copyback(m, off, sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
#endif /* INET6 */
PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
a, ruleset, pd);
}
if (r->action != PF_PASS)
return (PF_DROP);
if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
REASON_SET(&reason, PFRES_MEMORY);
return (PF_DROP);
}
if (!state_icmp && (r->keep_state || nr != NULL)) {
/* create new state */
struct pf_state *s = NULL;
struct pf_src_node *sn = NULL;
/* check maximums */
if (r->max_states && (r->states >= r->max_states)) {
pf_status.lcounters[LCNT_STATES]++;
REASON_SET(&reason, PFRES_MAXSTATES);
goto cleanup;
}
/* src node for filter rule */
if ((r->rule_flag & PFRULE_SRCTRACK ||
r->rpool.opts & PF_POOL_STICKYADDR) &&
pf_insert_src_node(&sn, r, saddr, af) != 0) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
/* src node for translation rule */
if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
((direction == PF_OUT &&
pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
(pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
s = pool_get(&pf_state_pl, PR_NOWAIT);
if (s == NULL) {
REASON_SET(&reason, PFRES_MEMORY);
cleanup:
if (sn != NULL && sn->states == 0 && sn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, sn);
}
if (nsn != sn && nsn != NULL && nsn->states == 0 &&
nsn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, nsn);
}
return (PF_DROP);
}
bzero(s, sizeof(*s));
s->rule.ptr = r;
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
s->proto = pd->proto;
s->direction = direction;
s->af = af;
if (direction == PF_OUT) {
PF_ACPY(&s->gwy.addr, saddr, af);
s->gwy.port = nport;
PF_ACPY(&s->ext.addr, daddr, af);
s->ext.port = 0;
if (nr != NULL) {
PF_ACPY(&s->lan.addr, &pd->baddr, af);
s->lan.port = bport;
} else {
PF_ACPY(&s->lan.addr, &s->gwy.addr, af);
s->lan.port = s->gwy.port;
}
} else {
PF_ACPY(&s->lan.addr, daddr, af);
s->lan.port = nport;
PF_ACPY(&s->ext.addr, saddr, af);
s->ext.port = 0;
if (nr != NULL) {
PF_ACPY(&s->gwy.addr, &pd->baddr, af);
s->gwy.port = bport;
} else {
PF_ACPY(&s->gwy.addr, &s->lan.addr, af);
s->gwy.port = s->lan.port;
}
}
s->creation = time_second;
s->expire = time_second;
s->timeout = PFTM_ICMP_FIRST_PACKET;
pf_set_rt_ifp(s, saddr);
if (sn != NULL) {
s->src_node = sn;
s->src_node->states++;
}
if (nsn != NULL) {
PF_ACPY(&nsn->raddr, &pd->naddr, af);
s->nat_src_node = nsn;
s->nat_src_node->states++;
}
if (pf_insert_state(BOUND_IFACE(r, kif), s)) {
REASON_SET(&reason, PFRES_STATEINS);
pf_src_tree_remove_state(s);
STATE_DEC_COUNTERS(s);
pool_put(&pf_state_pl, s);
return (PF_DROP);
} else
*sm = s;
if (tag > 0) {
pf_tag_ref(tag);
s->tag = tag;
}
}
#ifdef INET6
/* copy back packet headers if we performed IPv6 NAT operations */
if (rewrite)
m_copyback(m, off, sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
#endif /* INET6 */
return (PF_PASS);
}
int
pf_test_other(struct pf_rule **rm, struct pf_state **sm, int direction,
struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
struct pf_rule **am, struct pf_ruleset **rsm, struct ifqueue *ifq)
{
struct pf_rule *nr = NULL;
struct pf_rule *r, *a = NULL;
struct pf_ruleset *ruleset = NULL;
struct pf_src_node *nsn = NULL;
struct pf_addr *saddr = pd->src, *daddr = pd->dst;
sa_family_t af = pd->af;
u_short reason;
int tag = -1, rtableid = -1;
int asd = 0;
int match = 0;
if (pf_check_congestion(ifq)) {
REASON_SET(&reason, PFRES_CONGEST);
return (PF_DROP);
}
r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
if (direction == PF_OUT) {
/* check outgoing packet for BINAT/NAT */
if ((nr = pf_get_translation(pd, m, off, PF_OUT, kif, &nsn,
saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) {
PF_ACPY(&pd->baddr, saddr, af);
switch (af) {
#ifdef INET
case AF_INET:
pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
pd->naddr.v4.s_addr, 0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
PF_ACPY(saddr, &pd->naddr, af);
break;
#endif /* INET6 */
}
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
} else {
/* check incoming packet for BINAT/RDR */
if ((nr = pf_get_translation(pd, m, off, PF_IN, kif, &nsn,
saddr, 0, daddr, 0, &pd->naddr, NULL)) != NULL) {
PF_ACPY(&pd->baddr, daddr, af);
switch (af) {
#ifdef INET
case AF_INET:
pf_change_a(&daddr->v4.s_addr,
pd->ip_sum, pd->naddr.v4.s_addr, 0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
PF_ACPY(daddr, &pd->naddr, af);
break;
#endif /* INET6 */
}
if (nr->natpass)
r = NULL;
pd->nat_rule = nr;
}
}
while (r != NULL) {
r->evaluations++;
if (pfi_kif_match(r->kif, kif) == r->ifnot)
r = r->skip[PF_SKIP_IFP].ptr;
else if (r->direction && r->direction != direction)
r = r->skip[PF_SKIP_DIR].ptr;
else if (r->af && r->af != af)
r = r->skip[PF_SKIP_AF].ptr;
else if (r->proto && r->proto != pd->proto)
r = r->skip[PF_SKIP_PROTO].ptr;
else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
r->src.neg, kif))
r = r->skip[PF_SKIP_SRC_ADDR].ptr;
else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
r->dst.neg, NULL))
r = r->skip[PF_SKIP_DST_ADDR].ptr;
else if (r->tos && !(r->tos == pd->tos))
r = TAILQ_NEXT(r, entries);
else if (r->rule_flag & PFRULE_FRAGMENT)
r = TAILQ_NEXT(r, entries);
else if (r->prob && r->prob <= arc4random())
r = TAILQ_NEXT(r, entries);
else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
r = TAILQ_NEXT(r, entries);
else if (r->os_fingerprint != PF_OSFP_ANY)
r = TAILQ_NEXT(r, entries);
else {
if (r->tag)
tag = r->tag;
if (r->rtableid >= 0)
rtableid = r->rtableid;
if (r->anchor == NULL) {
match = 1;
*rm = r;
*am = a;
*rsm = ruleset;
if ((*rm)->quick)
break;
r = TAILQ_NEXT(r, entries);
} else
pf_step_into_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match);
}
if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match))
break;
}
r = *rm;
a = *am;
ruleset = *rsm;
REASON_SET(&reason, PFRES_MATCH);
if (r->log || (nr != NULL && nr->natpass && nr->log))
PFLOG_PACKET(kif, h, m, af, direction, reason, r->log ? r : nr,
a, ruleset, pd);
if ((r->action == PF_DROP) &&
((r->rule_flag & PFRULE_RETURNICMP) ||
(r->rule_flag & PFRULE_RETURN))) {
struct pf_addr *a = NULL;
if (nr != NULL) {
if (direction == PF_OUT)
a = saddr;
else
a = daddr;
}
if (a != NULL) {
switch (af) {
#ifdef INET
case AF_INET:
pf_change_a(&a->v4.s_addr, pd->ip_sum,
pd->baddr.v4.s_addr, 0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
PF_ACPY(a, &pd->baddr, af);
break;
#endif /* INET6 */
}
}
if ((af == AF_INET) && r->return_icmp)
pf_send_icmp(m, r->return_icmp >> 8,
r->return_icmp & 255, af, r);
else if ((af == AF_INET6) && r->return_icmp6)
pf_send_icmp(m, r->return_icmp6 >> 8,
r->return_icmp6 & 255, af, r);
}
if (r->action != PF_PASS)
return (PF_DROP);
if (pf_tag_packet(m, pd->pf_mtag, tag, rtableid)) {
REASON_SET(&reason, PFRES_MEMORY);
return (PF_DROP);
}
if (r->keep_state || nr != NULL) {
/* create new state */
struct pf_state *s = NULL;
struct pf_src_node *sn = NULL;
/* check maximums */
if (r->max_states && (r->states >= r->max_states)) {
pf_status.lcounters[LCNT_STATES]++;
REASON_SET(&reason, PFRES_MAXSTATES);
goto cleanup;
}
/* src node for filter rule */
if ((r->rule_flag & PFRULE_SRCTRACK ||
r->rpool.opts & PF_POOL_STICKYADDR) &&
pf_insert_src_node(&sn, r, saddr, af) != 0) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
/* src node for translation rule */
if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
((direction == PF_OUT &&
pf_insert_src_node(&nsn, nr, &pd->baddr, af) != 0) ||
(pf_insert_src_node(&nsn, nr, saddr, af) != 0))) {
REASON_SET(&reason, PFRES_SRCLIMIT);
goto cleanup;
}
s = pool_get(&pf_state_pl, PR_NOWAIT);
if (s == NULL) {
REASON_SET(&reason, PFRES_MEMORY);
cleanup:
if (sn != NULL && sn->states == 0 && sn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, sn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, sn);
}
if (nsn != sn && nsn != NULL && nsn->states == 0 &&
nsn->expire == 0) {
RB_REMOVE(pf_src_tree, &tree_src_tracking, nsn);
pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
pf_status.src_nodes--;
pool_put(&pf_src_tree_pl, nsn);
}
return (PF_DROP);
}
bzero(s, sizeof(*s));
s->rule.ptr = r;
s->nat_rule.ptr = nr;
s->anchor.ptr = a;
STATE_INC_COUNTERS(s);
s->allow_opts = r->allow_opts;
s->log = r->log & PF_LOG_ALL;
if (nr != NULL)
s->log |= nr->log & PF_LOG_ALL;
s->proto = pd->proto;
s->direction = direction;
s->af = af;
if (direction == PF_OUT) {
PF_ACPY(&s->gwy.addr, saddr, af);
PF_ACPY(&s->ext.addr, daddr, af);
if (nr != NULL)
PF_ACPY(&s->lan.addr, &pd->baddr, af);
else
PF_ACPY(&s->lan.addr, &s->gwy.addr, af);
} else {
PF_ACPY(&s->lan.addr, daddr, af);
PF_ACPY(&s->ext.addr, saddr, af);
if (nr != NULL)
PF_ACPY(&s->gwy.addr, &pd->baddr, af);
else
PF_ACPY(&s->gwy.addr, &s->lan.addr, af);
}
s->src.state = PFOTHERS_SINGLE;
s->dst.state = PFOTHERS_NO_TRAFFIC;
s->creation = time_second;
s->expire = time_second;
s->timeout = PFTM_OTHER_FIRST_PACKET;
pf_set_rt_ifp(s, saddr);
if (sn != NULL) {
s->src_node = sn;
s->src_node->states++;
}
if (nsn != NULL) {
PF_ACPY(&nsn->raddr, &pd->naddr, af);
s->nat_src_node = nsn;
s->nat_src_node->states++;
}
if (pf_insert_state(BOUND_IFACE(r, kif), s)) {
REASON_SET(&reason, PFRES_STATEINS);
pf_src_tree_remove_state(s);
STATE_DEC_COUNTERS(s);
pool_put(&pf_state_pl, s);
return (PF_DROP);
} else
*sm = s;
if (tag > 0) {
pf_tag_ref(tag);
s->tag = tag;
}
}
return (PF_PASS);
}
int
pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
struct pf_ruleset **rsm)
{
struct pf_rule *r, *a = NULL;
struct pf_ruleset *ruleset = NULL;
sa_family_t af = pd->af;
u_short reason;
int tag = -1;
int asd = 0;
int match = 0;
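/*
 * This path sees packets for which no transport header is
 * available (e.g. fragments that were not reassembled), so rules
 * that require ports, TCP flags or ICMP type/code cannot match
 * and are stepped over in the loop below.
 */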
r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
while (r != NULL) {
r->evaluations++;
if (pfi_kif_match(r->kif, kif) == r->ifnot)
r = r->skip[PF_SKIP_IFP].ptr;
else if (r->direction && r->direction != direction)
r = r->skip[PF_SKIP_DIR].ptr;
else if (r->af && r->af != af)
r = r->skip[PF_SKIP_AF].ptr;
else if (r->proto && r->proto != pd->proto)
r = r->skip[PF_SKIP_PROTO].ptr;
else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
r->src.neg, kif))
r = r->skip[PF_SKIP_SRC_ADDR].ptr;
else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
r->dst.neg, NULL))
r = r->skip[PF_SKIP_DST_ADDR].ptr;
else if (r->tos && !(r->tos == pd->tos))
r = TAILQ_NEXT(r, entries);
else if (r->os_fingerprint != PF_OSFP_ANY)
r = TAILQ_NEXT(r, entries);
else if (pd->proto == IPPROTO_UDP &&
(r->src.port_op || r->dst.port_op))
r = TAILQ_NEXT(r, entries);
else if (pd->proto == IPPROTO_TCP &&
(r->src.port_op || r->dst.port_op || r->flagset))
r = TAILQ_NEXT(r, entries);
else if ((pd->proto == IPPROTO_ICMP ||
pd->proto == IPPROTO_ICMPV6) &&
(r->type || r->code))
r = TAILQ_NEXT(r, entries);
else if (r->prob && r->prob <= arc4random())
r = TAILQ_NEXT(r, entries);
else if (r->match_tag && !pf_match_tag(m, r, pd->pf_mtag, &tag))
r = TAILQ_NEXT(r, entries);
else {
if (r->anchor == NULL) {
match = 1;
*rm = r;
*am = a;
*rsm = ruleset;
if ((*rm)->quick)
break;
r = TAILQ_NEXT(r, entries);
} else
pf_step_into_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match);
}
if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
PF_RULESET_FILTER, &r, &a, &match))
break;
}
r = *rm;
a = *am;
ruleset = *rsm;
REASON_SET(&reason, PFRES_MATCH);
if (r->log)
PFLOG_PACKET(kif, h, m, af, direction, reason, r, a, ruleset,
pd);
if (r->action != PF_PASS)
return (PF_DROP);
if (pf_tag_packet(m, pd->pf_mtag, tag, -1)) {
REASON_SET(&reason, PFRES_MEMORY);
return (PF_DROP);
}
return (PF_PASS);
}
int
pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
u_short *reason)
{
struct pf_state_cmp key;
struct tcphdr *th = pd->hdr.tcp;
u_int16_t win = ntohs(th->th_win);
u_int32_t ack, end, seq, orig_seq;
u_int8_t sws, dws;
int ackskew;
int copyback = 0;
struct pf_state_peer *src, *dst;
key.af = pd->af;
key.proto = IPPROTO_TCP;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd->src, key.af);
PF_ACPY(&key.gwy.addr, pd->dst, key.af);
key.ext.port = th->th_sport;
key.gwy.port = th->th_dport;
} else {
PF_ACPY(&key.lan.addr, pd->src, key.af);
PF_ACPY(&key.ext.addr, pd->dst, key.af);
key.lan.port = th->th_sport;
key.ext.port = th->th_dport;
}
STATE_LOOKUP();
if (direction == (*state)->direction) {
src = &(*state)->src;
dst = &(*state)->dst;
} else {
src = &(*state)->dst;
dst = &(*state)->src;
}
if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
if (direction != (*state)->direction) {
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_SYNPROXY_DROP);
}
if (th->th_flags & TH_SYN) {
if (ntohl(th->th_seq) != (*state)->src.seqlo) {
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_DROP);
}
#ifdef __FreeBSD__
pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
#else
pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
#endif
pd->src, th->th_dport, th->th_sport,
(*state)->src.seqhi, ntohl(th->th_seq) + 1,
TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1,
0, NULL, NULL);
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_SYNPROXY_DROP);
} else if (!(th->th_flags & TH_ACK) ||
(ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
(ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_DROP);
} else if ((*state)->src_node != NULL &&
pf_src_connlimit(state)) {
REASON_SET(reason, PFRES_SRCLIMIT);
return (PF_DROP);
} else
(*state)->src.state = PF_TCPS_PROXY_DST;
}
if ((*state)->src.state == PF_TCPS_PROXY_DST) {
struct pf_state_host *src, *dst;
if (direction == PF_OUT) {
src = &(*state)->gwy;
dst = &(*state)->ext;
} else {
src = &(*state)->ext;
dst = &(*state)->lan;
}
if (direction == (*state)->direction) {
if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
(ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
(ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_DROP);
}
(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
if ((*state)->dst.seqhi == 1)
(*state)->dst.seqhi = htonl(arc4random());
#ifdef __FreeBSD__
pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
&src->addr,
#else
pf_send_tcp((*state)->rule.ptr, pd->af, &src->addr,
#endif
&dst->addr, src->port, dst->port,
(*state)->dst.seqhi, 0, TH_SYN, 0,
(*state)->src.mss, 0, 0, (*state)->tag, NULL, NULL);
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_SYNPROXY_DROP);
} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
(TH_SYN|TH_ACK)) ||
(ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_DROP);
} else {
(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
(*state)->dst.seqlo = ntohl(th->th_seq);
#ifdef __FreeBSD__
pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
#else
pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
#endif
pd->src, th->th_dport, th->th_sport,
ntohl(th->th_ack), ntohl(th->th_seq) + 1,
TH_ACK, (*state)->src.max_win, 0, 0, 0,
(*state)->tag, NULL, NULL);
#ifdef __FreeBSD__
pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
&src->addr,
#else
pf_send_tcp((*state)->rule.ptr, pd->af, &src->addr,
#endif
&dst->addr, src->port, dst->port,
(*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
TH_ACK, (*state)->dst.max_win, 0, 0, 1,
0, NULL, NULL);
(*state)->src.seqdiff = (*state)->dst.seqhi -
(*state)->src.seqlo;
(*state)->dst.seqdiff = (*state)->src.seqhi -
(*state)->dst.seqlo;
(*state)->src.seqhi = (*state)->src.seqlo +
(*state)->dst.max_win;
(*state)->dst.seqhi = (*state)->dst.seqlo +
(*state)->src.max_win;
(*state)->src.wscale = (*state)->dst.wscale = 0;
(*state)->src.state = (*state)->dst.state =
TCPS_ESTABLISHED;
REASON_SET(reason, PFRES_SYNPROXY);
return (PF_SYNPROXY_DROP);
}
}
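/*
 * The two blocks above implement the synproxy handshake: pf first
 * answers the initiator's SYN itself (PF_TCPS_PROXY_SRC), then
 * opens its own handshake to the real destination
 * (PF_TCPS_PROXY_DST), and finally splices the two
 * half-connections together by recording a per-direction sequence
 * number offset in src/dst seqdiff.
 */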
if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
dst->state >= TCPS_FIN_WAIT_2 &&
src->state >= TCPS_FIN_WAIT_2) {
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: state reuse ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf("\n");
}
/* XXX make sure it's the same direction ?? */
(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
pf_unlink_state(*state);
*state = NULL;
return (PF_DROP);
}
if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
sws = src->wscale & PF_WSCALE_MASK;
dws = dst->wscale & PF_WSCALE_MASK;
} else
sws = dws = 0;
/*
* Sequence tracking algorithm from Guido van Rooij's paper:
* http://www.madison-gurkha.com/publications/tcp_filtering/
* tcp_filtering.ps
*/
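/*
 * Roughly: for each peer we track the window of acceptable
 * sequence numbers [seqlo, seqhi] and the largest window
 * advertised so far (max_win).  A segment passes only if it lies
 * within the peer's window and its ACK stays within MAXACKWINDOW
 * of what the other end has sent.
 */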
orig_seq = seq = ntohl(th->th_seq);
if (src->seqlo == 0) {
/* First packet from this end. Set its state */
if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
src->scrub == NULL) {
if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
REASON_SET(reason, PFRES_MEMORY);
return (PF_DROP);
}
}
/* Deferred generation of sequence number modulator */
if (dst->seqdiff && !src->seqdiff) {
#ifdef __FreeBSD__
while ((src->seqdiff = pf_new_isn(*state) - seq) == 0)
;
#else
while ((src->seqdiff = tcp_rndiss_next() - seq) == 0)
;
#endif
ack = ntohl(th->th_ack) - dst->seqdiff;
pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
src->seqdiff), 0);
pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
copyback = 1;
} else {
ack = ntohl(th->th_ack);
}
end = seq + pd->p_len;
if (th->th_flags & TH_SYN) {
end++;
if (dst->wscale & PF_WSCALE_FLAG) {
src->wscale = pf_get_wscale(m, off, th->th_off,
pd->af);
if (src->wscale & PF_WSCALE_FLAG) {
/* Remove scale factor from initial
* window */
sws = src->wscale & PF_WSCALE_MASK;
win = ((u_int32_t)win + (1 << sws) - 1)
>> sws;
dws = dst->wscale & PF_WSCALE_MASK;
} else {
/* fixup other window */
dst->max_win <<= dst->wscale &
PF_WSCALE_MASK;
/* in case of a retrans SYN|ACK */
dst->wscale = 0;
}
}
}
if (th->th_flags & TH_FIN)
end++;
src->seqlo = seq;
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
/*
* May need to slide the window (seqhi may have been set by
* the crappy stack check or if we picked up the connection
* after establishment)
*/
if (src->seqhi == 1 ||
SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
src->seqhi = end + MAX(1, dst->max_win << dws);
if (win > src->max_win)
src->max_win = win;
} else {
ack = ntohl(th->th_ack) - dst->seqdiff;
if (src->seqdiff) {
/* Modulate sequence numbers */
pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
src->seqdiff), 0);
pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
copyback = 1;
}
end = seq + pd->p_len;
if (th->th_flags & TH_SYN)
end++;
if (th->th_flags & TH_FIN)
end++;
}
if ((th->th_flags & TH_ACK) == 0) {
/* Let it pass through the ack skew check */
ack = dst->seqlo;
} else if ((ack == 0 &&
(th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
/* broken tcp stacks do not set ack */
(dst->state < TCPS_SYN_SENT)) {
/*
* Many stacks (ours included) will set the ACK number in an
* FIN|ACK if the SYN times out -- no sequence to ACK.
*/
ack = dst->seqlo;
}
if (seq == end) {
/* Ease sequencing restrictions on no data packets */
seq = src->seqlo;
end = seq;
}
ackskew = dst->seqlo - ack;
/*
* Need to demodulate the sequence numbers in any TCP SACK options
* (Selective ACK). We could optionally validate the SACK values
* against the current ACK window, either forwards or backwards, but
* I'm not confident that SACK has been implemented properly
* everywhere. It wouldn't surprise me if several stacks accidentally
* SACK too far backwards of previously ACKed data. There really aren't
* any security implications of bad SACKing unless the target stack
* doesn't validate the option length correctly. Someone trying to
* spoof into a TCP connection won't bother blindly sending SACK
* options anyway.
*/
if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
if (pf_modulate_sack(m, off, pd, th, dst))
copyback = 1;
}
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
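/*
 * MAXACKWINDOW = 0xffff + 1500 = 67035: a maximal unscaled TCP
 * window plus roughly one MTU of slack.
 */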
if (SEQ_GEQ(src->seqhi, end) &&
/* Last octet inside other's window space */
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
/* Retrans: not more than one window back */
(ackskew >= -MAXACKWINDOW) &&
/* Acking not more than one reassembled fragment backwards */
(ackskew <= (MAXACKWINDOW << sws)) &&
/* Acking not more than one window forward */
((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
(orig_seq == src->seqlo + 1) || (pd->flags & PFDESC_IP_REAS) == 0)) {
/* Require an exact/+1 sequence match on resets when possible */
if (dst->scrub || src->scrub) {
if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
*state, src, dst, &copyback))
return (PF_DROP);
}
/* update max window */
if (src->max_win < win)
src->max_win = win;
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
/* update states */
if (th->th_flags & TH_SYN)
if (src->state < TCPS_SYN_SENT)
src->state = TCPS_SYN_SENT;
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_ACK) {
if (dst->state == TCPS_SYN_SENT) {
dst->state = TCPS_ESTABLISHED;
if (src->state == TCPS_ESTABLISHED &&
(*state)->src_node != NULL &&
pf_src_connlimit(state)) {
REASON_SET(reason, PFRES_SRCLIMIT);
return (PF_DROP);
}
} else if (dst->state == TCPS_CLOSING)
dst->state = TCPS_FIN_WAIT_2;
}
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* update expire time */
(*state)->expire = time_second;
if (src->state >= TCPS_FIN_WAIT_2 &&
dst->state >= TCPS_FIN_WAIT_2)
(*state)->timeout = PFTM_TCP_CLOSED;
else if (src->state >= TCPS_CLOSING &&
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_FIN_WAIT;
else if (src->state < TCPS_ESTABLISHED ||
dst->state < TCPS_ESTABLISHED)
(*state)->timeout = PFTM_TCP_OPENING;
else if (src->state >= TCPS_CLOSING ||
dst->state >= TCPS_CLOSING)
(*state)->timeout = PFTM_TCP_CLOSING;
else
(*state)->timeout = PFTM_TCP_ESTABLISHED;
/* Fall through to PASS packet */
} else if ((dst->state < TCPS_SYN_SENT ||
dst->state >= TCPS_FIN_WAIT_2 ||
src->state >= TCPS_FIN_WAIT_2) &&
SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
/* Within a window forward of the originating packet */
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
/* Within a window backward of the originating packet */
/*
* This currently handles three situations:
* 1) Stupid stacks will shotgun SYNs before their peer
* replies.
* 2) When PF catches an already established stream (the
* firewall rebooted, the state table was flushed, routes
* changed...)
* 3) Packets get funky immediately after the connection
* closes (this should catch Solaris spurious ACK|FINs
* that web servers like to spew after a close)
*
* This must be a little more careful than the above code
* since packet floods will also be caught here. We don't
* update the TTL here to mitigate the damage of a packet
* flood and so the same code can handle awkward establishment
* and a loosened connection close.
* In the establishment case, a correct peer response will
* validate the connection, go through the normal state code
* and keep updating the state TTL.
*/
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: loose state match: ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
"pkts=%llu:%llu\n", seq, orig_seq, ack, pd->p_len,
#ifdef __FreeBSD__
ackskew, (unsigned long long)(*state)->packets[0],
(unsigned long long)(*state)->packets[1]);
#else
ackskew, (*state)->packets[0],
(*state)->packets[1]);
#endif
}
if (dst->scrub || src->scrub) {
if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
*state, src, dst, &copyback))
return (PF_DROP);
}
/* update max window */
if (src->max_win < win)
src->max_win = win;
/* synchronize sequencing */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
/*
* Cannot set dst->seqhi here since this could be a shotgunned
* SYN and not an already established connection.
*/
if (th->th_flags & TH_FIN)
if (src->state < TCPS_CLOSING)
src->state = TCPS_CLOSING;
if (th->th_flags & TH_RST)
src->state = dst->state = TCPS_TIME_WAIT;
/* Fall through to PASS packet */
} else {
if ((*state)->dst.state == TCPS_SYN_SENT &&
(*state)->src.state == TCPS_SYN_SENT) {
/* Send RST for state mismatches during handshake */
if (!(th->th_flags & TH_RST))
#ifdef __FreeBSD__
pf_send_tcp(m, (*state)->rule.ptr, pd->af,
#else
pf_send_tcp((*state)->rule.ptr, pd->af,
#endif
pd->dst, pd->src, th->th_dport,
th->th_sport, ntohl(th->th_ack), 0,
TH_RST, 0, 0,
(*state)->rule.ptr->return_ttl, 1, 0,
pd->eh, kif->pfik_ifp);
src->seqlo = 0;
src->seqhi = 1;
src->max_win = 1;
} else if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: BAD state: ");
pf_print_state(*state);
pf_print_flags(th->th_flags);
printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
"pkts=%llu:%llu dir=%s,%s\n",
seq, orig_seq, ack, pd->p_len, ackskew,
#ifdef __FreeBSD__
(unsigned long long)(*state)->packets[0],
(unsigned long long)(*state)->packets[1],
#else
(*state)->packets[0], (*state)->packets[1],
#endif
direction == PF_IN ? "in" : "out",
direction == (*state)->direction ? "fwd" : "rev");
printf("pf: State failure on: %c %c %c %c | %c %c\n",
SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
' ': '2',
(ackskew >= -MAXACKWINDOW) ? ' ' : '3',
(ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
}
REASON_SET(reason, PFRES_BADSTATE);
return (PF_DROP);
}
/* Any packets which have gotten here are to be passed */
/* translate source/destination address, if necessary */
if (STATE_TRANSLATE(*state)) {
if (direction == PF_OUT)
pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
&th->th_sum, &(*state)->gwy.addr,
(*state)->gwy.port, 0, pd->af);
else
pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
&th->th_sum, &(*state)->lan.addr,
(*state)->lan.port, 0, pd->af);
m_copyback(m, off, sizeof(*th), (caddr_t)th);
} else if (copyback) {
/* Copyback sequence modulation or stateful scrub changes */
m_copyback(m, off, sizeof(*th), (caddr_t)th);
}
return (PF_PASS);
}
int
pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
{
struct pf_state_peer *src, *dst;
struct pf_state_cmp key;
struct udphdr *uh = pd->hdr.udp;
key.af = pd->af;
key.proto = IPPROTO_UDP;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd->src, key.af);
PF_ACPY(&key.gwy.addr, pd->dst, key.af);
key.ext.port = uh->uh_sport;
key.gwy.port = uh->uh_dport;
} else {
PF_ACPY(&key.lan.addr, pd->src, key.af);
PF_ACPY(&key.ext.addr, pd->dst, key.af);
key.lan.port = uh->uh_sport;
key.ext.port = uh->uh_dport;
}
STATE_LOOKUP();
if (direction == (*state)->direction) {
src = &(*state)->src;
dst = &(*state)->dst;
} else {
src = &(*state)->dst;
dst = &(*state)->src;
}
/* update states */
if (src->state < PFUDPS_SINGLE)
src->state = PFUDPS_SINGLE;
if (dst->state == PFUDPS_SINGLE)
dst->state = PFUDPS_MULTIPLE;
/* update expire time */
(*state)->expire = time_second;
if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
(*state)->timeout = PFTM_UDP_MULTIPLE;
else
(*state)->timeout = PFTM_UDP_SINGLE;
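/*
 * PFUDPS_SINGLE means traffic has been seen from one side only;
 * once both sides have spoken (PFUDPS_MULTIPLE) the state earns
 * the longer PFTM_UDP_MULTIPLE timeout.
 */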
/* translate source/destination address, if necessary */
if (STATE_TRANSLATE(*state)) {
if (direction == PF_OUT)
pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
&uh->uh_sum, &(*state)->gwy.addr,
(*state)->gwy.port, 1, pd->af);
else
pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
&uh->uh_sum, &(*state)->lan.addr,
(*state)->lan.port, 1, pd->af);
m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
}
return (PF_PASS);
}
int
pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
{
struct pf_addr *saddr = pd->src, *daddr = pd->dst;
u_int16_t icmpid = 0; /* make the compiler happy */
u_int16_t *icmpsum = NULL; /* make the compiler happy */
u_int8_t icmptype = 0; /* make the compiler happy */
int state_icmp = 0;
struct pf_state_cmp key;
switch (pd->proto) {
#ifdef INET
case IPPROTO_ICMP:
icmptype = pd->hdr.icmp->icmp_type;
icmpid = pd->hdr.icmp->icmp_id;
icmpsum = &pd->hdr.icmp->icmp_cksum;
if (icmptype == ICMP_UNREACH ||
icmptype == ICMP_SOURCEQUENCH ||
icmptype == ICMP_REDIRECT ||
icmptype == ICMP_TIMXCEED ||
icmptype == ICMP_PARAMPROB)
state_icmp++;
break;
#endif /* INET */
#ifdef INET6
case IPPROTO_ICMPV6:
icmptype = pd->hdr.icmp6->icmp6_type;
icmpid = pd->hdr.icmp6->icmp6_id;
icmpsum = &pd->hdr.icmp6->icmp6_cksum;
if (icmptype == ICMP6_DST_UNREACH ||
icmptype == ICMP6_PACKET_TOO_BIG ||
icmptype == ICMP6_TIME_EXCEEDED ||
icmptype == ICMP6_PARAM_PROB)
state_icmp++;
break;
#endif /* INET6 */
}
if (!state_icmp) {
/*
* ICMP query/reply message not related to a TCP/UDP packet.
* Search for an ICMP state.
*/
key.af = pd->af;
key.proto = pd->proto;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd->src, key.af);
PF_ACPY(&key.gwy.addr, pd->dst, key.af);
key.ext.port = 0;
key.gwy.port = icmpid;
} else {
PF_ACPY(&key.lan.addr, pd->src, key.af);
PF_ACPY(&key.ext.addr, pd->dst, key.af);
key.lan.port = icmpid;
key.ext.port = 0;
}
STATE_LOOKUP();
(*state)->expire = time_second;
(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
/* translate source/destination address, if necessary */
if (STATE_TRANSLATE(*state)) {
if (direction == PF_OUT) {
switch (pd->af) {
#ifdef INET
case AF_INET:
pf_change_a(&saddr->v4.s_addr,
pd->ip_sum,
(*state)->gwy.addr.v4.s_addr, 0);
pd->hdr.icmp->icmp_cksum =
pf_cksum_fixup(
pd->hdr.icmp->icmp_cksum, icmpid,
(*state)->gwy.port, 0);
pd->hdr.icmp->icmp_id =
(*state)->gwy.port;
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
pf_change_a6(saddr,
&pd->hdr.icmp6->icmp6_cksum,
&(*state)->gwy.addr, 0);
m_copyback(m, off,
sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
break;
#endif /* INET6 */
}
} else {
switch (pd->af) {
#ifdef INET
case AF_INET:
pf_change_a(&daddr->v4.s_addr,
pd->ip_sum,
(*state)->lan.addr.v4.s_addr, 0);
pd->hdr.icmp->icmp_cksum =
pf_cksum_fixup(
pd->hdr.icmp->icmp_cksum, icmpid,
(*state)->lan.port, 0);
pd->hdr.icmp->icmp_id =
(*state)->lan.port;
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
pf_change_a6(daddr,
&pd->hdr.icmp6->icmp6_cksum,
&(*state)->lan.addr, 0);
m_copyback(m, off,
sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
break;
#endif /* INET6 */
}
}
}
return (PF_PASS);
} else {
/*
* ICMP error message in response to a TCP/UDP packet.
* Extract the inner TCP/UDP header and search for that state.
*/
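/*
 * pd2 below describes the packet quoted inside the ICMP error.
 * That packet traveled in the opposite direction to the error
 * itself, which is why the state keys are built with pd2.src and
 * pd2.dst swapped relative to the lookups above.
 */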
struct pf_pdesc pd2;
#ifdef INET
struct ip h2;
#endif /* INET */
#ifdef INET6
struct ip6_hdr h2_6;
int terminal = 0;
#endif /* INET6 */
int ipoff2 = 0; /* make the compiler happy */
int off2 = 0; /* make the compiler happy */
pd2.af = pd->af;
switch (pd->af) {
#ifdef INET
case AF_INET:
/* offset of h2 in mbuf chain */
ipoff2 = off + ICMP_MINLEN;
if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMP error message too short "
"(ip)\n"));
return (PF_DROP);
}
/*
* ICMP error messages don't refer to non-first
* fragments
*/
if (h2.ip_off & htons(IP_OFFMASK)) {
REASON_SET(reason, PFRES_FRAG);
return (PF_DROP);
}
/* offset of protocol header that follows h2 */
off2 = ipoff2 + (h2.ip_hl << 2);
pd2.proto = h2.ip_p;
pd2.src = (struct pf_addr *)&h2.ip_src;
pd2.dst = (struct pf_addr *)&h2.ip_dst;
pd2.ip_sum = &h2.ip_sum;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
ipoff2 = off + sizeof(struct icmp6_hdr);
if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMP error message too short "
"(ip6)\n"));
return (PF_DROP);
}
pd2.proto = h2_6.ip6_nxt;
pd2.src = (struct pf_addr *)&h2_6.ip6_src;
pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
pd2.ip_sum = NULL;
off2 = ipoff2 + sizeof(h2_6);
do {
switch (pd2.proto) {
case IPPROTO_FRAGMENT:
/*
* ICMPv6 error messages for
* non-first fragments
*/
REASON_SET(reason, PFRES_FRAG);
return (PF_DROP);
case IPPROTO_AH:
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING:
case IPPROTO_DSTOPTS: {
/* get next header and header length */
struct ip6_ext opt6;
if (!pf_pull_hdr(m, off2, &opt6,
sizeof(opt6), NULL, reason,
pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMPv6 short opt\n"));
return (PF_DROP);
}
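/*
 * AH's ip6e_len counts 32-bit words minus two, while the
 * other extension headers count 8-octet units not including
 * the first 8 octets, hence the two formulas below.
 */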
if (pd2.proto == IPPROTO_AH)
off2 += (opt6.ip6e_len + 2) * 4;
else
off2 += (opt6.ip6e_len + 1) * 8;
pd2.proto = opt6.ip6e_nxt;
/* go to the next header */
break;
}
default:
terminal++;
break;
}
} while (!terminal);
break;
#endif /* INET6 */
#ifdef __FreeBSD__
default:
panic("AF not supported: %d", pd->af);
#endif
}
switch (pd2.proto) {
case IPPROTO_TCP: {
struct tcphdr th;
u_int32_t seq;
struct pf_state_peer *src, *dst;
u_int8_t dws;
int copyback = 0;
/*
* Only the first 8 bytes of the TCP header can be
* expected. Don't access any TCP header fields after
* th_seq; an ackskew test is not possible.
*/
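/*
 * (RFC 792 only obliges the sender of an ICMP error to quote
 * the IP header plus the first 64 bits of the offending
 * datagram, i.e. just the TCP ports and sequence number.)
 */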
if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMP error message too short "
"(tcp)\n"));
return (PF_DROP);
}
key.af = pd2.af;
key.proto = IPPROTO_TCP;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd2.dst, key.af);
PF_ACPY(&key.gwy.addr, pd2.src, key.af);
key.ext.port = th.th_dport;
key.gwy.port = th.th_sport;
} else {
PF_ACPY(&key.lan.addr, pd2.dst, key.af);
PF_ACPY(&key.ext.addr, pd2.src, key.af);
key.lan.port = th.th_dport;
key.ext.port = th.th_sport;
}
STATE_LOOKUP();
if (direction == (*state)->direction) {
src = &(*state)->dst;
dst = &(*state)->src;
} else {
src = &(*state)->src;
dst = &(*state)->dst;
}
if (src->wscale && dst->wscale)
dws = dst->wscale & PF_WSCALE_MASK;
else
dws = 0;
/* Demodulate sequence number */
seq = ntohl(th.th_seq) - src->seqdiff;
if (src->seqdiff) {
pf_change_a(&th.th_seq, icmpsum,
htonl(seq), 0);
copyback = 1;
}
if (!SEQ_GEQ(src->seqhi, seq) ||
!SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))) {
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: BAD ICMP %d:%d ",
icmptype, pd->hdr.icmp->icmp_code);
pf_print_host(pd->src, 0, pd->af);
printf(" -> ");
pf_print_host(pd->dst, 0, pd->af);
printf(" state: ");
pf_print_state(*state);
printf(" seq=%u\n", seq);
}
REASON_SET(reason, PFRES_BADSTATE);
return (PF_DROP);
}
if (STATE_TRANSLATE(*state)) {
if (direction == PF_IN) {
pf_change_icmp(pd2.src, &th.th_sport,
daddr, &(*state)->lan.addr,
(*state)->lan.port, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, pd2.af);
} else {
pf_change_icmp(pd2.dst, &th.th_dport,
saddr, &(*state)->gwy.addr,
(*state)->gwy.port, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, pd2.af);
}
copyback = 1;
}
if (copyback) {
switch (pd2.af) {
#ifdef INET
case AF_INET:
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
m_copyback(m, ipoff2, sizeof(h2),
(caddr_t)&h2);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
m_copyback(m, off,
sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
m_copyback(m, ipoff2, sizeof(h2_6),
(caddr_t)&h2_6);
break;
#endif /* INET6 */
}
m_copyback(m, off2, 8, (caddr_t)&th);
}
return (PF_PASS);
break;
}
case IPPROTO_UDP: {
struct udphdr uh;
if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMP error message too short "
"(udp)\n"));
return (PF_DROP);
}
key.af = pd2.af;
key.proto = IPPROTO_UDP;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd2.dst, key.af);
PF_ACPY(&key.gwy.addr, pd2.src, key.af);
key.ext.port = uh.uh_dport;
key.gwy.port = uh.uh_sport;
} else {
PF_ACPY(&key.lan.addr, pd2.dst, key.af);
PF_ACPY(&key.ext.addr, pd2.src, key.af);
key.lan.port = uh.uh_dport;
key.ext.port = uh.uh_sport;
}
STATE_LOOKUP();
if (STATE_TRANSLATE(*state)) {
if (direction == PF_IN) {
pf_change_icmp(pd2.src, &uh.uh_sport,
daddr, &(*state)->lan.addr,
(*state)->lan.port, &uh.uh_sum,
pd2.ip_sum, icmpsum,
pd->ip_sum, 1, pd2.af);
} else {
pf_change_icmp(pd2.dst, &uh.uh_dport,
saddr, &(*state)->gwy.addr,
(*state)->gwy.port, &uh.uh_sum,
pd2.ip_sum, icmpsum,
pd->ip_sum, 1, pd2.af);
}
switch (pd2.af) {
#ifdef INET
case AF_INET:
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
m_copyback(m, ipoff2, sizeof(h2),
(caddr_t)&h2);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
m_copyback(m, off,
sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
m_copyback(m, ipoff2, sizeof(h2_6),
(caddr_t)&h2_6);
break;
#endif /* INET6 */
}
m_copyback(m, off2, sizeof(uh),
(caddr_t)&uh);
}
return (PF_PASS);
break;
}
#ifdef INET
case IPPROTO_ICMP: {
struct icmp iih;
if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMP error message too short i"
"(icmp)\n"));
return (PF_DROP);
}
key.af = pd2.af;
key.proto = IPPROTO_ICMP;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd2.dst, key.af);
PF_ACPY(&key.gwy.addr, pd2.src, key.af);
key.ext.port = 0;
key.gwy.port = iih.icmp_id;
} else {
PF_ACPY(&key.lan.addr, pd2.dst, key.af);
PF_ACPY(&key.ext.addr, pd2.src, key.af);
key.lan.port = iih.icmp_id;
key.ext.port = 0;
}
STATE_LOOKUP();
if (STATE_TRANSLATE(*state)) {
if (direction == PF_IN) {
pf_change_icmp(pd2.src, &iih.icmp_id,
daddr, &(*state)->lan.addr,
(*state)->lan.port, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, AF_INET);
} else {
pf_change_icmp(pd2.dst, &iih.icmp_id,
saddr, &(*state)->gwy.addr,
(*state)->gwy.port, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, AF_INET);
}
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
m_copyback(m, ipoff2, sizeof(h2),
(caddr_t)&h2);
m_copyback(m, off2, ICMP_MINLEN,
(caddr_t)&iih);
}
return (PF_PASS);
break;
}
#endif /* INET */
#ifdef INET6
case IPPROTO_ICMPV6: {
struct icmp6_hdr iih;
if (!pf_pull_hdr(m, off2, &iih,
sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: ICMP error message too short "
"(icmp6)\n"));
return (PF_DROP);
}
key.af = pd2.af;
key.proto = IPPROTO_ICMPV6;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd2.dst, key.af);
PF_ACPY(&key.gwy.addr, pd2.src, key.af);
key.ext.port = 0;
key.gwy.port = iih.icmp6_id;
} else {
PF_ACPY(&key.lan.addr, pd2.dst, key.af);
PF_ACPY(&key.ext.addr, pd2.src, key.af);
key.lan.port = iih.icmp6_id;
key.ext.port = 0;
}
STATE_LOOKUP();
if (STATE_TRANSLATE(*state)) {
if (direction == PF_IN) {
pf_change_icmp(pd2.src, &iih.icmp6_id,
daddr, &(*state)->lan.addr,
(*state)->lan.port, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, AF_INET6);
} else {
pf_change_icmp(pd2.dst, &iih.icmp6_id,
saddr, &(*state)->gwy.addr,
(*state)->gwy.port, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, AF_INET6);
}
m_copyback(m, off, sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
m_copyback(m, ipoff2, sizeof(h2_6),
(caddr_t)&h2_6);
m_copyback(m, off2, sizeof(struct icmp6_hdr),
(caddr_t)&iih);
}
return (PF_PASS);
break;
}
#endif /* INET6 */
default: {
key.af = pd2.af;
key.proto = pd2.proto;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd2.dst, key.af);
PF_ACPY(&key.gwy.addr, pd2.src, key.af);
key.ext.port = 0;
key.gwy.port = 0;
} else {
PF_ACPY(&key.lan.addr, pd2.dst, key.af);
PF_ACPY(&key.ext.addr, pd2.src, key.af);
key.lan.port = 0;
key.ext.port = 0;
}
STATE_LOOKUP();
if (STATE_TRANSLATE(*state)) {
if (direction == PF_IN) {
pf_change_icmp(pd2.src, NULL,
daddr, &(*state)->lan.addr,
0, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, pd2.af);
} else {
pf_change_icmp(pd2.dst, NULL,
saddr, &(*state)->gwy.addr,
0, NULL,
pd2.ip_sum, icmpsum,
pd->ip_sum, 0, pd2.af);
}
switch (pd2.af) {
#ifdef INET
case AF_INET:
m_copyback(m, off, ICMP_MINLEN,
(caddr_t)pd->hdr.icmp);
m_copyback(m, ipoff2, sizeof(h2),
(caddr_t)&h2);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
m_copyback(m, off,
sizeof(struct icmp6_hdr),
(caddr_t)pd->hdr.icmp6);
m_copyback(m, ipoff2, sizeof(h2_6),
(caddr_t)&h2_6);
break;
#endif /* INET6 */
}
}
return (PF_PASS);
break;
}
}
}
}
int
pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
struct pf_pdesc *pd)
{
struct pf_state_peer *src, *dst;
struct pf_state_cmp key;
key.af = pd->af;
key.proto = pd->proto;
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, pd->src, key.af);
PF_ACPY(&key.gwy.addr, pd->dst, key.af);
key.ext.port = 0;
key.gwy.port = 0;
} else {
PF_ACPY(&key.lan.addr, pd->src, key.af);
PF_ACPY(&key.ext.addr, pd->dst, key.af);
key.lan.port = 0;
key.ext.port = 0;
}
STATE_LOOKUP();
if (direction == (*state)->direction) {
src = &(*state)->src;
dst = &(*state)->dst;
} else {
src = &(*state)->dst;
dst = &(*state)->src;
}
/* update states */
if (src->state < PFOTHERS_SINGLE)
src->state = PFOTHERS_SINGLE;
if (dst->state == PFOTHERS_SINGLE)
dst->state = PFOTHERS_MULTIPLE;
/* update expire time */
(*state)->expire = time_second;
if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
(*state)->timeout = PFTM_OTHER_MULTIPLE;
else
(*state)->timeout = PFTM_OTHER_SINGLE;
/* translate source/destination address, if necessary */
if (STATE_TRANSLATE(*state)) {
if (direction == PF_OUT)
switch (pd->af) {
#ifdef INET
case AF_INET:
pf_change_a(&pd->src->v4.s_addr,
pd->ip_sum, (*state)->gwy.addr.v4.s_addr,
0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
PF_ACPY(pd->src, &(*state)->gwy.addr, pd->af);
break;
#endif /* INET6 */
}
else
switch (pd->af) {
#ifdef INET
case AF_INET:
pf_change_a(&pd->dst->v4.s_addr,
pd->ip_sum, (*state)->lan.addr.v4.s_addr,
0);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
PF_ACPY(pd->dst, &(*state)->lan.addr, pd->af);
break;
#endif /* INET6 */
}
}
return (PF_PASS);
}
/*
* ipoff and off are measured from the start of the mbuf chain.
* h must be at "ipoff" on the mbuf chain.
*/
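/*
 * On success the header is copied into p and p is returned; on
 * failure (short packet, or a fragment that does not cover the
 * requested header) NULL is returned and *actionp / *reasonp
 * are set accordingly.
 */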
void *
pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
u_short *actionp, u_short *reasonp, sa_family_t af)
{
switch (af) {
#ifdef INET
case AF_INET: {
struct ip *h = mtod(m, struct ip *);
u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
if (fragoff) {
if (fragoff >= len)
ACTION_SET(actionp, PF_PASS);
else {
ACTION_SET(actionp, PF_DROP);
REASON_SET(reasonp, PFRES_FRAG);
}
return (NULL);
}
if (m->m_pkthdr.len < off + len ||
ntohs(h->ip_len) < off + len) {
ACTION_SET(actionp, PF_DROP);
REASON_SET(reasonp, PFRES_SHORT);
return (NULL);
}
break;
}
#endif /* INET */
#ifdef INET6
case AF_INET6: {
struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
if (m->m_pkthdr.len < off + len ||
(ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
(unsigned)(off + len)) {
ACTION_SET(actionp, PF_DROP);
REASON_SET(reasonp, PFRES_SHORT);
return (NULL);
}
break;
}
#endif /* INET6 */
}
m_copydata(m, off, len, p);
return (p);
}
int
pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif)
{
struct sockaddr_in *dst;
int ret = 1;
int check_mpath;
#ifndef __FreeBSD__
extern int ipmultipath;
#endif
#ifdef INET6
#ifndef __FreeBSD__
extern int ip6_multipath;
#endif
struct sockaddr_in6 *dst6;
struct route_in6 ro;
#else
struct route ro;
#endif
struct radix_node *rn;
struct rtentry *rt;
struct ifnet *ifp;
check_mpath = 0;
bzero(&ro, sizeof(ro));
switch (af) {
case AF_INET:
dst = satosin(&ro.ro_dst);
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = addr->v4;
#ifndef __FreeBSD__ /* MULTIPATH_ROUTING */
if (ipmultipath)
check_mpath = 1;
#endif
break;
#ifdef INET6
case AF_INET6:
dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
dst6->sin6_family = AF_INET6;
dst6->sin6_len = sizeof(*dst6);
dst6->sin6_addr = addr->v6;
#ifndef __FreeBSD__ /* MULTIPATH_ROUTING */
if (ip6_multipath)
check_mpath = 1;
#endif
break;
#endif /* INET6 */
default:
return (0);
}
/* Skip checks for ipsec interfaces */
if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
goto out;
#ifdef __FreeBSD__
/* XXX MRT: not always INET; stick with table 0 though. */
if (af == AF_INET)
in_rtalloc_ign((struct route *)&ro, RTF_CLONING, 0);
else
rtalloc_ign((struct route *)&ro, RTF_CLONING);
#else /* ! __FreeBSD__ */
rtalloc_noclone((struct route *)&ro, NO_CLONING);
#endif
if (ro.ro_rt != NULL) {
/* No interface given, this is a no-route check */
if (kif == NULL)
goto out;
if (kif->pfik_ifp == NULL) {
ret = 0;
goto out;
}
/* Perform uRPF check if passed input interface */
ret = 0;
rn = (struct radix_node *)ro.ro_rt;
do {
rt = (struct rtentry *)rn;
#ifndef __FreeBSD__ /* CARPDEV */
if (rt->rt_ifp->if_type == IFT_CARP)
ifp = rt->rt_ifp->if_carpdev;
else
#endif
ifp = rt->rt_ifp;
if (kif->pfik_ifp == ifp)
ret = 1;
#ifdef __FreeBSD__ /* MULTIPATH_ROUTING */
rn = NULL;
#else
rn = rn_mpath_next(rn);
#endif
} while (check_mpath == 1 && rn != NULL && ret == 0);
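/*
 * ret is 1 only if at least one route to addr leaves via the
 * interface the packet arrived on (reverse-path check); with
 * multipath routing every candidate route is tried.
 */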
} else
ret = 0;
out:
if (ro.ro_rt != NULL)
RTFREE(ro.ro_rt);
return (ret);
}
int
pf_rtlabel_match(struct pf_addr *addr, sa_family_t af, struct pf_addr_wrap *aw)
{
struct sockaddr_in *dst;
#ifdef INET6
struct sockaddr_in6 *dst6;
struct route_in6 ro;
#else
struct route ro;
#endif
int ret = 0;
bzero(&ro, sizeof(ro));
switch (af) {
case AF_INET:
dst = satosin(&ro.ro_dst);
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = addr->v4;
break;
#ifdef INET6
case AF_INET6:
dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
dst6->sin6_family = AF_INET6;
dst6->sin6_len = sizeof(*dst6);
dst6->sin6_addr = addr->v6;
break;
#endif /* INET6 */
default:
return (0);
}
#ifdef __FreeBSD__
# ifdef RTF_PRCLONING
rtalloc_ign((struct route *)&ro, (RTF_CLONING|RTF_PRCLONING));
# else /* !RTF_PRCLONING */
if (af == AF_INET)
in_rtalloc_ign((struct route *)&ro, RTF_CLONING, 0);
else
rtalloc_ign((struct route *)&ro, RTF_CLONING);
# endif
#else /* ! __FreeBSD__ */
rtalloc_noclone((struct route *)&ro, NO_CLONING);
#endif
if (ro.ro_rt != NULL) {
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
#else
if (ro.ro_rt->rt_labelid == aw->v.rtlabel)
ret = 1;
#endif
RTFREE(ro.ro_rt);
}
return (ret);
}
#ifdef INET
void
pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
struct pf_state *s, struct pf_pdesc *pd)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *m0, *m1;
struct route iproute;
struct route *ro = NULL;
struct sockaddr_in *dst;
struct ip *ip;
struct ifnet *ifp = NULL;
struct pf_addr naddr;
struct pf_src_node *sn = NULL;
int error = 0;
#ifdef __FreeBSD__
int sw_csum;
#endif
#ifdef IPSEC
struct m_tag *mtag;
#endif /* IPSEC */
if (m == NULL || *m == NULL || r == NULL ||
(dir != PF_IN && dir != PF_OUT) || oifp == NULL)
panic("pf_route: invalid parameters");
if (pd->pf_mtag->routed++ > 3) {
m0 = *m;
*m = NULL;
goto bad;
}
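/*
 * dup-to duplicates the packet and lets the original continue on
 * its normal path; reply-to takes effect only on packets
 * traveling against the rule's direction, route-to only on
 * packets in the rule's direction.
 */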
if (r->rt == PF_DUPTO) {
#ifdef __FreeBSD__
if ((m0 = m_dup(*m, M_DONTWAIT)) == NULL)
#else
if ((m0 = m_copym2(*m, 0, M_COPYALL, M_NOWAIT)) == NULL)
#endif
return;
} else {
if ((r->rt == PF_REPLYTO) == (r->direction == dir))
return;
m0 = *m;
}
if (m0->m_len < sizeof(struct ip)) {
DPFPRINTF(PF_DEBUG_URGENT,
("pf_route: m0->m_len < sizeof(struct ip)\n"));
goto bad;
}
ip = mtod(m0, struct ip *);
ro = &iproute;
bzero((caddr_t)ro, sizeof(*ro));
dst = satosin(&ro->ro_dst);
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = ip->ip_dst;
if (r->rt == PF_FASTROUTE) {
in_rtalloc(ro, 0);
if (ro->ro_rt == 0) {
V_ipstat.ips_noroute++;
goto bad;
}
ifp = ro->ro_rt->rt_ifp;
ro->ro_rt->rt_use++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = satosin(ro->ro_rt->rt_gateway);
} else {
if (TAILQ_EMPTY(&r->rpool.list)) {
DPFPRINTF(PF_DEBUG_URGENT,
("pf_route: TAILQ_EMPTY(&r->rpool.list)\n"));
goto bad;
}
if (s == NULL) {
pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
&naddr, NULL, &sn);
if (!PF_AZERO(&naddr, AF_INET))
dst->sin_addr.s_addr = naddr.v4.s_addr;
ifp = r->rpool.cur->kif ?
r->rpool.cur->kif->pfik_ifp : NULL;
} else {
if (!PF_AZERO(&s->rt_addr, AF_INET))
dst->sin_addr.s_addr =
s->rt_addr.v4.s_addr;
ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
}
}
if (ifp == NULL)
goto bad;
if (oifp != ifp) {
#ifdef __FreeBSD__
PF_UNLOCK();
if (pf_test(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
PF_LOCK();
goto bad;
} else if (m0 == NULL) {
PF_LOCK();
goto done;
}
PF_LOCK();
#else
if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
goto bad;
else if (m0 == NULL)
goto done;
#endif
if (m0->m_len < sizeof(struct ip)) {
DPFPRINTF(PF_DEBUG_URGENT,
("pf_route: m0->m_len < sizeof(struct ip)\n"));
goto bad;
}
ip = mtod(m0, struct ip *);
}
#ifdef __FreeBSD__
/* Copied from FreeBSD 5.1-CURRENT ip_output. */
m0->m_pkthdr.csum_flags |= CSUM_IP;
sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
if (sw_csum & CSUM_DELAY_DATA) {
/*
* XXX: in_delayed_cksum assumes HBO for ip->ip_len (at least)
*/
NTOHS(ip->ip_len);
NTOHS(ip->ip_off); /* XXX: needed? */
in_delayed_cksum(m0);
HTONS(ip->ip_len);
HTONS(ip->ip_off);
sw_csum &= ~CSUM_DELAY_DATA;
}
m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
if (ntohs(ip->ip_len) <= ifp->if_mtu ||
(ifp->if_hwassist & CSUM_FRAGMENT &&
((ip->ip_off & htons(IP_DF)) == 0))) {
/*
* ip->ip_len = htons(ip->ip_len);
* ip->ip_off = htons(ip->ip_off);
*/
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP) {
/* From KAME */
if (ip->ip_v == IPVERSION &&
(ip->ip_hl << 2) == sizeof(*ip)) {
ip->ip_sum = in_cksum_hdr(ip);
} else {
ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
}
}
PF_UNLOCK();
error = (*ifp->if_output)(ifp, m0, sintosa(dst), ro->ro_rt);
PF_LOCK();
goto done;
}
#else
/* Copied from ip_output. */
#ifdef IPSEC
/*
* If deferred crypto processing is needed, check that the
* interface supports it.
*/
if ((mtag = m_tag_find(m0, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL))
!= NULL && (ifp->if_capabilities & IFCAP_IPSEC) == 0) {
/* Notify IPsec to do its own crypto. */
ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
goto bad;
}
#endif /* IPSEC */
/* Catch routing changes wrt. hardware checksumming for TCP or UDP. */
if (m0->m_pkthdr.csum_flags & M_TCPV4_CSUM_OUT) {
if (!(ifp->if_capabilities & IFCAP_CSUM_TCPv4) ||
ifp->if_bridge != NULL) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~M_TCPV4_CSUM_OUT; /* Clear */
}
} else if (m0->m_pkthdr.csum_flags & M_UDPV4_CSUM_OUT) {
if (!(ifp->if_capabilities & IFCAP_CSUM_UDPv4) ||
ifp->if_bridge != NULL) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~M_UDPV4_CSUM_OUT; /* Clear */
}
}
if (ntohs(ip->ip_len) <= ifp->if_mtu) {
if ((ifp->if_capabilities & IFCAP_CSUM_IPv4) &&
ifp->if_bridge == NULL) {
m0->m_pkthdr.csum_flags |= M_IPV4_CSUM_OUT;
V_ipstat.ips_outhwcsum++;
} else {
ip->ip_sum = 0;
ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
}
/* Update relevant hardware checksum stats for TCP/UDP */
if (m0->m_pkthdr.csum_flags & M_TCPV4_CSUM_OUT)
V_tcpstat.tcps_outhwcsum++;
else if (m0->m_pkthdr.csum_flags & M_UDPV4_CSUM_OUT)
V_udpstat.udps_outhwcsum++;
error = (*ifp->if_output)(ifp, m0, sintosa(dst), NULL);
goto done;
}
#endif
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
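/*
 * If DF is set the packet cannot be fragmented; it is answered
 * (unless it is a dup-to copy) with ICMP_UNREACH_NEEDFRAG
 * carrying the interface MTU, which is what path MTU discovery
 * relies on.
 */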
if (ip->ip_off & htons(IP_DF)) {
V_ipstat.ips_cantfrag++;
if (r->rt != PF_DUPTO) {
#ifdef __FreeBSD__
/* icmp_error() expects host byte ordering */
NTOHS(ip->ip_len);
NTOHS(ip->ip_off);
PF_UNLOCK();
icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
ifp->if_mtu);
PF_LOCK();
#else
icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
ifp->if_mtu);
#endif
goto done;
} else
goto bad;
}
m1 = m0;
#ifdef __FreeBSD__
/*
* XXX: reusing ip_fragment() is cheaper and less error-prone than
* our own function
*/
NTOHS(ip->ip_len);
NTOHS(ip->ip_off);
error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
#else
error = ip_fragment(m0, ifp, ifp->if_mtu);
#endif
if (error) {
#ifndef __FreeBSD__ /* ip_fragment does not do m_freem() on FreeBSD */
m0 = NULL;
#endif
goto bad;
}
for (m0 = m1; m0; m0 = m1) {
m1 = m0->m_nextpkt;
m0->m_nextpkt = 0;
#ifdef __FreeBSD__
if (error == 0) {
PF_UNLOCK();
error = (*ifp->if_output)(ifp, m0, sintosa(dst),
NULL);
PF_LOCK();
} else
#else
if (error == 0)
error = (*ifp->if_output)(ifp, m0, sintosa(dst),
NULL);
else
#endif
m_freem(m0);
}
if (error == 0)
V_ipstat.ips_fragmented++;
done:
if (r->rt != PF_DUPTO)
*m = NULL;
if (ro == &iproute && ro->ro_rt)
RTFREE(ro->ro_rt);
return;
bad:
m_freem(m0);
goto done;
}
#endif /* INET */
#ifdef INET6
void
pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
struct pf_state *s, struct pf_pdesc *pd)
{
struct mbuf *m0;
struct route_in6 ip6route;
struct route_in6 *ro;
struct sockaddr_in6 *dst;
struct ip6_hdr *ip6;
struct ifnet *ifp = NULL;
struct pf_addr naddr;
struct pf_src_node *sn = NULL;
int error = 0;
if (m == NULL || *m == NULL || r == NULL ||
(dir != PF_IN && dir != PF_OUT) || oifp == NULL)
panic("pf_route6: invalid parameters");
if (pd->pf_mtag->routed++ > 3) {
m0 = *m;
*m = NULL;
goto bad;
}
if (r->rt == PF_DUPTO) {
#ifdef __FreeBSD__
if ((m0 = m_dup(*m, M_DONTWAIT)) == NULL)
#else
if ((m0 = m_copym2(*m, 0, M_COPYALL, M_NOWAIT)) == NULL)
#endif
return;
} else {
if ((r->rt == PF_REPLYTO) == (r->direction == dir))
return;
m0 = *m;
}
if (m0->m_len < sizeof(struct ip6_hdr)) {
DPFPRINTF(PF_DEBUG_URGENT,
("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
goto bad;
}
ip6 = mtod(m0, struct ip6_hdr *);
ro = &ip6route;
bzero((caddr_t)ro, sizeof(*ro));
dst = (struct sockaddr_in6 *)&ro->ro_dst;
dst->sin6_family = AF_INET6;
dst->sin6_len = sizeof(*dst);
dst->sin6_addr = ip6->ip6_dst;
/* Cheat. XXX why only in the v6 case??? */
if (r->rt == PF_FASTROUTE) {
#ifdef __FreeBSD__
m0->m_flags |= M_SKIP_FIREWALL;
PF_UNLOCK();
ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
PF_LOCK();
#else
mtag = m_tag_get(PACKET_TAG_PF_GENERATED, 0, M_NOWAIT);
if (mtag == NULL)
goto bad;
m_tag_prepend(m0, mtag);
pd->pf_mtag->flags |= PF_TAG_GENERATED;
ip6_output(m0, NULL, NULL, 0, NULL, NULL);
#endif
return;
}
if (TAILQ_EMPTY(&r->rpool.list)) {
DPFPRINTF(PF_DEBUG_URGENT,
("pf_route6: TAILQ_EMPTY(&r->rpool.list)\n"));
goto bad;
}
if (s == NULL) {
pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
&naddr, NULL, &sn);
if (!PF_AZERO(&naddr, AF_INET6))
PF_ACPY((struct pf_addr *)&dst->sin6_addr,
&naddr, AF_INET6);
ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
} else {
if (!PF_AZERO(&s->rt_addr, AF_INET6))
PF_ACPY((struct pf_addr *)&dst->sin6_addr,
&s->rt_addr, AF_INET6);
ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
}
if (ifp == NULL)
goto bad;
if (oifp != ifp) {
#ifdef __FreeBSD__
PF_UNLOCK();
if (pf_test6(PF_OUT, ifp, &m0, NULL, NULL) != PF_PASS) {
PF_LOCK();
goto bad;
} else if (m0 == NULL) {
PF_LOCK();
goto done;
}
PF_LOCK();
#else
if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
goto bad;
else if (m0 == NULL)
goto done;
#endif
if (m0->m_len < sizeof(struct ip6_hdr)) {
DPFPRINTF(PF_DEBUG_URGENT,
("pf_route6: m0->m_len < sizeof(struct ip6_hdr)\n"));
goto bad;
}
ip6 = mtod(m0, struct ip6_hdr *);
}
/*
* If the packet is too large for the outgoing interface,
* send back an icmp6 error.
*/
if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr))
dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index);
if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
error = nd6_output(ifp, ifp, m0, dst, NULL);
#ifdef __FreeBSD__
PF_LOCK();
#endif
} else {
in6_ifstat_inc(ifp, ifs6_in_toobig);
#ifdef __FreeBSD__
if (r->rt != PF_DUPTO) {
PF_UNLOCK();
icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
PF_LOCK();
} else
#else
if (r->rt != PF_DUPTO)
icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
else
#endif
goto bad;
}
done:
if (r->rt != PF_DUPTO)
*m = NULL;
return;
bad:
m_freem(m0);
goto done;
}
#endif /* INET6 */
#ifdef __FreeBSD__
/*
* FreeBSD supports cksum offloads for the following drivers.
* em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
* ti(4), txp(4), xl(4)
*
* CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
* the network driver performed the cksum including the pseudo header;
* we only need to verify csum_data
* CSUM_DATA_VALID :
* the network driver performed the cksum, but the pseudo header still
* needs to be folded into the partial csum_data (i.e. the hardware
* lacks pseudo header support, for instance hme(4), sk(4) and
* possibly gem(4))
*
* After validating a packet's cksum, set both CSUM_DATA_VALID and
* CSUM_PSEUDO_HDR so that the upper TCP/UDP layer does not recompute
* the cksum.
* Also, set csum_data to 0xffff to force cksum validation.
*/
int
pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
{
u_int16_t sum = 0;
int hw_assist = 0;
struct ip *ip;
if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
return (1);
if (m->m_pkthdr.len < off + len)
return (1);
switch (p) {
case IPPROTO_TCP:
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
sum = m->m_pkthdr.csum_data;
} else {
ip = mtod(m, struct ip *);
sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htonl((u_short)len +
m->m_pkthdr.csum_data + IPPROTO_TCP));
}
sum ^= 0xffff;
++hw_assist;
}
break;
case IPPROTO_UDP:
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
sum = m->m_pkthdr.csum_data;
} else {
ip = mtod(m, struct ip *);
sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htonl((u_short)len +
m->m_pkthdr.csum_data + IPPROTO_UDP));
}
sum ^= 0xffff;
++hw_assist;
}
break;
case IPPROTO_ICMP:
#ifdef INET6
case IPPROTO_ICMPV6:
#endif /* INET6 */
break;
default:
return (1);
}
if (!hw_assist) {
switch (af) {
case AF_INET:
if (p == IPPROTO_ICMP) {
if (m->m_len < off)
return (1);
m->m_data += off;
m->m_len -= off;
sum = in_cksum(m, len);
m->m_data -= off;
m->m_len += off;
} else {
if (m->m_len < sizeof(struct ip))
return (1);
sum = in4_cksum(m, p, off, len);
}
break;
#ifdef INET6
case AF_INET6:
if (m->m_len < sizeof(struct ip6_hdr))
return (1);
sum = in6_cksum(m, p, off, len);
break;
#endif /* INET6 */
default:
return (1);
}
}
if (sum) {
switch (p) {
case IPPROTO_TCP:
+ {
+ INIT_VNET_INET(curvnet);
V_tcpstat.tcps_rcvbadsum++;
break;
+ }
case IPPROTO_UDP:
+ {
+ INIT_VNET_INET(curvnet);
V_udpstat.udps_badsum++;
break;
+ }
case IPPROTO_ICMP:
+ {
+ INIT_VNET_INET(curvnet);
V_icmpstat.icps_checksum++;
break;
+ }
#ifdef INET6
case IPPROTO_ICMPV6:
+ {
+ INIT_VNET_INET6(curvnet);
V_icmp6stat.icp6s_checksum++;
break;
+ }
#endif /* INET6 */
}
return (1);
} else {
if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
m->m_pkthdr.csum_flags |=
(CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
m->m_pkthdr.csum_data = 0xffff;
}
}
return (0);
}
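/*
* Illustrative sketch, not part of the original source: how the two
* CSUM_DATA_VALID cases documented above fold into a single verdict.
* The helper name pf_fold_hw_cksum is hypothetical.
*/
#if 0
static __inline u_int16_t
pf_fold_hw_cksum(struct mbuf *m, struct ip *ip, int len, u_int8_t p)
{
u_int16_t sum;
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
/* driver checksummed payload and pseudo header */
sum = m->m_pkthdr.csum_data;
else
/* driver left a partial sum; fold in the pseudo header */
sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htonl((u_short)len + m->m_pkthdr.csum_data + p));
return (sum ^ 0xffff); /* 0 means the checksum verified */
}
#endif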
#else /* !__FreeBSD__ */
/*
* check protocol (tcp/udp/icmp/icmp6) checksum and set mbuf flag
* off is the offset where the protocol header starts
* len is the total length of protocol header plus payload
* returns 0 when the checksum is valid, otherwise returns 1.
*/
int
pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p,
sa_family_t af)
{
u_int16_t flag_ok, flag_bad;
u_int16_t sum;
switch (p) {
case IPPROTO_TCP:
flag_ok = M_TCP_CSUM_IN_OK;
flag_bad = M_TCP_CSUM_IN_BAD;
break;
case IPPROTO_UDP:
flag_ok = M_UDP_CSUM_IN_OK;
flag_bad = M_UDP_CSUM_IN_BAD;
break;
case IPPROTO_ICMP:
#ifdef INET6
case IPPROTO_ICMPV6:
#endif /* INET6 */
flag_ok = flag_bad = 0;
break;
default:
return (1);
}
if (m->m_pkthdr.csum_flags & flag_ok)
return (0);
if (m->m_pkthdr.csum_flags & flag_bad)
return (1);
if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
return (1);
if (m->m_pkthdr.len < off + len)
return (1);
switch (af) {
#ifdef INET
case AF_INET:
if (p == IPPROTO_ICMP) {
if (m->m_len < off)
return (1);
m->m_data += off;
m->m_len -= off;
sum = in_cksum(m, len);
m->m_data -= off;
m->m_len += off;
} else {
if (m->m_len < sizeof(struct ip))
return (1);
sum = in4_cksum(m, p, off, len);
}
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (m->m_len < sizeof(struct ip6_hdr))
return (1);
sum = in6_cksum(m, p, off, len);
break;
#endif /* INET6 */
default:
return (1);
}
if (sum) {
m->m_pkthdr.csum_flags |= flag_bad;
switch (p) {
case IPPROTO_TCP:
V_tcpstat.tcps_rcvbadsum++;
break;
case IPPROTO_UDP:
V_udpstat.udps_badsum++;
break;
case IPPROTO_ICMP:
V_icmpstat.icps_checksum++;
break;
#ifdef INET6
case IPPROTO_ICMPV6:
V_icmp6stat.icp6s_checksum++;
break;
#endif /* INET6 */
}
return (1);
}
m->m_pkthdr.csum_flags |= flag_ok;
return (0);
}
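/*
* Illustrative sketch, not part of the original source: the OpenBSD
* variant above caches its verdict in the mbuf, so a second check of
* the same packet is free. Flag names as defined for this function.
*/
#if 0
/* first call computes the sum and latches M_TCP_CSUM_IN_OK or _BAD */
(void)pf_check_proto_cksum(m, off, len, IPPROTO_TCP, AF_INET);
/* second call returns immediately from the cached flag */
(void)pf_check_proto_cksum(m, off, len, IPPROTO_TCP, AF_INET);
#endif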
#endif /* __FreeBSD__ */
#ifdef INET
int
#ifdef __FreeBSD__
pf_test(int dir, struct ifnet *ifp, struct mbuf **m0,
struct ether_header *eh, struct inpcb *inp)
#else
pf_test(int dir, struct ifnet *ifp, struct mbuf **m0,
struct ether_header *eh)
#endif
{
struct pfi_kif *kif;
u_short action, reason = 0, log = 0;
struct mbuf *m = *m0;
struct ip *h = NULL; /* make the compiler happy */
struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr;
struct pf_state *s = NULL;
struct pf_ruleset *ruleset = NULL;
struct pf_pdesc pd;
int off, dirndx, pqid = 0;
#ifdef __FreeBSD__
PF_LOCK();
#endif
if (!pf_status.running)
#ifdef __FreeBSD__
{
PF_UNLOCK();
#endif
return (PF_PASS);
#ifdef __FreeBSD__
}
#endif
memset(&pd, 0, sizeof(pd));
if ((pd.pf_mtag = pf_get_mtag(m)) == NULL) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
DPFPRINTF(PF_DEBUG_URGENT,
("pf_test: pf_get_mtag returned NULL\n"));
return (PF_DROP);
}
#ifdef __FreeBSD__
if (m->m_flags & M_SKIP_FIREWALL) {
PF_UNLOCK();
return (PF_PASS);
}
#else
if (pd.pf_mtag->flags & PF_TAG_GENERATED)
return (PF_PASS);
#endif
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
#else
if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
ifp = ifp->if_carpdev;
#endif
kif = (struct pfi_kif *)ifp->if_pf_kif;
if (kif == NULL) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
DPFPRINTF(PF_DEBUG_URGENT,
("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
return (PF_DROP);
}
if (kif->pfik_flags & PFI_IFLAG_SKIP) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
return (PF_PASS);
}
#ifdef __FreeBSD__
M_ASSERTPKTHDR(m);
#else
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("non-M_PKTHDR is passed to pf_test");
#endif /* DIAGNOSTIC */
#endif /* __FreeBSD__ */
if (m->m_pkthdr.len < (int)sizeof(*h)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_SHORT);
log = 1;
goto done;
}
/* We do IP header normalization and packet reassembly here */
if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
action = PF_DROP;
goto done;
}
m = *m0;
h = mtod(m, struct ip *);
off = h->ip_hl << 2;
if (off < (int)sizeof(*h)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_SHORT);
log = 1;
goto done;
}
pd.src = (struct pf_addr *)&h->ip_src;
pd.dst = (struct pf_addr *)&h->ip_dst;
PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET);
pd.ip_sum = &h->ip_sum;
pd.proto = h->ip_p;
pd.af = AF_INET;
pd.tos = h->ip_tos;
pd.tot_len = ntohs(h->ip_len);
pd.eh = eh;
/* handle fragments that didn't get reassembled by normalization */
if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
action = pf_test_fragment(&r, dir, kif, m, h,
&pd, &a, &ruleset);
goto done;
}
switch (h->ip_p) {
case IPPROTO_TCP: {
struct tcphdr th;
pd.hdr.tcp = &th;
if (!pf_pull_hdr(m, off, &th, sizeof(th),
&action, &reason, AF_INET)) {
log = action != PF_PASS;
goto done;
}
if (dir == PF_IN && pf_check_proto_cksum(m, off,
ntohs(h->ip_len) - off, IPPROTO_TCP, AF_INET)) {
REASON_SET(&reason, PFRES_PROTCKSUM);
action = PF_DROP;
goto done;
}
pd.p_len = pd.tot_len - off - (th.th_off << 2);
if ((th.th_flags & TH_ACK) && pd.p_len == 0)
pqid = 1;
action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
if (action == PF_DROP)
goto done;
action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
&reason);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_tcp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, NULL, inp);
#else
action = pf_test_tcp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, &ipintrq);
#endif
break;
}
case IPPROTO_UDP: {
struct udphdr uh;
pd.hdr.udp = &uh;
if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
&action, &reason, AF_INET)) {
log = action != PF_PASS;
goto done;
}
if (dir == PF_IN && uh.uh_sum && pf_check_proto_cksum(m,
off, ntohs(h->ip_len) - off, IPPROTO_UDP, AF_INET)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_PROTCKSUM);
goto done;
}
if (uh.uh_dport == 0 ||
ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_SHORT);
goto done;
}
action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_udp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, NULL, inp);
#else
action = pf_test_udp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, &ipintrq);
#endif
break;
}
case IPPROTO_ICMP: {
struct icmp ih;
pd.hdr.icmp = &ih;
if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
&action, &reason, AF_INET)) {
log = action != PF_PASS;
goto done;
}
if (dir == PF_IN && pf_check_proto_cksum(m, off,
ntohs(h->ip_len) - off, IPPROTO_ICMP, AF_INET)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_PROTCKSUM);
goto done;
}
action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
&reason);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_icmp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, NULL);
#else
action = pf_test_icmp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, &ipintrq);
#endif
break;
}
default:
action = pf_test_state_other(&s, dir, kif, &pd);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_other(&r, &s, dir, kif, m, off, h,
&pd, &a, &ruleset, NULL);
#else
action = pf_test_other(&r, &s, dir, kif, m, off, h,
&pd, &a, &ruleset, &ipintrq);
#endif
break;
}
done:
if (action == PF_PASS && h->ip_hl > 5 &&
!((s && s->allow_opts) || r->allow_opts)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
log = 1;
DPFPRINTF(PF_DEBUG_MISC,
("pf: dropping packet with ip options\n"));
}
if ((s && s->tag) || r->rtableid)
pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, r->rtableid);
#ifdef ALTQ
if (action == PF_PASS && r->qid) {
if (pqid || (pd.tos & IPTOS_LOWDELAY))
pd.pf_mtag->qid = r->pqid;
else
pd.pf_mtag->qid = r->qid;
/* add hints for ecn */
pd.pf_mtag->af = AF_INET;
pd.pf_mtag->hdr = h;
}
#endif /* ALTQ */
/*
* connections redirected to loopback should not match sockets
* bound specifically to loopback due to security implications,
* see tcp_input() and in_pcblookup_listen().
*/
if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
(s->nat_rule.ptr->action == PF_RDR ||
s->nat_rule.ptr->action == PF_BINAT) &&
(ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST;
if (log) {
struct pf_rule *lr;
if (s != NULL && s->nat_rule.ptr != NULL &&
s->nat_rule.ptr->log & PF_LOG_ALL)
lr = s->nat_rule.ptr;
else
lr = r;
PFLOG_PACKET(kif, h, m, AF_INET, dir, reason, lr, a, ruleset,
&pd);
}
kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
if (action == PF_PASS || r->action == PF_DROP) {
dirndx = (dir == PF_OUT);
r->packets[dirndx]++;
r->bytes[dirndx] += pd.tot_len;
if (a != NULL) {
a->packets[dirndx]++;
a->bytes[dirndx] += pd.tot_len;
}
if (s != NULL) {
if (s->nat_rule.ptr != NULL) {
s->nat_rule.ptr->packets[dirndx]++;
s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
}
if (s->src_node != NULL) {
s->src_node->packets[dirndx]++;
s->src_node->bytes[dirndx] += pd.tot_len;
}
if (s->nat_src_node != NULL) {
s->nat_src_node->packets[dirndx]++;
s->nat_src_node->bytes[dirndx] += pd.tot_len;
}
dirndx = (dir == s->direction) ? 0 : 1;
s->packets[dirndx]++;
s->bytes[dirndx] += pd.tot_len;
}
tr = r;
nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
if (nr != NULL) {
struct pf_addr *x;
/*
* XXX: we need to make sure that the addresses
* passed to pfr_update_stats() are the same as
* the addresses used during matching (pfr_match)
*/
if (r == &pf_default_rule) {
tr = nr;
x = (s == NULL || s->direction == dir) ?
&pd.baddr : &pd.naddr;
} else
x = (s == NULL || s->direction == dir) ?
&pd.naddr : &pd.baddr;
if (x == &pd.baddr || s == NULL) {
/* we need to change the address */
if (dir == PF_OUT)
pd.src = x;
else
pd.dst = x;
}
}
if (tr->src.addr.type == PF_ADDR_TABLE)
pfr_update_stats(tr->src.addr.p.tbl, (s == NULL ||
s->direction == dir) ? pd.src : pd.dst, pd.af,
pd.tot_len, dir == PF_OUT, r->action == PF_PASS,
tr->src.neg);
if (tr->dst.addr.type == PF_ADDR_TABLE)
pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL ||
s->direction == dir) ? pd.dst : pd.src, pd.af,
pd.tot_len, dir == PF_OUT, r->action == PF_PASS,
tr->dst.neg);
}
if (action == PF_SYNPROXY_DROP) {
m_freem(*m0);
*m0 = NULL;
action = PF_PASS;
} else if (r->rt)
/* pf_route can free the mbuf causing *m0 to become NULL */
pf_route(m0, r, dir, ifp, s, &pd);
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
return (action);
}
#endif /* INET */
#ifdef INET6
int
#ifdef __FreeBSD__
pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0,
struct ether_header *eh, struct inpcb *inp)
#else
pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0,
struct ether_header *eh)
#endif
{
struct pfi_kif *kif;
u_short action, reason = 0, log = 0;
struct mbuf *m = *m0, *n = NULL;
struct ip6_hdr *h;
struct pf_rule *a = NULL, *r = &pf_default_rule, *tr, *nr;
struct pf_state *s = NULL;
struct pf_ruleset *ruleset = NULL;
struct pf_pdesc pd;
int off, terminal = 0, dirndx, rh_cnt = 0;
#ifdef __FreeBSD__
PF_LOCK();
#endif
if (!pf_status.running)
#ifdef __FreeBSD__
{
PF_UNLOCK();
#endif
return (PF_PASS);
#ifdef __FreeBSD__
}
#endif
memset(&pd, 0, sizeof(pd));
if ((pd.pf_mtag = pf_get_mtag(m)) == NULL) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
DPFPRINTF(PF_DEBUG_URGENT,
("pf_test6: pf_get_mtag returned NULL\n"));
return (PF_DROP);
}
if (pd.pf_mtag->flags & PF_TAG_GENERATED)
return (PF_PASS);
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
#else
if (ifp->if_type == IFT_CARP && ifp->if_carpdev)
ifp = ifp->if_carpdev;
#endif
kif = (struct pfi_kif *)ifp->if_pf_kif;
if (kif == NULL) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
DPFPRINTF(PF_DEBUG_URGENT,
("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
return (PF_DROP);
}
if (kif->pfik_flags & PFI_IFLAG_SKIP) {
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
return (PF_PASS);
}
#ifdef __FreeBSD__
M_ASSERTPKTHDR(m);
#else
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("non-M_PKTHDR is passed to pf_test6");
#endif /* DIAGNOSTIC */
#endif
#ifdef __FreeBSD__
h = NULL; /* make the compiler happy */
#endif
if (m->m_pkthdr.len < (int)sizeof(*h)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_SHORT);
log = 1;
goto done;
}
/* We do IP header normalization and packet reassembly here */
if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
action = PF_DROP;
goto done;
}
m = *m0;
h = mtod(m, struct ip6_hdr *);
#if 1
/*
* we do not support jumbograms yet. If we keep going, a zero ip6_plen
* will do something bad, so drop the packet for now.
*/
if (htons(h->ip6_plen) == 0) {
action = PF_DROP;
REASON_SET(&reason, PFRES_NORM); /*XXX*/
goto done;
}
#endif
pd.src = (struct pf_addr *)&h->ip6_src;
pd.dst = (struct pf_addr *)&h->ip6_dst;
PF_ACPY(&pd.baddr, dir == PF_OUT ? pd.src : pd.dst, AF_INET6);
pd.ip_sum = NULL;
pd.af = AF_INET6;
pd.tos = 0;
pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
pd.eh = eh;
off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
pd.proto = h->ip6_nxt;
do {
switch (pd.proto) {
case IPPROTO_FRAGMENT:
action = pf_test_fragment(&r, dir, kif, m, h,
&pd, &a, &ruleset);
if (action == PF_DROP)
REASON_SET(&reason, PFRES_FRAG);
goto done;
case IPPROTO_ROUTING: {
struct ip6_rthdr rthdr;
if (rh_cnt++) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: IPv6 more than one rthdr\n"));
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
log = 1;
goto done;
}
if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
&reason, pd.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: IPv6 short rthdr\n"));
action = PF_DROP;
REASON_SET(&reason, PFRES_SHORT);
log = 1;
goto done;
}
if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: IPv6 rthdr0\n"));
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
log = 1;
goto done;
}
/* fallthrough */
}
case IPPROTO_AH:
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS: {
/* get next header and header length */
struct ip6_ext opt6;
if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
NULL, &reason, pd.af)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: IPv6 short opt\n"));
action = PF_DROP;
log = 1;
goto done;
}
if (pd.proto == IPPROTO_AH)
off += (opt6.ip6e_len + 2) * 4;
else
off += (opt6.ip6e_len + 1) * 8;
pd.proto = opt6.ip6e_nxt;
/* go to the next header */
break;
}
default:
terminal++;
break;
}
} while (!terminal);
/* if there's no routing header, use unmodified mbuf for checksumming */
if (!n)
n = m;
switch (pd.proto) {
case IPPROTO_TCP: {
struct tcphdr th;
pd.hdr.tcp = &th;
if (!pf_pull_hdr(m, off, &th, sizeof(th),
&action, &reason, AF_INET6)) {
log = action != PF_PASS;
goto done;
}
if (dir == PF_IN && pf_check_proto_cksum(n, off,
ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)),
IPPROTO_TCP, AF_INET6)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_PROTCKSUM);
goto done;
}
pd.p_len = pd.tot_len - off - (th.th_off << 2);
action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
if (action == PF_DROP)
goto done;
action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
&reason);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_tcp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, NULL, inp);
#else
action = pf_test_tcp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, &ip6intrq);
#endif
break;
}
case IPPROTO_UDP: {
struct udphdr uh;
pd.hdr.udp = &uh;
if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
&action, &reason, AF_INET6)) {
log = action != PF_PASS;
goto done;
}
if (dir == PF_IN && uh.uh_sum && pf_check_proto_cksum(n,
off, ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)),
IPPROTO_UDP, AF_INET6)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_PROTCKSUM);
goto done;
}
if (uh.uh_dport == 0 ||
ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_SHORT);
goto done;
}
action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_udp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, NULL, inp);
#else
action = pf_test_udp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, &ip6intrq);
#endif
break;
}
case IPPROTO_ICMPV6: {
struct icmp6_hdr ih;
pd.hdr.icmp6 = &ih;
if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
&action, &reason, AF_INET6)) {
log = action != PF_PASS;
goto done;
}
if (dir == PF_IN && pf_check_proto_cksum(n, off,
ntohs(h->ip6_plen) - (off - sizeof(struct ip6_hdr)),
IPPROTO_ICMPV6, AF_INET6)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_PROTCKSUM);
goto done;
}
action = pf_test_state_icmp(&s, dir, kif,
m, off, h, &pd, &reason);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_icmp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, NULL);
#else
action = pf_test_icmp(&r, &s, dir, kif,
m, off, h, &pd, &a, &ruleset, &ip6intrq);
#endif
break;
}
default:
action = pf_test_state_other(&s, dir, kif, &pd);
if (action == PF_PASS) {
#if NPFSYNC
pfsync_update_state(s);
#endif /* NPFSYNC */
r = s->rule.ptr;
a = s->anchor.ptr;
log = s->log;
} else if (s == NULL)
#ifdef __FreeBSD__
action = pf_test_other(&r, &s, dir, kif, m, off, h,
&pd, &a, &ruleset, NULL);
#else
action = pf_test_other(&r, &s, dir, kif, m, off, h,
&pd, &a, &ruleset, &ip6intrq);
#endif
break;
}
done:
/* handle dangerous IPv6 extension headers. */
if (action == PF_PASS && rh_cnt &&
!((s && s->allow_opts) || r->allow_opts)) {
action = PF_DROP;
REASON_SET(&reason, PFRES_IPOPTIONS);
log = 1;
DPFPRINTF(PF_DEBUG_MISC,
("pf: dropping packet with dangerous v6 headers\n"));
}
if ((s && s->tag) || r->rtableid)
pf_tag_packet(m, pd.pf_mtag, s ? s->tag : 0, r->rtableid);
#ifdef ALTQ
if (action == PF_PASS && r->qid) {
if (pd.tos & IPTOS_LOWDELAY)
pd.pf_mtag->qid = r->pqid;
else
pd.pf_mtag->qid = r->qid;
/* add hints for ecn */
pd.pf_mtag->af = AF_INET6;
pd.pf_mtag->hdr = h;
}
#endif /* ALTQ */
if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
(s->nat_rule.ptr->action == PF_RDR ||
s->nat_rule.ptr->action == PF_BINAT) &&
IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
pd.pf_mtag->flags |= PF_TAG_TRANSLATE_LOCALHOST;
if (log) {
struct pf_rule *lr;
if (s != NULL && s->nat_rule.ptr != NULL &&
s->nat_rule.ptr->log & PF_LOG_ALL)
lr = s->nat_rule.ptr;
else
lr = r;
PFLOG_PACKET(kif, h, m, AF_INET6, dir, reason, lr, a, ruleset,
&pd);
}
kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
if (action == PF_PASS || r->action == PF_DROP) {
dirndx = (dir == PF_OUT);
r->packets[dirndx]++;
r->bytes[dirndx] += pd.tot_len;
if (a != NULL) {
a->packets[dirndx]++;
a->bytes[dirndx] += pd.tot_len;
}
if (s != NULL) {
if (s->nat_rule.ptr != NULL) {
s->nat_rule.ptr->packets[dirndx]++;
s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
}
if (s->src_node != NULL) {
s->src_node->packets[dirndx]++;
s->src_node->bytes[dirndx] += pd.tot_len;
}
if (s->nat_src_node != NULL) {
s->nat_src_node->packets[dirndx]++;
s->nat_src_node->bytes[dirndx] += pd.tot_len;
}
dirndx = (dir == s->direction) ? 0 : 1;
s->packets[dirndx]++;
s->bytes[dirndx] += pd.tot_len;
}
tr = r;
nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
if (nr != NULL) {
struct pf_addr *x;
/*
* XXX: we need to make sure that the addresses
* passed to pfr_update_stats() are the same as
* the addresses used during matching (pfr_match)
*/
if (r == &pf_default_rule) {
tr = nr;
x = (s == NULL || s->direction == dir) ?
&pd.baddr : &pd.naddr;
} else {
x = (s == NULL || s->direction == dir) ?
&pd.naddr : &pd.baddr;
}
if (x == &pd.baddr || s == NULL) {
if (dir == PF_OUT)
pd.src = x;
else
pd.dst = x;
}
}
if (tr->src.addr.type == PF_ADDR_TABLE)
pfr_update_stats(tr->src.addr.p.tbl, (s == NULL ||
s->direction == dir) ? pd.src : pd.dst, pd.af,
pd.tot_len, dir == PF_OUT, r->action == PF_PASS,
tr->src.neg);
if (tr->dst.addr.type == PF_ADDR_TABLE)
pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL ||
s->direction == dir) ? pd.dst : pd.src, pd.af,
pd.tot_len, dir == PF_OUT, r->action == PF_PASS,
tr->dst.neg);
}
if (action == PF_SYNPROXY_DROP) {
m_freem(*m0);
*m0 = NULL;
action = PF_PASS;
} else if (r->rt)
/* pf_route6 can free the mbuf causing *m0 to become NULL */
pf_route6(m0, r, dir, ifp, s, &pd);
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
return (action);
}
#endif /* INET6 */
int
pf_check_congestion(struct ifqueue *ifq)
{
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
return (0);
#else
if (ifq->ifq_congestion)
return (1);
else
return (0);
#endif
}
Index: head/sys/contrib/pf/net/pf_if.c
===================================================================
--- head/sys/contrib/pf/net/pf_if.c (revision 183549)
+++ head/sys/contrib/pf/net/pf_if.c (revision 183550)
@@ -1,946 +1,948 @@
/* $OpenBSD: pf_if.c,v 1.46 2006/12/13 09:01:59 itojun Exp $ */
/*
* Copyright 2005 Henning Brauer <henning@openbsd.org>
* Copyright 2005 Ryan McBride <mcbride@openbsd.org>
* Copyright (c) 2001 Daniel Hartmeier
* Copyright (c) 2003 Cedric Berger
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(__FreeBSD__)
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#endif
#include <sys/param.h>
#include <sys/systm.h>
#ifdef __FreeBSD__
#include <sys/malloc.h>
#endif
#include <sys/mbuf.h>
#include <sys/filio.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#ifndef __FreeBSD__
#include <sys/device.h>
#endif
#include <sys/time.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <net/pfvar.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif /* INET6 */
struct pfi_kif *pfi_all = NULL;
struct pfi_statehead pfi_statehead;
#ifdef __FreeBSD__
uma_zone_t pfi_addr_pl;
#else
struct pool pfi_addr_pl;
#endif
struct pfi_ifhead pfi_ifs;
long pfi_update = 1;
struct pfr_addr *pfi_buffer;
int pfi_buffer_cnt;
int pfi_buffer_max;
#ifdef __FreeBSD__
eventhandler_tag pfi_attach_cookie = NULL;
eventhandler_tag pfi_detach_cookie = NULL;
eventhandler_tag pfi_attach_group_cookie = NULL;
eventhandler_tag pfi_change_group_cookie = NULL;
eventhandler_tag pfi_detach_group_cookie = NULL;
eventhandler_tag pfi_ifaddr_event_cookie = NULL;
#endif
void pfi_kif_update(struct pfi_kif *);
void pfi_dynaddr_update(struct pfi_dynaddr *dyn);
void pfi_table_update(struct pfr_ktable *, struct pfi_kif *,
int, int);
void pfi_kifaddr_update(void *);
void pfi_instance_add(struct ifnet *, int, int);
void pfi_address_add(struct sockaddr *, int, int);
int pfi_if_compare(struct pfi_kif *, struct pfi_kif *);
int pfi_skip_if(const char *, struct pfi_kif *);
int pfi_unmask(void *);
#ifdef __FreeBSD__
void pfi_attach_ifnet_event(void * __unused, struct ifnet *);
void pfi_detach_ifnet_event(void * __unused, struct ifnet *);
void pfi_attach_group_event(void * __unused, struct ifg_group *);
void pfi_change_group_event(void * __unused, char *);
void pfi_detach_group_event(void * __unused, struct ifg_group *);
void pfi_ifaddr_event(void * __unused, struct ifnet *);
extern struct ifgrouphead ifg_head;
#endif
RB_PROTOTYPE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare);
RB_GENERATE(pfi_ifhead, pfi_kif, pfik_tree, pfi_if_compare);
#define PFI_BUFFER_MAX 0x10000
#define PFI_MTYPE M_IFADDR
void
pfi_initialize(void)
{
+ INIT_VNET_NET(curvnet);
+
if (pfi_all != NULL) /* already initialized */
return;
TAILQ_INIT(&pfi_statehead);
#ifndef __FreeBSD__
pool_init(&pfi_addr_pl, sizeof(struct pfi_dynaddr), 0, 0, 0,
"pfiaddrpl", &pool_allocator_nointr);
#endif
pfi_buffer_max = 64;
pfi_buffer = malloc(pfi_buffer_max * sizeof(*pfi_buffer),
PFI_MTYPE, M_WAITOK);
if ((pfi_all = pfi_kif_get(IFG_ALL)) == NULL)
panic("pfi_kif_get for pfi_all failed");
#ifdef __FreeBSD__
struct ifg_group *ifg;
struct ifnet *ifp;
IFNET_RLOCK();
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
pfi_attach_ifgroup(ifg);
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
pfi_attach_ifnet(ifp);
IFNET_RUNLOCK();
pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event,
pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event,
pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event,
pfi_attach_group_event, NULL, EVENTHANDLER_PRI_ANY);
pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event,
pfi_change_group_event, NULL, EVENTHANDLER_PRI_ANY);
pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event,
pfi_detach_group_event, NULL, EVENTHANDLER_PRI_ANY);
pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event,
pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
#endif
}
#ifdef __FreeBSD__
void
pfi_cleanup(void)
{
struct pfi_kif *p;
PF_UNLOCK();
EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie);
EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie);
EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie);
EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie);
EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie);
EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie);
PF_LOCK();
pfi_all = NULL;
while ((p = RB_MIN(pfi_ifhead, &pfi_ifs))) {
if (p->pfik_rules || p->pfik_states) {
printf("pfi_cleanup: dangling refs for %s\n",
p->pfik_name);
}
RB_REMOVE(pfi_ifhead, &pfi_ifs, p);
free(p, PFI_MTYPE);
}
free(pfi_buffer, PFI_MTYPE);
}
#endif
struct pfi_kif *
pfi_kif_get(const char *kif_name)
{
struct pfi_kif *kif;
struct pfi_kif_cmp s;
bzero(&s, sizeof(s));
strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
if ((kif = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)&s)) != NULL)
return (kif);
/* create new one */
#ifdef __FreeBSD__
if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_NOWAIT)) == NULL)
#else
if ((kif = malloc(sizeof(*kif), PFI_MTYPE, M_DONTWAIT)) == NULL)
#endif
return (NULL);
bzero(kif, sizeof(*kif));
strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name));
#ifdef __FreeBSD__
/*
* The value of time_second appears to be uninitialized when pf sets
* the interface statistics clear time during the boot phase, if pf
* was statically linked into the kernel. Instead of storing a bogus
* time value, have pfi_get_ifaces handle this case: it uses
* boottime.tv_sec if it sees a time of 0.
*/
kif->pfik_tzero = time_second > 1 ? time_second : 0;
#else
kif->pfik_tzero = time_second;
#endif
TAILQ_INIT(&kif->pfik_dynaddrs);
RB_INSERT(pfi_ifhead, &pfi_ifs, kif);
return (kif);
}
void
pfi_kif_ref(struct pfi_kif *kif, enum pfi_kif_refs what)
{
switch (what) {
case PFI_KIF_REF_RULE:
kif->pfik_rules++;
break;
case PFI_KIF_REF_STATE:
if (!kif->pfik_states++)
TAILQ_INSERT_TAIL(&pfi_statehead, kif, pfik_w_states);
break;
default:
panic("pfi_kif_ref with unknown type");
}
}
void
pfi_kif_unref(struct pfi_kif *kif, enum pfi_kif_refs what)
{
if (kif == NULL)
return;
switch (what) {
case PFI_KIF_REF_NONE:
break;
case PFI_KIF_REF_RULE:
if (kif->pfik_rules <= 0) {
printf("pfi_kif_unref: rules refcount <= 0\n");
return;
}
kif->pfik_rules--;
break;
case PFI_KIF_REF_STATE:
if (kif->pfik_states <= 0) {
printf("pfi_kif_unref: state refcount <= 0\n");
return;
}
if (!--kif->pfik_states)
TAILQ_REMOVE(&pfi_statehead, kif, pfik_w_states);
break;
default:
panic("pfi_kif_unref with unknown type");
}
if (kif->pfik_ifp != NULL || kif->pfik_group != NULL || kif == pfi_all)
return;
if (kif->pfik_rules || kif->pfik_states)
return;
RB_REMOVE(pfi_ifhead, &pfi_ifs, kif);
free(kif, PFI_MTYPE);
}
int
pfi_kif_match(struct pfi_kif *rule_kif, struct pfi_kif *packet_kif)
{
struct ifg_list *p;
if (rule_kif == NULL || rule_kif == packet_kif)
return (1);
if (rule_kif->pfik_group != NULL)
TAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next)
if (p->ifgl_group == rule_kif->pfik_group)
return (1);
return (0);
}
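/*
* Illustrative examples, not part of the original source (kif names
* are hypothetical): a NULL rule kif is a wildcard, and a rule kif
* that represents a group matches every member interface.
*/
#if 0
KASSERT(pfi_kif_match(NULL, kif_em0), ("wildcard matches any kif"));
if (pfi_kif_match(kif_egress, kif_em0))
/* em0 is a member of the "egress" interface group */ ;
#endif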
void
pfi_attach_ifnet(struct ifnet *ifp)
{
struct pfi_kif *kif;
int s;
pfi_initialize();
s = splsoftnet();
pfi_update++;
if ((kif = pfi_kif_get(ifp->if_xname)) == NULL)
panic("pfi_kif_get failed");
kif->pfik_ifp = ifp;
ifp->if_pf_kif = (caddr_t)kif;
#ifndef __FreeBSD__
if ((kif->pfik_ah_cookie = hook_establish(ifp->if_addrhooks, 1,
pfi_kifaddr_update, kif)) == NULL)
panic("pfi_attach_ifnet: cannot allocate '%s' address hook",
ifp->if_xname);
#endif
pfi_kif_update(kif);
splx(s);
}
void
pfi_detach_ifnet(struct ifnet *ifp)
{
int s;
struct pfi_kif *kif;
if ((kif = (struct pfi_kif *)ifp->if_pf_kif) == NULL)
return;
s = splsoftnet();
pfi_update++;
#ifndef __FreeBSD__
hook_disestablish(ifp->if_addrhooks, kif->pfik_ah_cookie);
#endif
pfi_kif_update(kif);
kif->pfik_ifp = NULL;
ifp->if_pf_kif = NULL;
pfi_kif_unref(kif, PFI_KIF_REF_NONE);
splx(s);
}
void
pfi_attach_ifgroup(struct ifg_group *ifg)
{
struct pfi_kif *kif;
int s;
pfi_initialize();
s = splsoftnet();
pfi_update++;
if ((kif = pfi_kif_get(ifg->ifg_group)) == NULL)
panic("pfi_kif_get failed");
kif->pfik_group = ifg;
ifg->ifg_pf_kif = (caddr_t)kif;
splx(s);
}
void
pfi_detach_ifgroup(struct ifg_group *ifg)
{
int s;
struct pfi_kif *kif;
if ((kif = (struct pfi_kif *)ifg->ifg_pf_kif) == NULL)
return;
s = splsoftnet();
pfi_update++;
kif->pfik_group = NULL;
ifg->ifg_pf_kif = NULL;
pfi_kif_unref(kif, PFI_KIF_REF_NONE);
splx(s);
}
void
pfi_group_change(const char *group)
{
struct pfi_kif *kif;
int s;
s = splsoftnet();
pfi_update++;
if ((kif = pfi_kif_get(group)) == NULL)
panic("pfi_kif_get failed");
pfi_kif_update(kif);
splx(s);
}
int
pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af)
{
switch (af) {
#ifdef INET
case AF_INET:
switch (dyn->pfid_acnt4) {
case 0:
return (0);
case 1:
return (PF_MATCHA(0, &dyn->pfid_addr4,
&dyn->pfid_mask4, a, AF_INET));
default:
return (pfr_match_addr(dyn->pfid_kt, a, AF_INET));
}
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
switch (dyn->pfid_acnt6) {
case 0:
return (0);
case 1:
return (PF_MATCHA(0, &dyn->pfid_addr6,
&dyn->pfid_mask6, a, AF_INET6));
default:
return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6));
}
break;
#endif /* INET6 */
default:
return (0);
}
}
int
pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af)
{
struct pfi_dynaddr *dyn;
char tblname[PF_TABLE_NAME_SIZE];
struct pf_ruleset *ruleset = NULL;
int s, rv = 0;
if (aw->type != PF_ADDR_DYNIFTL)
return (0);
if ((dyn = pool_get(&pfi_addr_pl, PR_NOWAIT)) == NULL)
return (1);
bzero(dyn, sizeof(*dyn));
s = splsoftnet();
if (!strcmp(aw->v.ifname, "self"))
dyn->pfid_kif = pfi_kif_get(IFG_ALL);
else
dyn->pfid_kif = pfi_kif_get(aw->v.ifname);
if (dyn->pfid_kif == NULL) {
rv = 1;
goto _bad;
}
pfi_kif_ref(dyn->pfid_kif, PFI_KIF_REF_RULE);
dyn->pfid_net = pfi_unmask(&aw->v.a.mask);
if (af == AF_INET && dyn->pfid_net == 32)
dyn->pfid_net = 128;
strlcpy(tblname, aw->v.ifname, sizeof(tblname));
if (aw->iflags & PFI_AFLAG_NETWORK)
strlcat(tblname, ":network", sizeof(tblname));
if (aw->iflags & PFI_AFLAG_BROADCAST)
strlcat(tblname, ":broadcast", sizeof(tblname));
if (aw->iflags & PFI_AFLAG_PEER)
strlcat(tblname, ":peer", sizeof(tblname));
if (aw->iflags & PFI_AFLAG_NOALIAS)
strlcat(tblname, ":0", sizeof(tblname));
if (dyn->pfid_net != 128)
snprintf(tblname + strlen(tblname),
sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net);
if ((ruleset = pf_find_or_create_ruleset(PF_RESERVED_ANCHOR)) == NULL) {
rv = 1;
goto _bad;
}
if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) {
rv = 1;
goto _bad;
}
dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE;
dyn->pfid_iflags = aw->iflags;
dyn->pfid_af = af;
TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
aw->p.dyn = dyn;
pfi_kif_update(dyn->pfid_kif);
splx(s);
return (0);
_bad:
if (dyn->pfid_kt != NULL)
pfr_detach_table(dyn->pfid_kt);
if (ruleset != NULL)
pf_remove_if_empty_ruleset(ruleset);
if (dyn->pfid_kif != NULL)
pfi_kif_unref(dyn->pfid_kif, PFI_KIF_REF_RULE);
pool_put(&pfi_addr_pl, dyn);
splx(s);
return (rv);
}
void
pfi_kif_update(struct pfi_kif *kif)
{
struct ifg_list *ifgl;
struct pfi_dynaddr *p;
/* update all dynaddr */
TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry)
pfi_dynaddr_update(p);
/* again for all groups kif is member of */
if (kif->pfik_ifp != NULL)
TAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next)
pfi_kif_update((struct pfi_kif *)
ifgl->ifgl_group->ifg_pf_kif);
}
void
pfi_dynaddr_update(struct pfi_dynaddr *dyn)
{
struct pfi_kif *kif;
struct pfr_ktable *kt;
if (dyn == NULL || dyn->pfid_kif == NULL || dyn->pfid_kt == NULL)
panic("pfi_dynaddr_update");
kif = dyn->pfid_kif;
kt = dyn->pfid_kt;
if (kt->pfrkt_larg != pfi_update) {
/* this table needs to be brought up-to-date */
pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags);
kt->pfrkt_larg = pfi_update;
}
pfr_dynaddr_update(kt, dyn);
}
void
pfi_table_update(struct pfr_ktable *kt, struct pfi_kif *kif, int net, int flags)
{
int e, size2 = 0;
struct ifg_member *ifgm;
pfi_buffer_cnt = 0;
if (kif->pfik_ifp != NULL)
pfi_instance_add(kif->pfik_ifp, net, flags);
else if (kif->pfik_group != NULL)
TAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next)
pfi_instance_add(ifgm->ifgm_ifp, net, flags);
if ((e = pfr_set_addrs(&kt->pfrkt_t, pfi_buffer, pfi_buffer_cnt, &size2,
NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK)))
printf("pfi_table_update: cannot set %d new addresses "
"into table %s: %d\n", pfi_buffer_cnt, kt->pfrkt_name, e);
}
void
pfi_instance_add(struct ifnet *ifp, int net, int flags)
{
struct ifaddr *ia;
int got4 = 0, got6 = 0;
int net2, af;
if (ifp == NULL)
return;
TAILQ_FOREACH(ia, &ifp->if_addrlist, ifa_list) {
if (ia->ifa_addr == NULL)
continue;
af = ia->ifa_addr->sa_family;
if (af != AF_INET && af != AF_INET6)
continue;
#ifdef __FreeBSD__
/*
* XXX: For point-to-point interfaces, (ifname:0) and IPv4,
* skip addresses without a proper route, to work around a
* problem with ppp not fully removing the address used
* during IPCP.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) &&
!(ia->ifa_flags & IFA_ROUTE) &&
(flags & PFI_AFLAG_NOALIAS) && (af == AF_INET))
continue;
#endif
if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6)
continue;
if ((flags & PFI_AFLAG_BROADCAST) &&
!(ifp->if_flags & IFF_BROADCAST))
continue;
if ((flags & PFI_AFLAG_PEER) &&
!(ifp->if_flags & IFF_POINTOPOINT))
continue;
if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 &&
IN6_IS_ADDR_LINKLOCAL(
&((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr))
continue;
if (flags & PFI_AFLAG_NOALIAS) {
if (af == AF_INET && got4)
continue;
if (af == AF_INET6 && got6)
continue;
}
if (af == AF_INET)
got4 = 1;
else if (af == AF_INET6)
got6 = 1;
net2 = net;
if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) {
if (af == AF_INET)
net2 = pfi_unmask(&((struct sockaddr_in *)
ia->ifa_netmask)->sin_addr);
else if (af == AF_INET6)
net2 = pfi_unmask(&((struct sockaddr_in6 *)
ia->ifa_netmask)->sin6_addr);
}
if (af == AF_INET && net2 > 32)
net2 = 32;
if (flags & PFI_AFLAG_BROADCAST)
pfi_address_add(ia->ifa_broadaddr, af, net2);
else if (flags & PFI_AFLAG_PEER)
pfi_address_add(ia->ifa_dstaddr, af, net2);
else
pfi_address_add(ia->ifa_addr, af, net2);
}
}
void
pfi_address_add(struct sockaddr *sa, int af, int net)
{
struct pfr_addr *p;
int i;
if (pfi_buffer_cnt >= pfi_buffer_max) {
int new_max = pfi_buffer_max * 2;
if (new_max > PFI_BUFFER_MAX) {
printf("pfi_address_add: address buffer full (%d/%d)\n",
pfi_buffer_cnt, PFI_BUFFER_MAX);
return;
}
p = malloc(new_max * sizeof(*pfi_buffer), PFI_MTYPE,
#ifdef __FreeBSD__
M_NOWAIT);
#else
M_DONTWAIT);
#endif
if (p == NULL) {
printf("pfi_address_add: no memory to grow buffer "
"(%d/%d)\n", pfi_buffer_cnt, PFI_BUFFER_MAX);
return;
}
memcpy(p, pfi_buffer, pfi_buffer_cnt * sizeof(*pfi_buffer));
/* no need to zero buffer */
free(pfi_buffer, PFI_MTYPE);
pfi_buffer = p;
pfi_buffer_max = new_max;
}
if (af == AF_INET && net > 32)
net = 128;
p = pfi_buffer + pfi_buffer_cnt++;
bzero(p, sizeof(*p));
p->pfra_af = af;
p->pfra_net = net;
if (af == AF_INET)
p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr;
else if (af == AF_INET6) {
p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr;
if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr))
p->pfra_ip6addr.s6_addr16[1] = 0;
}
/* mask network address bits */
if (net < 128)
((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8));
for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++)
((caddr_t)p)[i] = 0;
}
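/*
* Illustrative trace, not part of the original source: with net = 20
* the masking above computes 20 / 8 = 2 and ~(0xFF >> (20 % 8)) =
* ~0x0F = 0xF0, so byte 2 keeps only its top nibble and the loop
* zeroes bytes 3..15, leaving a clean 20-bit prefix in pfra_u.
*/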
void
pfi_dynaddr_remove(struct pf_addr_wrap *aw)
{
int s;
if (aw->type != PF_ADDR_DYNIFTL || aw->p.dyn == NULL ||
aw->p.dyn->pfid_kif == NULL || aw->p.dyn->pfid_kt == NULL)
return;
s = splsoftnet();
TAILQ_REMOVE(&aw->p.dyn->pfid_kif->pfik_dynaddrs, aw->p.dyn, entry);
pfi_kif_unref(aw->p.dyn->pfid_kif, PFI_KIF_REF_RULE);
aw->p.dyn->pfid_kif = NULL;
pfr_detach_table(aw->p.dyn->pfid_kt);
aw->p.dyn->pfid_kt = NULL;
pool_put(&pfi_addr_pl, aw->p.dyn);
aw->p.dyn = NULL;
splx(s);
}
void
pfi_dynaddr_copyout(struct pf_addr_wrap *aw)
{
if (aw->type != PF_ADDR_DYNIFTL || aw->p.dyn == NULL ||
aw->p.dyn->pfid_kif == NULL)
return;
aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6;
}
void
pfi_kifaddr_update(void *v)
{
int s;
struct pfi_kif *kif = (struct pfi_kif *)v;
s = splsoftnet();
pfi_update++;
pfi_kif_update(kif);
splx(s);
}
int
pfi_if_compare(struct pfi_kif *p, struct pfi_kif *q)
{
return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ));
}
void
pfi_fill_oldstatus(struct pf_status *pfs)
{
struct pfi_kif *p;
struct pfi_kif_cmp key;
int i, j, k, s;
strlcpy(key.pfik_name, pfs->ifname, sizeof(key.pfik_name));
s = splsoftnet();
p = RB_FIND(pfi_ifhead, &pfi_ifs, (struct pfi_kif *)&key);
if (p == NULL) {
splx(s);
return;
}
bzero(pfs->pcounters, sizeof(pfs->pcounters));
bzero(pfs->bcounters, sizeof(pfs->bcounters));
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
for (k = 0; k < 2; k++) {
pfs->pcounters[i][j][k] =
p->pfik_packets[i][j][k];
pfs->bcounters[i][j] +=
p->pfik_bytes[i][j][k];
}
splx(s);
}
int
pfi_clr_istats(const char *name)
{
struct pfi_kif *p;
int s;
s = splsoftnet();
RB_FOREACH(p, pfi_ifhead, &pfi_ifs) {
if (pfi_skip_if(name, p))
continue;
bzero(p->pfik_packets, sizeof(p->pfik_packets));
bzero(p->pfik_bytes, sizeof(p->pfik_bytes));
p->pfik_tzero = time_second;
}
splx(s);
return (0);
}
int
pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size)
{
struct pfi_kif *p, *nextp;
int s, n = 0;
#ifdef __FreeBSD__
int error;
#endif
s = splsoftnet();
for (p = RB_MIN(pfi_ifhead, &pfi_ifs); p; p = nextp) {
nextp = RB_NEXT(pfi_ifhead, &pfi_ifs, p);
if (pfi_skip_if(name, p))
continue;
if (*size > n++) {
if (!p->pfik_tzero)
p->pfik_tzero = time_second;
pfi_kif_ref(p, PFI_KIF_REF_RULE);
#ifdef __FreeBSD__
PF_COPYOUT(p, buf++, sizeof(*buf), error);
if (error) {
#else
if (copyout(p, buf++, sizeof(*buf))) {
#endif
pfi_kif_unref(p, PFI_KIF_REF_RULE);
splx(s);
return (EFAULT);
}
nextp = RB_NEXT(pfi_ifhead, &pfi_ifs, p);
pfi_kif_unref(p, PFI_KIF_REF_RULE);
}
}
splx(s);
*size = n;
return (0);
}
int
pfi_skip_if(const char *filter, struct pfi_kif *p)
{
int n;
if (filter == NULL || !*filter)
return (0);
if (!strcmp(p->pfik_name, filter))
return (0); /* exact match */
n = strlen(filter);
if (n < 1 || n >= IFNAMSIZ)
return (1); /* sanity check */
if (filter[n-1] >= '0' && filter[n-1] <= '9')
return (1); /* only do exact match in that case */
if (strncmp(p->pfik_name, filter, n))
return (1); /* prefix doesn't match */
return (p->pfik_name[n] < '0' || p->pfik_name[n] > '9');
}
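/*
* Illustrative examples, not part of the original source (the kif
* pointers are hypothetical): a filter without a trailing digit
* matches by prefix, a filter ending in a digit matches exactly, and
* an empty filter skips nothing.
*/
#if 0
(void)pfi_skip_if("em", kif_em0); /* 0: "em" prefix, digit follows */
(void)pfi_skip_if("em0", kif_em1); /* 1: digit suffix, exact only */
(void)pfi_skip_if(NULL, kif_lo0); /* 0: no filter, nothing skipped */
#endif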
int
pfi_set_flags(const char *name, int flags)
{
struct pfi_kif *p;
int s;
s = splsoftnet();
RB_FOREACH(p, pfi_ifhead, &pfi_ifs) {
if (pfi_skip_if(name, p))
continue;
p->pfik_flags |= flags;
}
splx(s);
return (0);
}
int
pfi_clear_flags(const char *name, int flags)
{
struct pfi_kif *p;
int s;
s = splsoftnet();
RB_FOREACH(p, pfi_ifhead, &pfi_ifs) {
if (pfi_skip_if(name, p))
continue;
p->pfik_flags &= ~flags;
}
splx(s);
return (0);
}
/* from pf_print_state.c */
int
pfi_unmask(void *addr)
{
struct pf_addr *m = addr;
int i = 31, j = 0, b = 0;
u_int32_t tmp;
while (j < 4 && m->addr32[j] == 0xffffffff) {
b += 32;
j++;
}
if (j < 4) {
tmp = ntohl(m->addr32[j]);
for (i = 31; tmp & (1 << i); --i)
b++;
}
return (b);
}
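/*
* Illustrative check, not part of the original source: a /24 IPv4 mask
* (255.255.255.0) stores htonl(0xffffff00) in addr32[0]; the scan above
* counts 24 leading one bits and returns a prefix length of 24.
*/
#if 0
struct pf_addr mask;
bzero(&mask, sizeof(mask));
mask.addr32[0] = htonl(0xffffff00);
KASSERT(pfi_unmask(&mask) == 24, ("expected prefix length 24"));
#endif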
#ifdef __FreeBSD__
void
pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp)
{
PF_LOCK();
pfi_attach_ifnet(ifp);
#ifdef ALTQ
pf_altq_ifnet_event(ifp, 0);
#endif
PF_UNLOCK();
}
void
pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp)
{
PF_LOCK();
pfi_detach_ifnet(ifp);
#ifdef ALTQ
pf_altq_ifnet_event(ifp, 1);
#endif
PF_UNLOCK();
}
void
pfi_attach_group_event(void *arg __unused, struct ifg_group *ifg)
{
PF_LOCK();
pfi_attach_ifgroup(ifg);
PF_UNLOCK();
}
void
pfi_change_group_event(void *arg __unused, char *gname)
{
PF_LOCK();
pfi_group_change(gname);
PF_UNLOCK();
}
void
pfi_detach_group_event(void *arg __unused, struct ifg_group *ifg)
{
PF_LOCK();
pfi_detach_ifgroup(ifg);
PF_UNLOCK();
}
void
pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp)
{
PF_LOCK();
if (ifp && ifp->if_pf_kif)
pfi_kifaddr_update(ifp->if_pf_kif);
PF_UNLOCK();
}
#endif /* __FreeBSD__ */
Index: head/sys/contrib/pf/net/pf_ioctl.c
===================================================================
--- head/sys/contrib/pf/net/pf_ioctl.c (revision 183549)
+++ head/sys/contrib/pf/net/pf_ioctl.c (revision 183550)
@@ -1,3895 +1,3897 @@
/* $OpenBSD: pf_ioctl.c,v 1.175 2007/02/26 22:47:43 deraadt Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
* Copyright (c) 2002,2003 Henning Brauer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Effort sponsored in part by the Defense Advanced Research Projects
* Agency (DARPA) and Air Force Research Laboratory, Air Force
* Materiel Command, USAF, under agreement number F30602-01-2-0537.
*
*/
#ifdef __FreeBSD__
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#endif
#ifdef __FreeBSD__
#include "opt_bpf.h"
#include "opt_pf.h"
#ifdef DEV_BPF
#define NBPFILTER DEV_BPF
#else
#define NBPFILTER 0
#endif
#ifdef DEV_PFLOG
#define NPFLOG DEV_PFLOG
#else
#define NPFLOG 0
#endif
#ifdef DEV_PFSYNC
#define NPFSYNC DEV_PFSYNC
#else
#define NPFSYNC 0
#endif
#else
#include "bpfilter.h"
#include "pflog.h"
#include "pfsync.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>
#include <sys/time.h>
#include <sys/malloc.h>
#ifdef __FreeBSD__
#include <sys/module.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#else
#include <sys/timeout.h>
#include <sys/pool.h>
#endif
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/kthread.h>
#ifndef __FreeBSD__
#include <sys/rwlock.h>
#include <uvm/uvm_extern.h>
#endif
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#ifdef __FreeBSD__
#include <sys/md5.h>
#else
#include <dev/rndvar.h>
#include <crypto/md5.h>
#endif
#include <net/pfvar.h>
#if NPFSYNC > 0
#include <net/if_pfsync.h>
#endif /* NPFSYNC > 0 */
#include <net/if_pflog.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/in_pcb.h>
#endif /* INET6 */
#ifdef ALTQ
#include <altq/altq.h>
#endif
#ifdef __FreeBSD__
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <net/pfil.h>
#endif /* __FreeBSD__ */
#ifdef __FreeBSD__
void init_zone_var(void);
void cleanup_pf_zone(void);
int pfattach(void);
#else
void pfattach(int);
void pf_thread_create(void *);
int pfopen(dev_t, int, int, struct proc *);
int pfclose(dev_t, int, int, struct proc *);
#endif
struct pf_pool *pf_get_pool(char *, u_int32_t, u_int8_t, u_int32_t,
u_int8_t, u_int8_t, u_int8_t);
void pf_mv_pool(struct pf_palist *, struct pf_palist *);
void pf_empty_pool(struct pf_palist *);
#ifdef __FreeBSD__
int pfioctl(struct cdev *, u_long, caddr_t, int, struct thread *);
#else
int pfioctl(struct cdev *, u_long, caddr_t, int, struct proc *);
#endif
#ifdef ALTQ
int pf_begin_altq(u_int32_t *);
int pf_rollback_altq(u_int32_t);
int pf_commit_altq(u_int32_t);
int pf_enable_altq(struct pf_altq *);
int pf_disable_altq(struct pf_altq *);
#endif /* ALTQ */
int pf_begin_rules(u_int32_t *, int, const char *);
int pf_rollback_rules(u_int32_t, int, char *);
int pf_setup_pfsync_matching(struct pf_ruleset *);
void pf_hash_rule(MD5_CTX *, struct pf_rule *);
void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *);
int pf_commit_rules(u_int32_t, int, char *);
struct pf_rule pf_default_rule;
#ifdef __FreeBSD__
struct sx pf_consistency_lock;
SX_SYSINIT(pf_consistency_lock, &pf_consistency_lock, "pf_statetbl_lock");
#else
struct rwlock pf_consistency_lock = RWLOCK_INITIALIZER;
#endif
#ifdef ALTQ
static int pf_altq_running;
#endif
#define TAGID_MAX 50000
TAILQ_HEAD(pf_tags, pf_tagname) pf_tags = TAILQ_HEAD_INITIALIZER(pf_tags),
pf_qids = TAILQ_HEAD_INITIALIZER(pf_qids);
#if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
#error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
#endif
u_int16_t tagname2tag(struct pf_tags *, char *);
void tag2tagname(struct pf_tags *, u_int16_t, char *);
void tag_unref(struct pf_tags *, u_int16_t);
int pf_rtlabel_add(struct pf_addr_wrap *);
void pf_rtlabel_remove(struct pf_addr_wrap *);
void pf_rtlabel_copyout(struct pf_addr_wrap *);
#define DPFPRINTF(n, x) if (pf_status.debug >= (n)) printf x
#ifdef __FreeBSD__
static struct cdev *pf_dev;
/*
* XXX - These are new and need to be checked when moving to a new version
*/
static void pf_clear_states(void);
static int pf_clear_tables(void);
static void pf_clear_srcnodes(void);
/*
* XXX - These are new and need to be checked when moving to a new version
*/
/*
* Wrapper functions for pfil(9) hooks
*/
static int pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp,
int dir, struct inpcb *inp);
static int pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp,
int dir, struct inpcb *inp);
#ifdef INET6
static int pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp,
int dir, struct inpcb *inp);
static int pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp,
int dir, struct inpcb *inp);
#endif
static int hook_pf(void);
static int dehook_pf(void);
static int shutdown_pf(void);
static int pf_load(void);
static int pf_unload(void);
static struct cdevsw pf_cdevsw = {
.d_ioctl = pfioctl,
.d_name = PF_NAME,
.d_version = D_VERSION,
};
static volatile int pf_pfil_hooked = 0;
int pf_end_threads = 0;
struct mtx pf_task_mtx;
pflog_packet_t *pflog_packet_ptr = NULL;
int debug_pfugidhack = 0;
SYSCTL_INT(_debug, OID_AUTO, pfugidhack, CTLFLAG_RW, &debug_pfugidhack, 0,
"Enable/disable pf user/group rules mpsafe hack");
void
init_pf_mutex(void)
{
mtx_init(&pf_task_mtx, "pf task mtx", NULL, MTX_DEF);
}
void
destroy_pf_mutex(void)
{
mtx_destroy(&pf_task_mtx);
}
void
init_zone_var(void)
{
pf_src_tree_pl = pf_rule_pl = NULL;
pf_state_pl = pf_altq_pl = pf_pooladdr_pl = NULL;
pf_frent_pl = pf_frag_pl = pf_cache_pl = pf_cent_pl = NULL;
pf_state_scrub_pl = NULL;
pfr_ktable_pl = pfr_kentry_pl = NULL;
}
void
cleanup_pf_zone(void)
{
UMA_DESTROY(pf_src_tree_pl);
UMA_DESTROY(pf_rule_pl);
UMA_DESTROY(pf_state_pl);
UMA_DESTROY(pf_altq_pl);
UMA_DESTROY(pf_pooladdr_pl);
UMA_DESTROY(pf_frent_pl);
UMA_DESTROY(pf_frag_pl);
UMA_DESTROY(pf_cache_pl);
UMA_DESTROY(pf_cent_pl);
UMA_DESTROY(pfr_ktable_pl);
UMA_DESTROY(pfr_kentry_pl2);
UMA_DESTROY(pfr_kentry_pl);
UMA_DESTROY(pf_state_scrub_pl);
UMA_DESTROY(pfi_addr_pl);
}
int
pfattach(void)
{
u_int32_t *my_timeout = pf_default_rule.timeout;
int error = 1;
do {
UMA_CREATE(pf_src_tree_pl,struct pf_src_node, "pfsrctrpl");
UMA_CREATE(pf_rule_pl, struct pf_rule, "pfrulepl");
UMA_CREATE(pf_state_pl, struct pf_state, "pfstatepl");
UMA_CREATE(pf_altq_pl, struct pf_altq, "pfaltqpl");
UMA_CREATE(pf_pooladdr_pl, struct pf_pooladdr, "pfpooladdrpl");
UMA_CREATE(pfr_ktable_pl, struct pfr_ktable, "pfrktable");
UMA_CREATE(pfr_kentry_pl, struct pfr_kentry, "pfrkentry");
UMA_CREATE(pfr_kentry_pl2, struct pfr_kentry, "pfrkentry2");
UMA_CREATE(pf_frent_pl, struct pf_frent, "pffrent");
UMA_CREATE(pf_frag_pl, struct pf_fragment, "pffrag");
UMA_CREATE(pf_cache_pl, struct pf_fragment, "pffrcache");
UMA_CREATE(pf_cent_pl, struct pf_frcache, "pffrcent");
UMA_CREATE(pf_state_scrub_pl, struct pf_state_scrub,
"pfstatescrub");
UMA_CREATE(pfi_addr_pl, struct pfi_dynaddr, "pfiaddrpl");
error = 0;
} while(0);
if (error) {
cleanup_pf_zone();
return (error);
}
pfr_initialize();
pfi_initialize();
if ( (error = pf_osfp_initialize()) ) {
cleanup_pf_zone();
pf_osfp_cleanup();
return (error);
}
pf_pool_limits[PF_LIMIT_STATES].pp = pf_state_pl;
pf_pool_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT;
pf_pool_limits[PF_LIMIT_SRC_NODES].pp = pf_src_tree_pl;
pf_pool_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT;
pf_pool_limits[PF_LIMIT_FRAGS].pp = pf_frent_pl;
pf_pool_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT;
pf_pool_limits[PF_LIMIT_TABLES].pp = pfr_ktable_pl;
pf_pool_limits[PF_LIMIT_TABLES].limit = PFR_KTABLE_HIWAT;
pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].pp = pfr_kentry_pl;
pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].limit = PFR_KENTRY_HIWAT;
uma_zone_set_max(pf_pool_limits[PF_LIMIT_STATES].pp,
pf_pool_limits[PF_LIMIT_STATES].limit);
RB_INIT(&tree_src_tracking);
RB_INIT(&pf_anchors);
pf_init_ruleset(&pf_main_ruleset);
TAILQ_INIT(&pf_altqs[0]);
TAILQ_INIT(&pf_altqs[1]);
TAILQ_INIT(&pf_pabuf);
pf_altqs_active = &pf_altqs[0];
pf_altqs_inactive = &pf_altqs[1];
TAILQ_INIT(&state_list);
/* default rule should never be garbage collected */
pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
pf_default_rule.action = PF_PASS;
pf_default_rule.nr = -1;
pf_default_rule.rtableid = -1;
/* initialize default timeouts */
my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL;
my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL;
my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL;
my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL;
my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL;
my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL;
my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL;
my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL;
my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL;
my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL;
my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL;
my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL;
my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL;
my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL;
my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL;
my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL;
my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL;
my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL;
my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START;
my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END;
pf_normalize_init();
bzero(&pf_status, sizeof(pf_status));
pf_status.debug = PF_DEBUG_URGENT;
pf_pfil_hooked = 0;
/* XXX do our best to avoid a conflict */
pf_status.hostid = arc4random();
if (kproc_create(pf_purge_thread, NULL, NULL, 0, 0, "pfpurge"))
return (ENXIO);
return (error);
}
#else /* !__FreeBSD__ */
void
pfattach(int num)
{
u_int32_t *timeout = pf_default_rule.timeout;
pool_init(&pf_rule_pl, sizeof(struct pf_rule), 0, 0, 0, "pfrulepl",
&pool_allocator_nointr);
pool_init(&pf_src_tree_pl, sizeof(struct pf_src_node), 0, 0, 0,
"pfsrctrpl", NULL);
pool_init(&pf_state_pl, sizeof(struct pf_state), 0, 0, 0, "pfstatepl",
NULL);
pool_init(&pf_altq_pl, sizeof(struct pf_altq), 0, 0, 0, "pfaltqpl",
&pool_allocator_nointr);
pool_init(&pf_pooladdr_pl, sizeof(struct pf_pooladdr), 0, 0, 0,
"pfpooladdrpl", &pool_allocator_nointr);
pfr_initialize();
pfi_initialize();
pf_osfp_initialize();
pool_sethardlimit(pf_pool_limits[PF_LIMIT_STATES].pp,
pf_pool_limits[PF_LIMIT_STATES].limit, NULL, 0);
if (ctob(physmem) <= 100*1024*1024)
pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].limit =
PFR_KENTRY_HIWAT_SMALL;
RB_INIT(&tree_src_tracking);
RB_INIT(&pf_anchors);
pf_init_ruleset(&pf_main_ruleset);
TAILQ_INIT(&pf_altqs[0]);
TAILQ_INIT(&pf_altqs[1]);
TAILQ_INIT(&pf_pabuf);
pf_altqs_active = &pf_altqs[0];
pf_altqs_inactive = &pf_altqs[1];
TAILQ_INIT(&state_list);
/* default rule should never be garbage collected */
pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
pf_default_rule.action = PF_PASS;
pf_default_rule.nr = -1;
pf_default_rule.rtableid = -1;
/* initialize default timeouts */
timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL;
timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL;
timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL;
timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL;
timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL;
timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL;
timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL;
timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL;
timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL;
timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL;
timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL;
timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL;
timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL;
timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL;
timeout[PFTM_FRAG] = PFTM_FRAG_VAL;
timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL;
timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL;
timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL;
timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START;
timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END;
pf_normalize_init();
bzero(&pf_status, sizeof(pf_status));
pf_status.debug = PF_DEBUG_URGENT;
/* XXX do our best to avoid a conflict */
pf_status.hostid = arc4random();
/* require process context to purge states, so perform in a thread */
kproc_create_deferred(pf_thread_create, NULL);
}
void
pf_thread_create(void *v)
{
if (kproc_create(pf_purge_thread, NULL, NULL, "pfpurge"))
panic("pfpurge thread");
}
int
pfopen(struct cdev *dev, int flags, int fmt, struct proc *p)
{
if (dev2unit(dev) >= 1)
return (ENXIO);
return (0);
}
int
pfclose(struct cdev *dev, int flags, int fmt, struct proc *p)
{
if (dev2unit(dev) >= 1)
return (ENXIO);
return (0);
}
#endif /* __FreeBSD__ */
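/*
* Find the address pool of the rule identified by anchor, action and
* rule number in either the active or inactive queue; r_last selects
* the last rule in the queue instead of a lookup by number, and
* check_ticket optionally validates the caller's ticket first.
*/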
struct pf_pool *
pf_get_pool(char *anchor, u_int32_t ticket, u_int8_t rule_action,
u_int32_t rule_number, u_int8_t r_last, u_int8_t active,
u_int8_t check_ticket)
{
struct pf_ruleset *ruleset;
struct pf_rule *rule;
int rs_num;
ruleset = pf_find_ruleset(anchor);
if (ruleset == NULL)
return (NULL);
rs_num = pf_get_ruleset_number(rule_action);
if (rs_num >= PF_RULESET_MAX)
return (NULL);
if (active) {
if (check_ticket && ticket !=
ruleset->rules[rs_num].active.ticket)
return (NULL);
if (r_last)
rule = TAILQ_LAST(ruleset->rules[rs_num].active.ptr,
pf_rulequeue);
else
rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
} else {
if (check_ticket && ticket !=
ruleset->rules[rs_num].inactive.ticket)
return (NULL);
if (r_last)
rule = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
pf_rulequeue);
else
rule = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr);
}
if (!r_last) {
while ((rule != NULL) && (rule->nr != rule_number))
rule = TAILQ_NEXT(rule, entries);
}
if (rule == NULL)
return (NULL);
return (&rule->rpool);
}
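/*
* Move every pool address from poola to poolb, e.g. from the staging
* buffer pf_pabuf into a newly added rule's own pool.
*/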
void
pf_mv_pool(struct pf_palist *poola, struct pf_palist *poolb)
{
struct pf_pooladdr *mv_pool_pa;
while ((mv_pool_pa = TAILQ_FIRST(poola)) != NULL) {
TAILQ_REMOVE(poola, mv_pool_pa, entries);
TAILQ_INSERT_TAIL(poolb, mv_pool_pa, entries);
}
}
void
pf_empty_pool(struct pf_palist *poola)
{
struct pf_pooladdr *empty_pool_pa;
while ((empty_pool_pa = TAILQ_FIRST(poola)) != NULL) {
pfi_dynaddr_remove(&empty_pool_pa->addr);
pf_tbladdr_remove(&empty_pool_pa->addr);
pfi_kif_unref(empty_pool_pa->kif, PFI_KIF_REF_RULE);
TAILQ_REMOVE(poola, empty_pool_pa, entries);
pool_put(&pf_pooladdr_pl, empty_pool_pa);
}
}
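/*
* Unlink a rule from its queue and free it once nothing references
* it any more: a rule still holding states or source nodes is only
* detached here and is finally released when the last reference
* goes away.
*/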
void
pf_rm_rule(struct pf_rulequeue *rulequeue, struct pf_rule *rule)
{
if (rulequeue != NULL) {
if (rule->states <= 0) {
/*
* XXX - we need to remove the table *before* detaching
* the rule to make sure the table code does not delete
* the anchor under our feet.
*/
pf_tbladdr_remove(&rule->src.addr);
pf_tbladdr_remove(&rule->dst.addr);
if (rule->overload_tbl)
pfr_detach_table(rule->overload_tbl);
}
TAILQ_REMOVE(rulequeue, rule, entries);
rule->entries.tqe_prev = NULL;
rule->nr = -1;
}
if (rule->states > 0 || rule->src_nodes > 0 ||
rule->entries.tqe_prev != NULL)
return;
pf_tag_unref(rule->tag);
pf_tag_unref(rule->match_tag);
#ifdef ALTQ
if (rule->pqid != rule->qid)
pf_qid_unref(rule->pqid);
pf_qid_unref(rule->qid);
#endif
pf_rtlabel_remove(&rule->src.addr);
pf_rtlabel_remove(&rule->dst.addr);
pfi_dynaddr_remove(&rule->src.addr);
pfi_dynaddr_remove(&rule->dst.addr);
if (rulequeue == NULL) {
pf_tbladdr_remove(&rule->src.addr);
pf_tbladdr_remove(&rule->dst.addr);
if (rule->overload_tbl)
pfr_detach_table(rule->overload_tbl);
}
pfi_kif_unref(rule->kif, PFI_KIF_REF_RULE);
pf_anchor_remove(rule);
pf_empty_pool(&rule->rpool.list);
pool_put(&pf_rule_pl, rule);
}
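/*
* Map a tag name to its numeric id, allocating and inserting a new
* pf_tagname entry when the name is not yet known.  Entries are
* reference counted; tag_unref() below frees an entry when its last
* reference is dropped.
*/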
u_int16_t
tagname2tag(struct pf_tags *head, char *tagname)
{
struct pf_tagname *tag, *p = NULL;
u_int16_t new_tagid = 1;
TAILQ_FOREACH(tag, head, entries)
if (strcmp(tagname, tag->name) == 0) {
tag->ref++;
return (tag->tag);
}
/*
* To avoid fragmentation, do a linear search from the beginning and
* take the first free slot found.  If there is none or the list is
* empty, append a new entry at the end.
*/
/* new entry */
if (!TAILQ_EMPTY(head))
for (p = TAILQ_FIRST(head); p != NULL &&
p->tag == new_tagid; p = TAILQ_NEXT(p, entries))
new_tagid = p->tag + 1;
if (new_tagid > TAGID_MAX)
return (0);
/* allocate and fill new struct pf_tagname */
tag = (struct pf_tagname *)malloc(sizeof(struct pf_tagname),
M_TEMP, M_NOWAIT);
if (tag == NULL)
return (0);
bzero(tag, sizeof(struct pf_tagname));
strlcpy(tag->name, tagname, sizeof(tag->name));
tag->tag = new_tagid;
tag->ref++;
if (p != NULL) /* insert new entry before p */
TAILQ_INSERT_BEFORE(p, tag, entries);
else /* either list empty or no free slot in between */
TAILQ_INSERT_TAIL(head, tag, entries);
return (tag->tag);
}
void
tag2tagname(struct pf_tags *head, u_int16_t tagid, char *p)
{
struct pf_tagname *tag;
TAILQ_FOREACH(tag, head, entries)
if (tag->tag == tagid) {
strlcpy(p, tag->name, PF_TAG_NAME_SIZE);
return;
}
}
void
tag_unref(struct pf_tags *head, u_int16_t tag)
{
struct pf_tagname *p, *next;
if (tag == 0)
return;
for (p = TAILQ_FIRST(head); p != NULL; p = next) {
next = TAILQ_NEXT(p, entries);
if (tag == p->tag) {
if (--p->ref == 0) {
TAILQ_REMOVE(head, p, entries);
free(p, M_TEMP);
}
break;
}
}
}
u_int16_t
pf_tagname2tag(char *tagname)
{
return (tagname2tag(&pf_tags, tagname));
}
void
pf_tag2tagname(u_int16_t tagid, char *p)
{
tag2tagname(&pf_tags, tagid, p);
}
void
pf_tag_ref(u_int16_t tag)
{
struct pf_tagname *t;
TAILQ_FOREACH(t, &pf_tags, entries)
if (t->tag == tag)
break;
if (t != NULL)
t->ref++;
}
void
pf_tag_unref(u_int16_t tag)
{
tag_unref(&pf_tags, tag);
}
int
pf_rtlabel_add(struct pf_addr_wrap *a)
{
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
return (0);
#else
if (a->type == PF_ADDR_RTLABEL &&
(a->v.rtlabel = rtlabel_name2id(a->v.rtlabelname)) == 0)
return (-1);
return (0);
#endif
}
void
pf_rtlabel_remove(struct pf_addr_wrap *a)
{
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
#else
if (a->type == PF_ADDR_RTLABEL)
rtlabel_unref(a->v.rtlabel);
#endif
}
void
pf_rtlabel_copyout(struct pf_addr_wrap *a)
{
#ifdef __FreeBSD__
/* XXX_IMPORT: later */
if (a->type == PF_ADDR_RTLABEL && a->v.rtlabel)
strlcpy(a->v.rtlabelname, "?", sizeof(a->v.rtlabelname));
#else
const char *name;
if (a->type == PF_ADDR_RTLABEL && a->v.rtlabel) {
if ((name = rtlabel_id2name(a->v.rtlabel)) == NULL)
strlcpy(a->v.rtlabelname, "?",
sizeof(a->v.rtlabelname));
else
strlcpy(a->v.rtlabelname, name,
sizeof(a->v.rtlabelname));
}
#endif
}
#ifdef ALTQ
u_int32_t
pf_qname2qid(char *qname)
{
return ((u_int32_t)tagname2tag(&pf_qids, qname));
}
void
pf_qid2qname(u_int32_t qid, char *p)
{
tag2tagname(&pf_qids, (u_int16_t)qid, p);
}
void
pf_qid_unref(u_int32_t qid)
{
tag_unref(&pf_qids, (u_int16_t)qid);
}
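/*
* Altq configuration follows the same ticket-based staging model as
* rulesets: pf_begin_altq() purges any previously staged list and
* opens a new transaction on the inactive altq list.
*/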
int
pf_begin_altq(u_int32_t *ticket)
{
struct pf_altq *altq;
int error = 0;
/* Purge the old altq list */
while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) {
TAILQ_REMOVE(pf_altqs_inactive, altq, entries);
#ifdef __FreeBSD__
if (altq->qname[0] == 0 &&
(altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
#else
if (altq->qname[0] == 0) {
#endif
/* detach and destroy the discipline */
error = altq_remove(altq);
} else
pf_qid_unref(altq->qid);
pool_put(&pf_altq_pl, altq);
}
if (error)
return (error);
*ticket = ++ticket_altqs_inactive;
altqs_inactive_open = 1;
return (0);
}
int
pf_rollback_altq(u_int32_t ticket)
{
struct pf_altq *altq;
int error = 0;
if (!altqs_inactive_open || ticket != ticket_altqs_inactive)
return (0);
/* Purge the old altq list */
while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) {
TAILQ_REMOVE(pf_altqs_inactive, altq, entries);
#ifdef __FreeBSD__
if (altq->qname[0] == 0 &&
(altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
#else
if (altq->qname[0] == 0) {
#endif
/* detach and destroy the discipline */
error = altq_remove(altq);
} else
pf_qid_unref(altq->qid);
pool_put(&pf_altq_pl, altq);
}
altqs_inactive_open = 0;
return (error);
}
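/*
* Activate the staged altq list: swap the active and inactive lists,
* attach and (if altq is running) enable the new disciplines, then
* detach and free everything on the now-inactive old list.
*/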
int
pf_commit_altq(u_int32_t ticket)
{
struct pf_altqqueue *old_altqs;
struct pf_altq *altq;
int s, err, error = 0;
if (!altqs_inactive_open || ticket != ticket_altqs_inactive)
return (EBUSY);
/* swap altqs, keep the old. */
s = splsoftnet();
old_altqs = pf_altqs_active;
pf_altqs_active = pf_altqs_inactive;
pf_altqs_inactive = old_altqs;
ticket_altqs_active = ticket_altqs_inactive;
/* Attach new disciplines */
TAILQ_FOREACH(altq, pf_altqs_active, entries) {
#ifdef __FreeBSD__
if (altq->qname[0] == 0 &&
(altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
#else
if (altq->qname[0] == 0) {
#endif
/* attach the discipline */
error = altq_pfattach(altq);
if (error == 0 && pf_altq_running)
error = pf_enable_altq(altq);
if (error != 0) {
splx(s);
return (error);
}
}
}
/* Purge the old altq list */
while ((altq = TAILQ_FIRST(pf_altqs_inactive)) != NULL) {
TAILQ_REMOVE(pf_altqs_inactive, altq, entries);
#ifdef __FreeBSD__
if (altq->qname[0] == 0 &&
(altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
#else
if (altq->qname[0] == 0) {
#endif
/* detach and destroy the discipline */
if (pf_altq_running)
error = pf_disable_altq(altq);
err = altq_pfdetach(altq);
if (err != 0 && error == 0)
error = err;
err = altq_remove(altq);
if (err != 0 && error == 0)
error = err;
} else
pf_qid_unref(altq->qid);
pool_put(&pf_altq_pl, altq);
}
splx(s);
altqs_inactive_open = 0;
return (error);
}
int
pf_enable_altq(struct pf_altq *altq)
{
struct ifnet *ifp;
struct tb_profile tb;
int s, error = 0;
if ((ifp = ifunit(altq->ifname)) == NULL)
return (EINVAL);
if (ifp->if_snd.altq_type != ALTQT_NONE)
error = altq_enable(&ifp->if_snd);
/* set tokenbucket regulator */
if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
tb.rate = altq->ifbandwidth;
tb.depth = altq->tbrsize;
s = splnet();
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
error = tbr_set(&ifp->if_snd, &tb);
#ifdef __FreeBSD__
PF_LOCK();
#endif
splx(s);
}
return (error);
}
int
pf_disable_altq(struct pf_altq *altq)
{
struct ifnet *ifp;
struct tb_profile tb;
int s, error;
if ((ifp = ifunit(altq->ifname)) == NULL)
return (EINVAL);
/*
* If the discipline is no longer referenced, it was overridden by a
* new one; in that case there is nothing to disable, so just return.
*/
if (altq->altq_disc != ifp->if_snd.altq_disc)
return (0);
error = altq_disable(&ifp->if_snd);
if (error == 0) {
/* clear tokenbucket regulator */
tb.rate = 0;
s = splnet();
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
error = tbr_set(&ifp->if_snd, &tb);
#ifdef __FreeBSD__
PF_LOCK();
#endif
splx(s);
}
return (error);
}
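/*
* Handle interface arrival or departure: abort any open userland
* altq transaction, rebuild the inactive list from the active one
* while marking queues whose interface is missing (or is the one
* being removed) with PFALTQ_FLAG_IF_REMOVED, and commit the result.
*/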
#ifdef __FreeBSD__
void
pf_altq_ifnet_event(struct ifnet *ifp, int remove)
{
struct ifnet *ifp1;
struct pf_altq *a1, *a2, *a3;
u_int32_t ticket;
int error = 0;
/* Interrupt userland queue modifications */
if (altqs_inactive_open)
pf_rollback_altq(ticket_altqs_inactive);
/* Start new altq ruleset */
if (pf_begin_altq(&ticket))
return;
/* Copy the current active set */
TAILQ_FOREACH(a1, pf_altqs_active, entries) {
a2 = pool_get(&pf_altq_pl, PR_NOWAIT);
if (a2 == NULL) {
error = ENOMEM;
break;
}
bcopy(a1, a2, sizeof(struct pf_altq));
if (a2->qname[0] != 0) {
if ((a2->qid = pf_qname2qid(a2->qname)) == 0) {
error = EBUSY;
pool_put(&pf_altq_pl, a2);
break;
}
a2->altq_disc = NULL;
TAILQ_FOREACH(a3, pf_altqs_inactive, entries) {
if (strncmp(a3->ifname, a2->ifname,
IFNAMSIZ) == 0 && a3->qname[0] == 0) {
a2->altq_disc = a3->altq_disc;
break;
}
}
}
/* Flag the copied queue if its interface is gone or being removed */
a2->local_flags &= ~PFALTQ_FLAG_IF_REMOVED;
if ((ifp1 = ifunit(a2->ifname)) == NULL ||
(remove && ifp1 == ifp)) {
a2->local_flags |= PFALTQ_FLAG_IF_REMOVED;
} else {
PF_UNLOCK();
error = altq_add(a2);
PF_LOCK();
if (ticket != ticket_altqs_inactive)
error = EBUSY;
if (error) {
pool_put(&pf_altq_pl, a2);
break;
}
}
TAILQ_INSERT_TAIL(pf_altqs_inactive, a2, entries);
}
if (error != 0)
pf_rollback_altq(ticket);
else
pf_commit_altq(ticket);
}
#endif
#endif /* ALTQ */
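/*
* Rule changes are transactional as well: pf_begin_rules() empties
* the inactive queue and hands out a ticket, new rules are loaded
* into the inactive queue under that ticket, and pf_commit_rules()
* swaps it with the active queue (pf_rollback_rules() discards it).
*/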
int
pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor)
{
struct pf_ruleset *rs;
struct pf_rule *rule;
if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
return (EINVAL);
rs = pf_find_or_create_ruleset(anchor);
if (rs == NULL)
return (EINVAL);
while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) {
pf_rm_rule(rs->rules[rs_num].inactive.ptr, rule);
rs->rules[rs_num].inactive.rcount--;
}
*ticket = ++rs->rules[rs_num].inactive.ticket;
rs->rules[rs_num].inactive.open = 1;
return (0);
}
int
pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor)
{
struct pf_ruleset *rs;
struct pf_rule *rule;
if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
return (EINVAL);
rs = pf_find_ruleset(anchor);
if (rs == NULL || !rs->rules[rs_num].inactive.open ||
rs->rules[rs_num].inactive.ticket != ticket)
return (0);
while ((rule = TAILQ_FIRST(rs->rules[rs_num].inactive.ptr)) != NULL) {
pf_rm_rule(rs->rules[rs_num].inactive.ptr, rule);
rs->rules[rs_num].inactive.rcount--;
}
rs->rules[rs_num].inactive.open = 0;
return (0);
}
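/*
* The PF_MD5_* macros below feed individual rule fields into an MD5
* context; the resulting digest becomes the ruleset checksum that
* pfsync uses to verify that peers run matching rulesets.
*/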
#define PF_MD5_UPD(st, elm) \
MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm))
#define PF_MD5_UPD_STR(st, elm) \
MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm))
#define PF_MD5_UPD_HTONL(st, elm, stor) do { \
(stor) = htonl((st)->elm); \
MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\
} while (0)
#define PF_MD5_UPD_HTONS(st, elm, stor) do { \
(stor) = htons((st)->elm); \
MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\
} while (0)
void
pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr)
{
PF_MD5_UPD(pfr, addr.type);
switch (pfr->addr.type) {
case PF_ADDR_DYNIFTL:
PF_MD5_UPD(pfr, addr.v.ifname);
PF_MD5_UPD(pfr, addr.iflags);
break;
case PF_ADDR_TABLE:
PF_MD5_UPD(pfr, addr.v.tblname);
break;
case PF_ADDR_ADDRMASK:
/* XXX ignore af? */
PF_MD5_UPD(pfr, addr.v.a.addr.addr32);
PF_MD5_UPD(pfr, addr.v.a.mask.addr32);
break;
case PF_ADDR_RTLABEL:
PF_MD5_UPD(pfr, addr.v.rtlabelname);
break;
}
PF_MD5_UPD(pfr, port[0]);
PF_MD5_UPD(pfr, port[1]);
PF_MD5_UPD(pfr, neg);
PF_MD5_UPD(pfr, port_op);
}
void
pf_hash_rule(MD5_CTX *ctx, struct pf_rule *rule)
{
u_int16_t x;
u_int32_t y;
pf_hash_rule_addr(ctx, &rule->src);
pf_hash_rule_addr(ctx, &rule->dst);
PF_MD5_UPD_STR(rule, label);
PF_MD5_UPD_STR(rule, ifname);
PF_MD5_UPD_STR(rule, match_tagname);
PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */
PF_MD5_UPD_HTONL(rule, os_fingerprint, y);
PF_MD5_UPD_HTONL(rule, prob, y);
PF_MD5_UPD_HTONL(rule, uid.uid[0], y);
PF_MD5_UPD_HTONL(rule, uid.uid[1], y);
PF_MD5_UPD(rule, uid.op);
PF_MD5_UPD_HTONL(rule, gid.gid[0], y);
PF_MD5_UPD_HTONL(rule, gid.gid[1], y);
PF_MD5_UPD(rule, gid.op);
PF_MD5_UPD_HTONL(rule, rule_flag, y);
PF_MD5_UPD(rule, action);
PF_MD5_UPD(rule, direction);
PF_MD5_UPD(rule, af);
PF_MD5_UPD(rule, quick);
PF_MD5_UPD(rule, ifnot);
PF_MD5_UPD(rule, match_tag_not);
PF_MD5_UPD(rule, natpass);
PF_MD5_UPD(rule, keep_state);
PF_MD5_UPD(rule, proto);
PF_MD5_UPD(rule, type);
PF_MD5_UPD(rule, code);
PF_MD5_UPD(rule, flags);
PF_MD5_UPD(rule, flagset);
PF_MD5_UPD(rule, allow_opts);
PF_MD5_UPD(rule, rt);
PF_MD5_UPD(rule, tos);
}
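/*
* Atomically activate a staged ruleset: recompute the pfsync
* checksum for the main ruleset, swap the active and inactive
* queues at splsoftnet(), recalculate skip steps and purge the
* old rules.
*/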
int
pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
{
struct pf_ruleset *rs;
struct pf_rule *rule, **old_array;
struct pf_rulequeue *old_rules;
int s, error;
u_int32_t old_rcount;
if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
return (EINVAL);
rs = pf_find_ruleset(anchor);
if (rs == NULL || !rs->rules[rs_num].inactive.open ||
ticket != rs->rules[rs_num].inactive.ticket)
return (EBUSY);
/* Calculate checksum for the main ruleset */
if (rs == &pf_main_ruleset) {
error = pf_setup_pfsync_matching(rs);
if (error != 0)
return (error);
}
/* Swap rules, keep the old. */
s = splsoftnet();
old_rules = rs->rules[rs_num].active.ptr;
old_rcount = rs->rules[rs_num].active.rcount;
old_array = rs->rules[rs_num].active.ptr_array;
rs->rules[rs_num].active.ptr =
rs->rules[rs_num].inactive.ptr;
rs->rules[rs_num].active.ptr_array =
rs->rules[rs_num].inactive.ptr_array;
rs->rules[rs_num].active.rcount =
rs->rules[rs_num].inactive.rcount;
rs->rules[rs_num].inactive.ptr = old_rules;
rs->rules[rs_num].inactive.ptr_array = old_array;
rs->rules[rs_num].inactive.rcount = old_rcount;
rs->rules[rs_num].active.ticket =
rs->rules[rs_num].inactive.ticket;
pf_calc_skip_steps(rs->rules[rs_num].active.ptr);
/* Purge the old rule list. */
while ((rule = TAILQ_FIRST(old_rules)) != NULL)
pf_rm_rule(old_rules, rule);
if (rs->rules[rs_num].inactive.ptr_array)
free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
rs->rules[rs_num].inactive.ptr_array = NULL;
rs->rules[rs_num].inactive.rcount = 0;
rs->rules[rs_num].inactive.open = 0;
pf_remove_if_empty_ruleset(rs);
splx(s);
return (0);
}
int
pf_setup_pfsync_matching(struct pf_ruleset *rs)
{
MD5_CTX ctx;
struct pf_rule *rule;
int rs_cnt;
u_int8_t digest[PF_MD5_DIGEST_LENGTH];
MD5Init(&ctx);
for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) {
/* XXX PF_RULESET_SCRUB as well? */
if (rs_cnt == PF_RULESET_SCRUB)
continue;
if (rs->rules[rs_cnt].inactive.ptr_array)
free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
rs->rules[rs_cnt].inactive.ptr_array = NULL;
if (rs->rules[rs_cnt].inactive.rcount) {
rs->rules[rs_cnt].inactive.ptr_array =
malloc(sizeof(caddr_t) *
rs->rules[rs_cnt].inactive.rcount,
M_TEMP, M_NOWAIT);
if (!rs->rules[rs_cnt].inactive.ptr_array)
return (ENOMEM);
}
TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
entries) {
pf_hash_rule(&ctx, rule);
(rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
}
}
MD5Final(digest, &ctx);
memcpy(pf_status.pf_chksum, digest, sizeof(pf_status.pf_chksum));
return (0);
}
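/*
* pfioctl() is the /dev/pf ioctl entry point.  The two switches at
* the top whitelist the commands allowed at raised securelevel and
* on descriptors opened without FWRITE; everything else runs under
* the consistency lock (shared for readers, exclusive for writers)
* and, on FreeBSD, PF_LOCK().
*
* Minimal userland usage sketch (illustrative only; assumes
* <fcntl.h>, <sys/ioctl.h> and <net/pfvar.h>):
*
*	int fd = open("/dev/pf", O_RDONLY);
*	struct pf_status st;
*	if (fd >= 0 && ioctl(fd, DIOCGETSTATUS, &st) == 0)
*		printf("pf running: %u\n", st.running);
*/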
int
#ifdef __FreeBSD__
pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
#else
pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
#endif
{
struct pf_pooladdr *pa = NULL;
struct pf_pool *pool = NULL;
#ifndef __FreeBSD__
int s;
#endif
int error = 0;
/* XXX keep in sync with switch() below */
#ifdef __FreeBSD__
if (securelevel_gt(td->td_ucred, 2))
#else
if (securelevel > 1)
#endif
switch (cmd) {
case DIOCGETRULES:
case DIOCGETRULE:
case DIOCGETADDRS:
case DIOCGETADDR:
case DIOCGETSTATE:
case DIOCSETSTATUSIF:
case DIOCGETSTATUS:
case DIOCCLRSTATUS:
case DIOCNATLOOK:
case DIOCSETDEBUG:
case DIOCGETSTATES:
case DIOCGETTIMEOUT:
case DIOCCLRRULECTRS:
case DIOCGETLIMIT:
case DIOCGETALTQS:
case DIOCGETALTQ:
case DIOCGETQSTATS:
case DIOCGETRULESETS:
case DIOCGETRULESET:
case DIOCRGETTABLES:
case DIOCRGETTSTATS:
case DIOCRCLRTSTATS:
case DIOCRCLRADDRS:
case DIOCRADDADDRS:
case DIOCRDELADDRS:
case DIOCRSETADDRS:
case DIOCRGETADDRS:
case DIOCRGETASTATS:
case DIOCRCLRASTATS:
case DIOCRTSTADDRS:
case DIOCOSFPGET:
case DIOCGETSRCNODES:
case DIOCCLRSRCNODES:
case DIOCIGETIFACES:
#ifdef __FreeBSD__
case DIOCGIFSPEED:
#endif
case DIOCSETIFFLAG:
case DIOCCLRIFFLAG:
break;
case DIOCRCLRTABLES:
case DIOCRADDTABLES:
case DIOCRDELTABLES:
case DIOCRSETTFLAGS:
if (((struct pfioc_table *)addr)->pfrio_flags &
PFR_FLAG_DUMMY)
break; /* dummy operation ok */
return (EPERM);
default:
return (EPERM);
}
if (!(flags & FWRITE))
switch (cmd) {
case DIOCGETRULES:
case DIOCGETADDRS:
case DIOCGETADDR:
case DIOCGETSTATE:
case DIOCGETSTATUS:
case DIOCGETSTATES:
case DIOCGETTIMEOUT:
case DIOCGETLIMIT:
case DIOCGETALTQS:
case DIOCGETALTQ:
case DIOCGETQSTATS:
case DIOCGETRULESETS:
case DIOCGETRULESET:
case DIOCNATLOOK:
case DIOCRGETTABLES:
case DIOCRGETTSTATS:
case DIOCRGETADDRS:
case DIOCRGETASTATS:
case DIOCRTSTADDRS:
case DIOCOSFPGET:
case DIOCGETSRCNODES:
case DIOCIGETIFACES:
#ifdef __FreeBSD__
case DIOCGIFSPEED:
#endif
break;
case DIOCRCLRTABLES:
case DIOCRADDTABLES:
case DIOCRDELTABLES:
case DIOCRCLRTSTATS:
case DIOCRCLRADDRS:
case DIOCRADDADDRS:
case DIOCRDELADDRS:
case DIOCRSETADDRS:
case DIOCRSETTFLAGS:
if (((struct pfioc_table *)addr)->pfrio_flags &
PFR_FLAG_DUMMY) {
flags |= FWRITE; /* need write lock for dummy */
break; /* dummy operation ok */
}
return (EACCES);
case DIOCGETRULE:
if (((struct pfioc_rule *)addr)->action == PF_GET_CLR_CNTR)
return (EACCES);
break;
default:
return (EACCES);
}
if (flags & FWRITE)
#ifdef __FreeBSD__
sx_xlock(&pf_consistency_lock);
else
sx_slock(&pf_consistency_lock);
#else
rw_enter_write(&pf_consistency_lock);
else
rw_enter_read(&pf_consistency_lock);
#endif
#ifdef __FreeBSD__
PF_LOCK();
#else
s = splsoftnet();
#endif
switch (cmd) {
case DIOCSTART:
if (pf_status.running)
error = EEXIST;
else {
#ifdef __FreeBSD__
PF_UNLOCK();
error = hook_pf();
PF_LOCK();
if (error) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: pfil registeration fail\n"));
break;
}
#endif
pf_status.running = 1;
pf_status.since = time_second;
if (pf_status.stateid == 0) {
pf_status.stateid = time_second;
pf_status.stateid = pf_status.stateid << 32;
}
DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n"));
}
break;
case DIOCSTOP:
if (!pf_status.running)
error = ENOENT;
else {
pf_status.running = 0;
#ifdef __FreeBSD__
PF_UNLOCK();
error = dehook_pf();
PF_LOCK();
if (error) {
pf_status.running = 1;
DPFPRINTF(PF_DEBUG_MISC,
("pf: pfil unregisteration failed\n"));
}
#endif
pf_status.since = time_second;
DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
}
break;
case DIOCADDRULE: {
struct pfioc_rule *pr = (struct pfioc_rule *)addr;
struct pf_ruleset *ruleset;
struct pf_rule *rule, *tail;
struct pf_pooladdr *pa;
int rs_num;
pr->anchor[sizeof(pr->anchor) - 1] = 0;
ruleset = pf_find_ruleset(pr->anchor);
if (ruleset == NULL) {
error = EINVAL;
break;
}
rs_num = pf_get_ruleset_number(pr->rule.action);
if (rs_num >= PF_RULESET_MAX) {
error = EINVAL;
break;
}
if (pr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
error = EINVAL;
break;
}
if (pr->ticket != ruleset->rules[rs_num].inactive.ticket) {
#ifdef __FreeBSD__
DPFPRINTF(PF_DEBUG_MISC,
("ticket: %d != [%d]%d\n", pr->ticket, rs_num,
ruleset->rules[rs_num].inactive.ticket));
#endif
error = EBUSY;
break;
}
if (pr->pool_ticket != ticket_pabuf) {
#ifdef __FreeBSD__
DPFPRINTF(PF_DEBUG_MISC,
("pool_ticket: %d != %d\n", pr->pool_ticket,
ticket_pabuf));
#endif
error = EBUSY;
break;
}
rule = pool_get(&pf_rule_pl, PR_NOWAIT);
if (rule == NULL) {
error = ENOMEM;
break;
}
bcopy(&pr->rule, rule, sizeof(struct pf_rule));
#ifdef __FreeBSD__
rule->cuid = td->td_ucred->cr_ruid;
rule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
#else
rule->cuid = p->p_cred->p_ruid;
rule->cpid = p->p_pid;
#endif
rule->anchor = NULL;
rule->kif = NULL;
TAILQ_INIT(&rule->rpool.list);
/* initialize refcounting */
rule->states = 0;
rule->src_nodes = 0;
rule->entries.tqe_prev = NULL;
#ifndef INET
if (rule->af == AF_INET) {
pool_put(&pf_rule_pl, rule);
error = EAFNOSUPPORT;
break;
}
#endif /* INET */
#ifndef INET6
if (rule->af == AF_INET6) {
pool_put(&pf_rule_pl, rule);
error = EAFNOSUPPORT;
break;
}
#endif /* INET6 */
tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
pf_rulequeue);
if (tail)
rule->nr = tail->nr + 1;
else
rule->nr = 0;
if (rule->ifname[0]) {
rule->kif = pfi_kif_get(rule->ifname);
if (rule->kif == NULL) {
pool_put(&pf_rule_pl, rule);
error = EINVAL;
break;
}
pfi_kif_ref(rule->kif, PFI_KIF_REF_RULE);
}
#ifdef __FreeBSD__ /* ROUTING */
if (rule->rtableid > 0 && rule->rtableid > rt_numfibs)
#else
if (rule->rtableid > 0 && !rtable_exists(rule->rtableid))
#endif
error = EBUSY;
#ifdef ALTQ
/* set queue IDs */
if (rule->qname[0] != 0) {
if ((rule->qid = pf_qname2qid(rule->qname)) == 0)
error = EBUSY;
else if (rule->pqname[0] != 0) {
if ((rule->pqid =
pf_qname2qid(rule->pqname)) == 0)
error = EBUSY;
} else
rule->pqid = rule->qid;
}
#endif
if (rule->tagname[0])
if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0)
error = EBUSY;
if (rule->match_tagname[0])
if ((rule->match_tag =
pf_tagname2tag(rule->match_tagname)) == 0)
error = EBUSY;
if (rule->rt && !rule->direction)
error = EINVAL;
#if NPFLOG > 0
#ifdef __FreeBSD__
if (!rule->log)
rule->logif = 0;
#endif
if (rule->logif >= PFLOGIFS_MAX)
error = EINVAL;
#endif
if (pf_rtlabel_add(&rule->src.addr) ||
pf_rtlabel_add(&rule->dst.addr))
error = EBUSY;
if (pfi_dynaddr_setup(&rule->src.addr, rule->af))
error = EINVAL;
if (pfi_dynaddr_setup(&rule->dst.addr, rule->af))
error = EINVAL;
if (pf_tbladdr_setup(ruleset, &rule->src.addr))
error = EINVAL;
if (pf_tbladdr_setup(ruleset, &rule->dst.addr))
error = EINVAL;
if (pf_anchor_setup(rule, ruleset, pr->anchor_call))
error = EINVAL;
TAILQ_FOREACH(pa, &pf_pabuf, entries)
if (pf_tbladdr_setup(ruleset, &pa->addr))
error = EINVAL;
if (rule->overload_tblname[0]) {
if ((rule->overload_tbl = pfr_attach_table(ruleset,
rule->overload_tblname)) == NULL)
error = EINVAL;
else
rule->overload_tbl->pfrkt_flags |=
PFR_TFLAG_ACTIVE;
}
pf_mv_pool(&pf_pabuf, &rule->rpool.list);
if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) ||
(rule->action == PF_BINAT)) && rule->anchor == NULL) ||
(rule->rt > PF_FASTROUTE)) &&
(TAILQ_FIRST(&rule->rpool.list) == NULL))
error = EINVAL;
if (error) {
pf_rm_rule(NULL, rule);
break;
}
#ifdef __FreeBSD__
if (!debug_pfugidhack && (rule->uid.op || rule->gid.op ||
rule->log & PF_LOG_SOCKET_LOOKUP)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: debug.pfugidhack enabled\n"));
debug_pfugidhack = 1;
}
#endif
rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list);
rule->evaluations = rule->packets[0] = rule->packets[1] =
rule->bytes[0] = rule->bytes[1] = 0;
TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr,
rule, entries);
ruleset->rules[rs_num].inactive.rcount++;
break;
}
case DIOCGETRULES: {
struct pfioc_rule *pr = (struct pfioc_rule *)addr;
struct pf_ruleset *ruleset;
struct pf_rule *tail;
int rs_num;
pr->anchor[sizeof(pr->anchor) - 1] = 0;
ruleset = pf_find_ruleset(pr->anchor);
if (ruleset == NULL) {
error = EINVAL;
break;
}
rs_num = pf_get_ruleset_number(pr->rule.action);
if (rs_num >= PF_RULESET_MAX) {
error = EINVAL;
break;
}
tail = TAILQ_LAST(ruleset->rules[rs_num].active.ptr,
pf_rulequeue);
if (tail)
pr->nr = tail->nr + 1;
else
pr->nr = 0;
pr->ticket = ruleset->rules[rs_num].active.ticket;
break;
}
case DIOCGETRULE: {
struct pfioc_rule *pr = (struct pfioc_rule *)addr;
struct pf_ruleset *ruleset;
struct pf_rule *rule;
int rs_num, i;
pr->anchor[sizeof(pr->anchor) - 1] = 0;
ruleset = pf_find_ruleset(pr->anchor);
if (ruleset == NULL) {
error = EINVAL;
break;
}
rs_num = pf_get_ruleset_number(pr->rule.action);
if (rs_num >= PF_RULESET_MAX) {
error = EINVAL;
break;
}
if (pr->ticket != ruleset->rules[rs_num].active.ticket) {
error = EBUSY;
break;
}
rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
while ((rule != NULL) && (rule->nr != pr->nr))
rule = TAILQ_NEXT(rule, entries);
if (rule == NULL) {
error = EBUSY;
break;
}
bcopy(rule, &pr->rule, sizeof(struct pf_rule));
if (pf_anchor_copyout(ruleset, rule, pr)) {
error = EBUSY;
break;
}
pfi_dynaddr_copyout(&pr->rule.src.addr);
pfi_dynaddr_copyout(&pr->rule.dst.addr);
pf_tbladdr_copyout(&pr->rule.src.addr);
pf_tbladdr_copyout(&pr->rule.dst.addr);
pf_rtlabel_copyout(&pr->rule.src.addr);
pf_rtlabel_copyout(&pr->rule.dst.addr);
for (i = 0; i < PF_SKIP_COUNT; ++i)
if (rule->skip[i].ptr == NULL)
pr->rule.skip[i].nr = -1;
else
pr->rule.skip[i].nr =
rule->skip[i].ptr->nr;
if (pr->action == PF_GET_CLR_CNTR) {
rule->evaluations = 0;
rule->packets[0] = rule->packets[1] = 0;
rule->bytes[0] = rule->bytes[1] = 0;
}
break;
}
case DIOCCHANGERULE: {
struct pfioc_rule *pcr = (struct pfioc_rule *)addr;
struct pf_ruleset *ruleset;
struct pf_rule *oldrule = NULL, *newrule = NULL;
u_int32_t nr = 0;
int rs_num;
if (!(pcr->action == PF_CHANGE_REMOVE ||
pcr->action == PF_CHANGE_GET_TICKET) &&
pcr->pool_ticket != ticket_pabuf) {
error = EBUSY;
break;
}
if (pcr->action < PF_CHANGE_ADD_HEAD ||
pcr->action > PF_CHANGE_GET_TICKET) {
error = EINVAL;
break;
}
ruleset = pf_find_ruleset(pcr->anchor);
if (ruleset == NULL) {
error = EINVAL;
break;
}
rs_num = pf_get_ruleset_number(pcr->rule.action);
if (rs_num >= PF_RULESET_MAX) {
error = EINVAL;
break;
}
if (pcr->action == PF_CHANGE_GET_TICKET) {
pcr->ticket = ++ruleset->rules[rs_num].active.ticket;
break;
} else {
if (pcr->ticket !=
ruleset->rules[rs_num].active.ticket) {
error = EINVAL;
break;
}
if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
error = EINVAL;
break;
}
}
if (pcr->action != PF_CHANGE_REMOVE) {
newrule = pool_get(&pf_rule_pl, PR_NOWAIT);
if (newrule == NULL) {
error = ENOMEM;
break;
}
bcopy(&pcr->rule, newrule, sizeof(struct pf_rule));
#ifdef __FreeBSD__
newrule->cuid = td->td_ucred->cr_ruid;
newrule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
#else
newrule->cuid = p->p_cred->p_ruid;
newrule->cpid = p->p_pid;
#endif
TAILQ_INIT(&newrule->rpool.list);
/* initialize refcounting */
newrule->states = 0;
newrule->entries.tqe_prev = NULL;
#ifndef INET
if (newrule->af == AF_INET) {
pool_put(&pf_rule_pl, newrule);
error = EAFNOSUPPORT;
break;
}
#endif /* INET */
#ifndef INET6
if (newrule->af == AF_INET6) {
pool_put(&pf_rule_pl, newrule);
error = EAFNOSUPPORT;
break;
}
#endif /* INET6 */
if (newrule->ifname[0]) {
newrule->kif = pfi_kif_get(newrule->ifname);
if (newrule->kif == NULL) {
pool_put(&pf_rule_pl, newrule);
error = EINVAL;
break;
}
pfi_kif_ref(newrule->kif, PFI_KIF_REF_RULE);
} else
newrule->kif = NULL;
if (newrule->rtableid > 0 &&
#ifdef __FreeBSD__ /* ROUTING */
newrule->rtableid > rt_numfibs)
#else
!rtable_exists(newrule->rtableid))
#endif
error = EBUSY;
#ifdef ALTQ
/* set queue IDs */
if (newrule->qname[0] != 0) {
if ((newrule->qid =
pf_qname2qid(newrule->qname)) == 0)
error = EBUSY;
else if (newrule->pqname[0] != 0) {
if ((newrule->pqid =
pf_qname2qid(newrule->pqname)) == 0)
error = EBUSY;
} else
newrule->pqid = newrule->qid;
}
#endif /* ALTQ */
if (newrule->tagname[0])
if ((newrule->tag =
pf_tagname2tag(newrule->tagname)) == 0)
error = EBUSY;
if (newrule->match_tagname[0])
if ((newrule->match_tag = pf_tagname2tag(
newrule->match_tagname)) == 0)
error = EBUSY;
if (newrule->rt && !newrule->direction)
error = EINVAL;
#ifdef __FreeBSD__
#if NPFLOG > 0
if (!newrule->log)
newrule->logif = 0;
if (newrule->logif >= PFLOGIFS_MAX)
error = EINVAL;
#endif
#endif
if (pf_rtlabel_add(&newrule->src.addr) ||
pf_rtlabel_add(&newrule->dst.addr))
error = EBUSY;
if (pfi_dynaddr_setup(&newrule->src.addr, newrule->af))
error = EINVAL;
if (pfi_dynaddr_setup(&newrule->dst.addr, newrule->af))
error = EINVAL;
if (pf_tbladdr_setup(ruleset, &newrule->src.addr))
error = EINVAL;
if (pf_tbladdr_setup(ruleset, &newrule->dst.addr))
error = EINVAL;
if (pf_anchor_setup(newrule, ruleset, pcr->anchor_call))
error = EINVAL;
TAILQ_FOREACH(pa, &pf_pabuf, entries)
if (pf_tbladdr_setup(ruleset, &pa->addr))
error = EINVAL;
if (newrule->overload_tblname[0]) {
if ((newrule->overload_tbl = pfr_attach_table(
ruleset, newrule->overload_tblname)) ==
NULL)
error = EINVAL;
else
newrule->overload_tbl->pfrkt_flags |=
PFR_TFLAG_ACTIVE;
}
pf_mv_pool(&pf_pabuf, &newrule->rpool.list);
if (((((newrule->action == PF_NAT) ||
(newrule->action == PF_RDR) ||
(newrule->action == PF_BINAT) ||
(newrule->rt > PF_FASTROUTE)) &&
!newrule->anchor)) &&
(TAILQ_FIRST(&newrule->rpool.list) == NULL))
error = EINVAL;
if (error) {
pf_rm_rule(NULL, newrule);
break;
}
#ifdef __FreeBSD__
if (!debug_pfugidhack && (newrule->uid.op ||
newrule->gid.op ||
newrule->log & PF_LOG_SOCKET_LOOKUP)) {
DPFPRINTF(PF_DEBUG_MISC,
("pf: debug.pfugidhack enabled\n"));
debug_pfugidhack = 1;
}
#endif
newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list);
newrule->evaluations = 0;
newrule->packets[0] = newrule->packets[1] = 0;
newrule->bytes[0] = newrule->bytes[1] = 0;
}
pf_empty_pool(&pf_pabuf);
if (pcr->action == PF_CHANGE_ADD_HEAD)
oldrule = TAILQ_FIRST(
ruleset->rules[rs_num].active.ptr);
else if (pcr->action == PF_CHANGE_ADD_TAIL)
oldrule = TAILQ_LAST(
ruleset->rules[rs_num].active.ptr, pf_rulequeue);
else {
oldrule = TAILQ_FIRST(
ruleset->rules[rs_num].active.ptr);
while ((oldrule != NULL) && (oldrule->nr != pcr->nr))
oldrule = TAILQ_NEXT(oldrule, entries);
if (oldrule == NULL) {
if (newrule != NULL)
pf_rm_rule(NULL, newrule);
error = EINVAL;
break;
}
}
if (pcr->action == PF_CHANGE_REMOVE) {
pf_rm_rule(ruleset->rules[rs_num].active.ptr, oldrule);
ruleset->rules[rs_num].active.rcount--;
} else {
if (oldrule == NULL)
TAILQ_INSERT_TAIL(
ruleset->rules[rs_num].active.ptr,
newrule, entries);
else if (pcr->action == PF_CHANGE_ADD_HEAD ||
pcr->action == PF_CHANGE_ADD_BEFORE)
TAILQ_INSERT_BEFORE(oldrule, newrule, entries);
else
TAILQ_INSERT_AFTER(
ruleset->rules[rs_num].active.ptr,
oldrule, newrule, entries);
ruleset->rules[rs_num].active.rcount++;
}
nr = 0;
TAILQ_FOREACH(oldrule,
ruleset->rules[rs_num].active.ptr, entries)
oldrule->nr = nr++;
ruleset->rules[rs_num].active.ticket++;
pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr);
pf_remove_if_empty_ruleset(ruleset);
break;
}
case DIOCCLRSTATES: {
struct pf_state *state, *nexts;
struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
int killed = 0;
for (state = RB_MIN(pf_state_tree_id, &tree_id); state;
state = nexts) {
nexts = RB_NEXT(pf_state_tree_id, &tree_id, state);
if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
state->u.s.kif->pfik_name)) {
#if NPFSYNC
/* don't send out individual delete messages */
state->sync_flags = PFSTATE_NOSYNC;
#endif
pf_unlink_state(state);
killed++;
}
}
psk->psk_af = killed;
#if NPFSYNC
pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
#endif
break;
}
case DIOCKILLSTATES: {
struct pf_state *state, *nexts;
struct pf_state_host *src, *dst;
struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
int killed = 0;
for (state = RB_MIN(pf_state_tree_id, &tree_id); state;
state = nexts) {
nexts = RB_NEXT(pf_state_tree_id, &tree_id, state);
if (state->direction == PF_OUT) {
src = &state->lan;
dst = &state->ext;
} else {
src = &state->ext;
dst = &state->lan;
}
if ((!psk->psk_af || state->af == psk->psk_af)
&& (!psk->psk_proto || psk->psk_proto ==
state->proto) &&
PF_MATCHA(psk->psk_src.neg,
&psk->psk_src.addr.v.a.addr,
&psk->psk_src.addr.v.a.mask,
&src->addr, state->af) &&
PF_MATCHA(psk->psk_dst.neg,
&psk->psk_dst.addr.v.a.addr,
&psk->psk_dst.addr.v.a.mask,
&dst->addr, state->af) &&
(psk->psk_src.port_op == 0 ||
pf_match_port(psk->psk_src.port_op,
psk->psk_src.port[0], psk->psk_src.port[1],
src->port)) &&
(psk->psk_dst.port_op == 0 ||
pf_match_port(psk->psk_dst.port_op,
psk->psk_dst.port[0], psk->psk_dst.port[1],
dst->port)) &&
(!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
state->u.s.kif->pfik_name))) {
#if NPFSYNC > 0
/* send immediate delete of state */
pfsync_delete_state(state);
state->sync_flags |= PFSTATE_NOSYNC;
#endif
pf_unlink_state(state);
killed++;
}
}
psk->psk_af = killed;
break;
}
case DIOCADDSTATE: {
struct pfioc_state *ps = (struct pfioc_state *)addr;
struct pf_state *state;
struct pfi_kif *kif;
if (ps->state.timeout >= PFTM_MAX &&
ps->state.timeout != PFTM_UNTIL_PACKET) {
error = EINVAL;
break;
}
state = pool_get(&pf_state_pl, PR_NOWAIT);
if (state == NULL) {
error = ENOMEM;
break;
}
kif = pfi_kif_get(ps->state.u.ifname);
if (kif == NULL) {
pool_put(&pf_state_pl, state);
error = ENOENT;
break;
}
bcopy(&ps->state, state, sizeof(struct pf_state));
bzero(&state->u, sizeof(state->u));
state->rule.ptr = &pf_default_rule;
state->nat_rule.ptr = NULL;
state->anchor.ptr = NULL;
state->rt_kif = NULL;
state->creation = time_second;
state->pfsync_time = 0;
state->packets[0] = state->packets[1] = 0;
state->bytes[0] = state->bytes[1] = 0;
if (pf_insert_state(kif, state)) {
pfi_kif_unref(kif, PFI_KIF_REF_NONE);
pool_put(&pf_state_pl, state);
error = ENOMEM;
}
break;
}
case DIOCGETSTATE: {
struct pfioc_state *ps = (struct pfioc_state *)addr;
struct pf_state *state;
u_int32_t nr;
int secs;
nr = 0;
RB_FOREACH(state, pf_state_tree_id, &tree_id) {
if (nr >= ps->nr)
break;
nr++;
}
if (state == NULL) {
error = EBUSY;
break;
}
secs = time_second;
bcopy(state, &ps->state, sizeof(ps->state));
strlcpy(ps->state.u.ifname, state->u.s.kif->pfik_name,
sizeof(ps->state.u.ifname));
ps->state.rule.nr = state->rule.ptr->nr;
ps->state.nat_rule.nr = (state->nat_rule.ptr == NULL) ?
-1 : state->nat_rule.ptr->nr;
ps->state.anchor.nr = (state->anchor.ptr == NULL) ?
-1 : state->anchor.ptr->nr;
ps->state.creation = secs - ps->state.creation;
ps->state.expire = pf_state_expires(state);
if (ps->state.expire > secs)
ps->state.expire -= secs;
else
ps->state.expire = 0;
break;
}
case DIOCGETSTATES: {
struct pfioc_states *ps = (struct pfioc_states *)addr;
struct pf_state *state;
struct pf_state *p, *pstore;
u_int32_t nr = 0;
int space = ps->ps_len;
if (space == 0) {
nr = pf_status.states;
ps->ps_len = sizeof(struct pf_state) * nr;
break;
}
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
#ifdef __FreeBSD__
PF_LOCK();
#endif
p = ps->ps_states;
state = TAILQ_FIRST(&state_list);
while (state) {
if (state->timeout != PFTM_UNLINKED) {
int secs = time_second;
if ((nr+1) * sizeof(*p) > (unsigned)ps->ps_len)
break;
bcopy(state, pstore, sizeof(*pstore));
strlcpy(pstore->u.ifname,
state->u.s.kif->pfik_name,
sizeof(pstore->u.ifname));
pstore->rule.nr = state->rule.ptr->nr;
pstore->nat_rule.nr = (state->nat_rule.ptr ==
NULL) ? -1 : state->nat_rule.ptr->nr;
pstore->anchor.nr = (state->anchor.ptr ==
NULL) ? -1 : state->anchor.ptr->nr;
pstore->creation = secs - pstore->creation;
pstore->expire = pf_state_expires(state);
if (pstore->expire > secs)
pstore->expire -= secs;
else
pstore->expire = 0;
#ifdef __FreeBSD__
PF_COPYOUT(pstore, p, sizeof(*p), error);
#else
error = copyout(pstore, p, sizeof(*p));
#endif
if (error) {
free(pstore, M_TEMP);
goto fail;
}
p++;
nr++;
}
state = TAILQ_NEXT(state, u.s.entry_list);
}
ps->ps_len = sizeof(struct pf_state) * nr;
free(pstore, M_TEMP);
break;
}
case DIOCGETSTATUS: {
struct pf_status *s = (struct pf_status *)addr;
bcopy(&pf_status, s, sizeof(struct pf_status));
pfi_fill_oldstatus(s);
break;
}
case DIOCSETSTATUSIF: {
struct pfioc_if *pi = (struct pfioc_if *)addr;
if (pi->ifname[0] == 0) {
bzero(pf_status.ifname, IFNAMSIZ);
break;
}
if (ifunit(pi->ifname) == NULL) {
error = EINVAL;
break;
}
strlcpy(pf_status.ifname, pi->ifname, IFNAMSIZ);
break;
}
case DIOCCLRSTATUS: {
bzero(pf_status.counters, sizeof(pf_status.counters));
bzero(pf_status.fcounters, sizeof(pf_status.fcounters));
bzero(pf_status.scounters, sizeof(pf_status.scounters));
pf_status.since = time_second;
if (*pf_status.ifname)
pfi_clr_istats(pf_status.ifname);
break;
}
case DIOCNATLOOK: {
struct pfioc_natlook *pnl = (struct pfioc_natlook *)addr;
struct pf_state *state;
struct pf_state_cmp key;
int m = 0, direction = pnl->direction;
key.af = pnl->af;
key.proto = pnl->proto;
if (!pnl->proto ||
PF_AZERO(&pnl->saddr, pnl->af) ||
PF_AZERO(&pnl->daddr, pnl->af) ||
((pnl->proto == IPPROTO_TCP ||
pnl->proto == IPPROTO_UDP) &&
(!pnl->dport || !pnl->sport)))
error = EINVAL;
else {
/*
* userland gives us source and dest of connection,
* reverse the lookup so we ask for what happens with
* the return traffic, enabling us to find it in the
* state tree.
*/
if (direction == PF_IN) {
PF_ACPY(&key.ext.addr, &pnl->daddr, pnl->af);
key.ext.port = pnl->dport;
PF_ACPY(&key.gwy.addr, &pnl->saddr, pnl->af);
key.gwy.port = pnl->sport;
state = pf_find_state_all(&key, PF_EXT_GWY, &m);
} else {
PF_ACPY(&key.lan.addr, &pnl->daddr, pnl->af);
key.lan.port = pnl->dport;
PF_ACPY(&key.ext.addr, &pnl->saddr, pnl->af);
key.ext.port = pnl->sport;
state = pf_find_state_all(&key, PF_LAN_EXT, &m);
}
if (m > 1)
error = E2BIG; /* more than one state */
else if (state != NULL) {
if (direction == PF_IN) {
PF_ACPY(&pnl->rsaddr, &state->lan.addr,
state->af);
pnl->rsport = state->lan.port;
PF_ACPY(&pnl->rdaddr, &pnl->daddr,
pnl->af);
pnl->rdport = pnl->dport;
} else {
PF_ACPY(&pnl->rdaddr, &state->gwy.addr,
state->af);
pnl->rdport = state->gwy.port;
PF_ACPY(&pnl->rsaddr, &pnl->saddr,
pnl->af);
pnl->rsport = pnl->sport;
}
} else
error = ENOENT;
}
break;
}
case DIOCSETTIMEOUT: {
struct pfioc_tm *pt = (struct pfioc_tm *)addr;
int old;
if (pt->timeout < 0 || pt->timeout >= PFTM_MAX ||
pt->seconds < 0) {
error = EINVAL;
goto fail;
}
old = pf_default_rule.timeout[pt->timeout];
if (pt->timeout == PFTM_INTERVAL && pt->seconds == 0)
pt->seconds = 1;
pf_default_rule.timeout[pt->timeout] = pt->seconds;
if (pt->timeout == PFTM_INTERVAL && pt->seconds < old)
wakeup(pf_purge_thread);
pt->seconds = old;
break;
}
case DIOCGETTIMEOUT: {
struct pfioc_tm *pt = (struct pfioc_tm *)addr;
if (pt->timeout < 0 || pt->timeout >= PFTM_MAX) {
error = EINVAL;
goto fail;
}
pt->seconds = pf_default_rule.timeout[pt->timeout];
break;
}
case DIOCGETLIMIT: {
struct pfioc_limit *pl = (struct pfioc_limit *)addr;
if (pl->index < 0 || pl->index >= PF_LIMIT_MAX) {
error = EINVAL;
goto fail;
}
pl->limit = pf_pool_limits[pl->index].limit;
break;
}
case DIOCSETLIMIT: {
struct pfioc_limit *pl = (struct pfioc_limit *)addr;
int old_limit;
if (pl->index < 0 || pl->index >= PF_LIMIT_MAX ||
pf_pool_limits[pl->index].pp == NULL) {
error = EINVAL;
goto fail;
}
#ifdef __FreeBSD__
uma_zone_set_max(pf_pool_limits[pl->index].pp, pl->limit);
#else
if (pool_sethardlimit(pf_pool_limits[pl->index].pp,
pl->limit, NULL, 0) != 0) {
error = EBUSY;
goto fail;
}
#endif
old_limit = pf_pool_limits[pl->index].limit;
pf_pool_limits[pl->index].limit = pl->limit;
pl->limit = old_limit;
break;
}
case DIOCSETDEBUG: {
u_int32_t *level = (u_int32_t *)addr;
pf_status.debug = *level;
break;
}
case DIOCCLRRULECTRS: {
/* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */
struct pf_ruleset *ruleset = &pf_main_ruleset;
struct pf_rule *rule;
TAILQ_FOREACH(rule,
ruleset->rules[PF_RULESET_FILTER].active.ptr, entries) {
rule->evaluations = 0;
rule->packets[0] = rule->packets[1] = 0;
rule->bytes[0] = rule->bytes[1] = 0;
}
break;
}
#ifdef __FreeBSD__
case DIOCGIFSPEED: {
struct pf_ifspeed *psp = (struct pf_ifspeed *)addr;
struct pf_ifspeed ps;
struct ifnet *ifp;
if (psp->ifname[0] != 0) {
/* Can we completely trust user-land? */
strlcpy(ps.ifname, psp->ifname, IFNAMSIZ);
ifp = ifunit(ps.ifname);
if (ifp != NULL)
psp->baudrate = ifp->if_baudrate;
else
error = EINVAL;
} else
error = EINVAL;
break;
}
#endif /* __FreeBSD__ */
#ifdef ALTQ
case DIOCSTARTALTQ: {
struct pf_altq *altq;
/* enable all altq interfaces on active list */
TAILQ_FOREACH(altq, pf_altqs_active, entries) {
#ifdef __FreeBSD__
if (altq->qname[0] == 0 && (altq->local_flags &
PFALTQ_FLAG_IF_REMOVED) == 0) {
#else
if (altq->qname[0] == 0) {
#endif
error = pf_enable_altq(altq);
if (error != 0)
break;
}
}
if (error == 0)
pf_altq_running = 1;
DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n"));
break;
}
case DIOCSTOPALTQ: {
struct pf_altq *altq;
/* disable all altq interfaces on active list */
TAILQ_FOREACH(altq, pf_altqs_active, entries) {
#ifdef __FreeBSD__
if (altq->qname[0] == 0 && (altq->local_flags &
PFALTQ_FLAG_IF_REMOVED) == 0) {
#else
if (altq->qname[0] == 0) {
#endif
error = pf_disable_altq(altq);
if (error != 0)
break;
}
}
if (error == 0)
pf_altq_running = 0;
DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n"));
break;
}
case DIOCADDALTQ: {
struct pfioc_altq *pa = (struct pfioc_altq *)addr;
struct pf_altq *altq, *a;
if (pa->ticket != ticket_altqs_inactive) {
error = EBUSY;
break;
}
altq = pool_get(&pf_altq_pl, PR_NOWAIT);
if (altq == NULL) {
error = ENOMEM;
break;
}
bcopy(&pa->altq, altq, sizeof(struct pf_altq));
#ifdef __FreeBSD__
altq->local_flags = 0;
#endif
/*
* if this is for a queue, find the discipline and
* copy the necessary fields
*/
if (altq->qname[0] != 0) {
if ((altq->qid = pf_qname2qid(altq->qname)) == 0) {
error = EBUSY;
pool_put(&pf_altq_pl, altq);
break;
}
altq->altq_disc = NULL;
TAILQ_FOREACH(a, pf_altqs_inactive, entries) {
if (strncmp(a->ifname, altq->ifname,
IFNAMSIZ) == 0 && a->qname[0] == 0) {
altq->altq_disc = a->altq_disc;
break;
}
}
}
#ifdef __FreeBSD__
struct ifnet *ifp;
if ((ifp = ifunit(altq->ifname)) == NULL) {
altq->local_flags |= PFALTQ_FLAG_IF_REMOVED;
} else {
PF_UNLOCK();
#endif
error = altq_add(altq);
#ifdef __FreeBSD__
PF_LOCK();
}
#endif
if (error) {
pool_put(&pf_altq_pl, altq);
break;
}
TAILQ_INSERT_TAIL(pf_altqs_inactive, altq, entries);
bcopy(altq, &pa->altq, sizeof(struct pf_altq));
break;
}
case DIOCGETALTQS: {
struct pfioc_altq *pa = (struct pfioc_altq *)addr;
struct pf_altq *altq;
pa->nr = 0;
TAILQ_FOREACH(altq, pf_altqs_active, entries)
pa->nr++;
pa->ticket = ticket_altqs_active;
break;
}
case DIOCGETALTQ: {
struct pfioc_altq *pa = (struct pfioc_altq *)addr;
struct pf_altq *altq;
u_int32_t nr;
if (pa->ticket != ticket_altqs_active) {
error = EBUSY;
break;
}
nr = 0;
altq = TAILQ_FIRST(pf_altqs_active);
while ((altq != NULL) && (nr < pa->nr)) {
altq = TAILQ_NEXT(altq, entries);
nr++;
}
if (altq == NULL) {
error = EBUSY;
break;
}
bcopy(altq, &pa->altq, sizeof(struct pf_altq));
break;
}
case DIOCCHANGEALTQ:
/* CHANGEALTQ not supported yet! */
error = ENODEV;
break;
case DIOCGETQSTATS: {
struct pfioc_qstats *pq = (struct pfioc_qstats *)addr;
struct pf_altq *altq;
u_int32_t nr;
int nbytes;
if (pq->ticket != ticket_altqs_active) {
error = EBUSY;
break;
}
nbytes = pq->nbytes;
nr = 0;
altq = TAILQ_FIRST(pf_altqs_active);
while ((altq != NULL) && (nr < pq->nr)) {
altq = TAILQ_NEXT(altq, entries);
nr++;
}
if (altq == NULL) {
error = EBUSY;
break;
}
#ifdef __FreeBSD__
if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
error = ENXIO;
break;
}
PF_UNLOCK();
#endif
error = altq_getqstats(altq, pq->buf, &nbytes);
#ifdef __FreeBSD__
PF_LOCK();
#endif
if (error == 0) {
pq->scheduler = altq->scheduler;
pq->nbytes = nbytes;
}
break;
}
#endif /* ALTQ */
case DIOCBEGINADDRS: {
struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
pf_empty_pool(&pf_pabuf);
pp->ticket = ++ticket_pabuf;
break;
}
case DIOCADDADDR: {
struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
if (pp->ticket != ticket_pabuf) {
error = EBUSY;
break;
}
#ifndef INET
if (pp->af == AF_INET) {
error = EAFNOSUPPORT;
break;
}
#endif /* INET */
#ifndef INET6
if (pp->af == AF_INET6) {
error = EAFNOSUPPORT;
break;
}
#endif /* INET6 */
if (pp->addr.addr.type != PF_ADDR_ADDRMASK &&
pp->addr.addr.type != PF_ADDR_DYNIFTL &&
pp->addr.addr.type != PF_ADDR_TABLE) {
error = EINVAL;
break;
}
pa = pool_get(&pf_pooladdr_pl, PR_NOWAIT);
if (pa == NULL) {
error = ENOMEM;
break;
}
bcopy(&pp->addr, pa, sizeof(struct pf_pooladdr));
if (pa->ifname[0]) {
pa->kif = pfi_kif_get(pa->ifname);
if (pa->kif == NULL) {
pool_put(&pf_pooladdr_pl, pa);
error = EINVAL;
break;
}
pfi_kif_ref(pa->kif, PFI_KIF_REF_RULE);
}
if (pfi_dynaddr_setup(&pa->addr, pp->af)) {
pfi_dynaddr_remove(&pa->addr);
pfi_kif_unref(pa->kif, PFI_KIF_REF_RULE);
pool_put(&pf_pooladdr_pl, pa);
error = EINVAL;
break;
}
TAILQ_INSERT_TAIL(&pf_pabuf, pa, entries);
break;
}
case DIOCGETADDRS: {
struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
pp->nr = 0;
pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action,
pp->r_num, 0, 1, 0);
if (pool == NULL) {
error = EBUSY;
break;
}
TAILQ_FOREACH(pa, &pool->list, entries)
pp->nr++;
break;
}
case DIOCGETADDR: {
struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr;
u_int32_t nr = 0;
pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action,
pp->r_num, 0, 1, 1);
if (pool == NULL) {
error = EBUSY;
break;
}
pa = TAILQ_FIRST(&pool->list);
while ((pa != NULL) && (nr < pp->nr)) {
pa = TAILQ_NEXT(pa, entries);
nr++;
}
if (pa == NULL) {
error = EBUSY;
break;
}
bcopy(pa, &pp->addr, sizeof(struct pf_pooladdr));
pfi_dynaddr_copyout(&pp->addr.addr);
pf_tbladdr_copyout(&pp->addr.addr);
pf_rtlabel_copyout(&pp->addr.addr);
break;
}
case DIOCCHANGEADDR: {
struct pfioc_pooladdr *pca = (struct pfioc_pooladdr *)addr;
struct pf_pooladdr *oldpa = NULL, *newpa = NULL;
struct pf_ruleset *ruleset;
if (pca->action < PF_CHANGE_ADD_HEAD ||
pca->action > PF_CHANGE_REMOVE) {
error = EINVAL;
break;
}
if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
pca->addr.addr.type != PF_ADDR_DYNIFTL &&
pca->addr.addr.type != PF_ADDR_TABLE) {
error = EINVAL;
break;
}
ruleset = pf_find_ruleset(pca->anchor);
if (ruleset == NULL) {
error = EBUSY;
break;
}
pool = pf_get_pool(pca->anchor, pca->ticket, pca->r_action,
pca->r_num, pca->r_last, 1, 1);
if (pool == NULL) {
error = EBUSY;
break;
}
if (pca->action != PF_CHANGE_REMOVE) {
newpa = pool_get(&pf_pooladdr_pl, PR_NOWAIT);
if (newpa == NULL) {
error = ENOMEM;
break;
}
bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr));
#ifndef INET
if (pca->af == AF_INET) {
pool_put(&pf_pooladdr_pl, newpa);
error = EAFNOSUPPORT;
break;
}
#endif /* INET */
#ifndef INET6
if (pca->af == AF_INET6) {
pool_put(&pf_pooladdr_pl, newpa);
error = EAFNOSUPPORT;
break;
}
#endif /* INET6 */
if (newpa->ifname[0]) {
newpa->kif = pfi_kif_get(newpa->ifname);
if (newpa->kif == NULL) {
pool_put(&pf_pooladdr_pl, newpa);
error = EINVAL;
break;
}
pfi_kif_ref(newpa->kif, PFI_KIF_REF_RULE);
} else
newpa->kif = NULL;
if (pfi_dynaddr_setup(&newpa->addr, pca->af) ||
pf_tbladdr_setup(ruleset, &newpa->addr)) {
pfi_dynaddr_remove(&newpa->addr);
pfi_kif_unref(newpa->kif, PFI_KIF_REF_RULE);
pool_put(&pf_pooladdr_pl, newpa);
error = EINVAL;
break;
}
}
if (pca->action == PF_CHANGE_ADD_HEAD)
oldpa = TAILQ_FIRST(&pool->list);
else if (pca->action == PF_CHANGE_ADD_TAIL)
oldpa = TAILQ_LAST(&pool->list, pf_palist);
else {
int i = 0;
oldpa = TAILQ_FIRST(&pool->list);
while ((oldpa != NULL) && (i < pca->nr)) {
oldpa = TAILQ_NEXT(oldpa, entries);
i++;
}
if (oldpa == NULL) {
error = EINVAL;
break;
}
}
if (pca->action == PF_CHANGE_REMOVE) {
TAILQ_REMOVE(&pool->list, oldpa, entries);
pfi_dynaddr_remove(&oldpa->addr);
pf_tbladdr_remove(&oldpa->addr);
pfi_kif_unref(oldpa->kif, PFI_KIF_REF_RULE);
pool_put(&pf_pooladdr_pl, oldpa);
} else {
if (oldpa == NULL)
TAILQ_INSERT_TAIL(&pool->list, newpa, entries);
else if (pca->action == PF_CHANGE_ADD_HEAD ||
pca->action == PF_CHANGE_ADD_BEFORE)
TAILQ_INSERT_BEFORE(oldpa, newpa, entries);
else
TAILQ_INSERT_AFTER(&pool->list, oldpa,
newpa, entries);
}
pool->cur = TAILQ_FIRST(&pool->list);
PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr,
pca->af);
break;
}
case DIOCGETRULESETS: {
struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
struct pf_ruleset *ruleset;
struct pf_anchor *anchor;
pr->path[sizeof(pr->path) - 1] = 0;
if ((ruleset = pf_find_ruleset(pr->path)) == NULL) {
error = EINVAL;
break;
}
pr->nr = 0;
if (ruleset->anchor == NULL) {
/* XXX kludge for pf_main_ruleset */
RB_FOREACH(anchor, pf_anchor_global, &pf_anchors)
if (anchor->parent == NULL)
pr->nr++;
} else {
RB_FOREACH(anchor, pf_anchor_node,
&ruleset->anchor->children)
pr->nr++;
}
break;
}
case DIOCGETRULESET: {
struct pfioc_ruleset *pr = (struct pfioc_ruleset *)addr;
struct pf_ruleset *ruleset;
struct pf_anchor *anchor;
u_int32_t nr = 0;
pr->path[sizeof(pr->path) - 1] = 0;
if ((ruleset = pf_find_ruleset(pr->path)) == NULL) {
error = EINVAL;
break;
}
pr->name[0] = 0;
if (ruleset->anchor == NULL) {
/* XXX kludge for pf_main_ruleset */
RB_FOREACH(anchor, pf_anchor_global, &pf_anchors)
if (anchor->parent == NULL && nr++ == pr->nr) {
strlcpy(pr->name, anchor->name,
sizeof(pr->name));
break;
}
} else {
RB_FOREACH(anchor, pf_anchor_node,
&ruleset->anchor->children)
if (nr++ == pr->nr) {
strlcpy(pr->name, anchor->name,
sizeof(pr->name));
break;
}
}
if (!pr->name[0])
error = EBUSY;
break;
}
case DIOCRCLRTABLES: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != 0) {
error = ENODEV;
break;
}
error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel,
io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRADDTABLES: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
break;
}
error = pfr_add_tables(io->pfrio_buffer, io->pfrio_size,
&io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRDELTABLES: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
break;
}
error = pfr_del_tables(io->pfrio_buffer, io->pfrio_size,
&io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRGETTABLES: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
break;
}
error = pfr_get_tables(&io->pfrio_table, io->pfrio_buffer,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRGETTSTATS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
error = ENODEV;
break;
}
error = pfr_get_tstats(&io->pfrio_table, io->pfrio_buffer,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRCLRTSTATS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
break;
}
error = pfr_clr_tstats(io->pfrio_buffer, io->pfrio_size,
&io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRSETTFLAGS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_table)) {
error = ENODEV;
break;
}
error = pfr_set_tflags(io->pfrio_buffer, io->pfrio_size,
io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange,
&io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRCLRADDRS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != 0) {
error = ENODEV;
break;
}
error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel,
io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRADDADDRS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_add_addrs(&io->pfrio_table, io->pfrio_buffer,
io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags |
PFR_FLAG_USERIOCTL);
break;
}
case DIOCRDELADDRS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_del_addrs(&io->pfrio_table, io->pfrio_buffer,
io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags |
PFR_FLAG_USERIOCTL);
break;
}
case DIOCRSETADDRS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_set_addrs(&io->pfrio_table, io->pfrio_buffer,
io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd,
&io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags |
PFR_FLAG_USERIOCTL, 0);
break;
}
case DIOCRGETADDRS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_get_addrs(&io->pfrio_table, io->pfrio_buffer,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRGETASTATS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_astats)) {
error = ENODEV;
break;
}
error = pfr_get_astats(&io->pfrio_table, io->pfrio_buffer,
&io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCRCLRASTATS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_clr_astats(&io->pfrio_table, io->pfrio_buffer,
io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags |
PFR_FLAG_USERIOCTL);
break;
}
case DIOCRTSTADDRS: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_tst_addrs(&io->pfrio_table, io->pfrio_buffer,
io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags |
PFR_FLAG_USERIOCTL);
break;
}
case DIOCRINADEFINE: {
struct pfioc_table *io = (struct pfioc_table *)addr;
if (io->pfrio_esize != sizeof(struct pfr_addr)) {
error = ENODEV;
break;
}
error = pfr_ina_define(&io->pfrio_table, io->pfrio_buffer,
io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr,
io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL);
break;
}
case DIOCOSFPADD: {
struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr;
error = pf_osfp_add(io);
break;
}
case DIOCOSFPGET: {
struct pf_osfp_ioctl *io = (struct pf_osfp_ioctl *)addr;
error = pf_osfp_get(io);
break;
}
case DIOCXBEGIN: {
struct pfioc_trans *io = (struct pfioc_trans *)addr;
struct pfioc_trans_e *ioe;
struct pfr_table *table;
int i;
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
goto fail;
}
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
ioe = (struct pfioc_trans_e *)malloc(sizeof(*ioe),
M_TEMP, M_WAITOK);
table = (struct pfr_table *)malloc(sizeof(*table),
M_TEMP, M_WAITOK);
#ifdef __FreeBSD__
PF_LOCK();
#endif
for (i = 0; i < io->size; i++) {
#ifdef __FreeBSD__
PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error);
if (error) {
#else
if (copyin(io->array+i, ioe, sizeof(*ioe))) {
#endif
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EFAULT;
goto fail;
}
switch (ioe->rs_num) {
#ifdef ALTQ
case PF_RULESET_ALTQ:
if (ioe->anchor[0]) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EINVAL;
goto fail;
}
if ((error = pf_begin_altq(&ioe->ticket))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail;
}
break;
#endif /* ALTQ */
case PF_RULESET_TABLE:
bzero(table, sizeof(*table));
strlcpy(table->pfrt_anchor, ioe->anchor,
sizeof(table->pfrt_anchor));
if ((error = pfr_ina_begin(table,
&ioe->ticket, NULL, 0))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail;
}
break;
default:
if ((error = pf_begin_rules(&ioe->ticket,
ioe->rs_num, ioe->anchor))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail;
}
break;
}
#ifdef __FreeBSD__
PF_COPYOUT(ioe, io->array+i, sizeof(io->array[i]),
error);
if (error) {
#else
if (copyout(ioe, io->array+i, sizeof(io->array[i]))) {
#endif
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EFAULT;
goto fail;
}
}
free(table, M_TEMP);
free(ioe, M_TEMP);
break;
}
case DIOCXROLLBACK: {
struct pfioc_trans *io = (struct pfioc_trans *)addr;
struct pfioc_trans_e *ioe;
struct pfr_table *table;
int i;
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
goto fail;
}
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
ioe = (struct pfioc_trans_e *)malloc(sizeof(*ioe),
M_TEMP, M_WAITOK);
table = (struct pfr_table *)malloc(sizeof(*table),
M_TEMP, M_WAITOK);
#ifdef __FreeBSD__
PF_LOCK();
#endif
for (i = 0; i < io->size; i++) {
#ifdef __FreeBSD__
PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error);
if (error) {
#else
if (copyin(io->array+i, ioe, sizeof(*ioe))) {
#endif
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EFAULT;
goto fail;
}
switch (ioe->rs_num) {
#ifdef ALTQ
case PF_RULESET_ALTQ:
if (ioe->anchor[0]) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EINVAL;
goto fail;
}
if ((error = pf_rollback_altq(ioe->ticket))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail; /* really bad */
}
break;
#endif /* ALTQ */
case PF_RULESET_TABLE:
bzero(table, sizeof(*table));
strlcpy(table->pfrt_anchor, ioe->anchor,
sizeof(table->pfrt_anchor));
if ((error = pfr_ina_rollback(table,
ioe->ticket, NULL, 0))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail; /* really bad */
}
break;
default:
if ((error = pf_rollback_rules(ioe->ticket,
ioe->rs_num, ioe->anchor))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail; /* really bad */
}
break;
}
}
free(table, M_TEMP);
free(ioe, M_TEMP);
break;
}
case DIOCXCOMMIT: {
struct pfioc_trans *io = (struct pfioc_trans *)addr;
struct pfioc_trans_e *ioe;
struct pfr_table *table;
struct pf_ruleset *rs;
int i;
if (io->esize != sizeof(*ioe)) {
error = ENODEV;
goto fail;
}
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
ioe = (struct pfioc_trans_e *)malloc(sizeof(*ioe),
M_TEMP, M_WAITOK);
table = (struct pfr_table *)malloc(sizeof(*table),
M_TEMP, M_WAITOK);
#ifdef __FreeBSD__
PF_LOCK();
#endif
/* first make sure everything will succeed */
for (i = 0; i < io->size; i++) {
#ifdef __FreeBSD__
PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error);
if (error) {
#else
if (copyin(io->array+i, ioe, sizeof(*ioe))) {
#endif
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EFAULT;
goto fail;
}
switch (ioe->rs_num) {
#ifdef ALTQ
case PF_RULESET_ALTQ:
if (ioe->anchor[0]) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EINVAL;
goto fail;
}
if (!altqs_inactive_open || ioe->ticket !=
ticket_altqs_inactive) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EBUSY;
goto fail;
}
break;
#endif /* ALTQ */
case PF_RULESET_TABLE:
rs = pf_find_ruleset(ioe->anchor);
if (rs == NULL || !rs->topen || ioe->ticket !=
rs->tticket) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EBUSY;
goto fail;
}
break;
default:
if (ioe->rs_num < 0 || ioe->rs_num >=
PF_RULESET_MAX) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EINVAL;
goto fail;
}
rs = pf_find_ruleset(ioe->anchor);
if (rs == NULL ||
!rs->rules[ioe->rs_num].inactive.open ||
rs->rules[ioe->rs_num].inactive.ticket !=
ioe->ticket) {
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EBUSY;
goto fail;
}
break;
}
}
/* now do the commit - no errors should happen here */
for (i = 0; i < io->size; i++) {
#ifdef __FreeBSD__
PF_COPYIN(io->array+i, ioe, sizeof(*ioe), error);
if (error) {
#else
if (copyin(io->array+i, ioe, sizeof(*ioe))) {
#endif
free(table, M_TEMP);
free(ioe, M_TEMP);
error = EFAULT;
goto fail;
}
switch (ioe->rs_num) {
#ifdef ALTQ
case PF_RULESET_ALTQ:
if ((error = pf_commit_altq(ioe->ticket))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail; /* really bad */
}
break;
#endif /* ALTQ */
case PF_RULESET_TABLE:
bzero(table, sizeof(*table));
strlcpy(table->pfrt_anchor, ioe->anchor,
sizeof(table->pfrt_anchor));
if ((error = pfr_ina_commit(table, ioe->ticket,
NULL, NULL, 0))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail; /* really bad */
}
break;
default:
if ((error = pf_commit_rules(ioe->ticket,
ioe->rs_num, ioe->anchor))) {
free(table, M_TEMP);
free(ioe, M_TEMP);
goto fail; /* really bad */
}
break;
}
}
free(table, M_TEMP);
free(ioe, M_TEMP);
break;
}
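/*
 * Illustrative userland sketch (not part of this file) of how the
 * DIOCXBEGIN/DIOCXROLLBACK/DIOCXCOMMIT handlers above are driven as
 * a two-phase commit.  "dev" is an assumed open /dev/pf descriptor,
 * and the usual userland includes (<sys/ioctl.h>, <err.h>) are
 * assumed.  Each pfioc_trans_e names one ruleset and receives a
 * ticket from DIOCXBEGIN that must reach DIOCXCOMMIT unchanged,
 * else the commit fails with EBUSY.
 */
#if 0
struct pfioc_trans t;
struct pfioc_trans_e te;
bzero(&t, sizeof(t));
bzero(&te, sizeof(te));
te.rs_num = PF_RULESET_FILTER;	/* empty anchor: main ruleset */
t.size = 1;
t.esize = sizeof(te);		/* must match, or ENODEV above */
t.array = &te;
if (ioctl(dev, DIOCXBEGIN, &t) == -1)
	err(1, "DIOCXBEGIN");
/* load replacement rules into the inactive set using te.ticket */
if (ioctl(dev, DIOCXCOMMIT, &t) == -1)
	err(1, "DIOCXCOMMIT");
#endif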
case DIOCGETSRCNODES: {
struct pfioc_src_nodes *psn = (struct pfioc_src_nodes *)addr;
struct pf_src_node *n, *p, *pstore;
u_int32_t nr = 0;
int space = psn->psn_len;
if (space == 0) {
RB_FOREACH(n, pf_src_tree, &tree_src_tracking)
nr++;
psn->psn_len = sizeof(struct pf_src_node) * nr;
break;
}
#ifdef __FreeBSD__
PF_UNLOCK();
#endif
pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
#ifdef __FreeBSD__
PF_LOCK();
#endif
p = psn->psn_src_nodes;
RB_FOREACH(n, pf_src_tree, &tree_src_tracking) {
int secs = time_second, diff;
if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len)
break;
bcopy(n, pstore, sizeof(*pstore));
if (n->rule.ptr != NULL)
pstore->rule.nr = n->rule.ptr->nr;
pstore->creation = secs - pstore->creation;
if (pstore->expire > secs)
pstore->expire -= secs;
else
pstore->expire = 0;
/* adjust the connection rate estimate */
diff = secs - n->conn_rate.last;
if (diff >= n->conn_rate.seconds)
pstore->conn_rate.count = 0;
else
pstore->conn_rate.count -=
n->conn_rate.count * diff /
n->conn_rate.seconds;
#ifdef __FreeBSD__
PF_COPYOUT(pstore, p, sizeof(*p), error);
#else
error = copyout(pstore, p, sizeof(*p));
#endif
if (error) {
free(pstore, M_TEMP);
goto fail;
}
p++;
nr++;
}
psn->psn_len = sizeof(struct pf_src_node) * nr;
free(pstore, M_TEMP);
break;
}
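/*
 * Illustrative userland sketch of the two-call protocol above: a
 * first DIOCGETSRCNODES with psn_len == 0 only reports the buffer
 * space required; the second call copies the nodes out and rewrites
 * psn_len to the size actually used.  "dev" is an assumed open
 * /dev/pf descriptor.
 */
#if 0
struct pfioc_src_nodes psn;
bzero(&psn, sizeof(psn));
if (ioctl(dev, DIOCGETSRCNODES, &psn) == -1)	/* sizing call */
	err(1, "DIOCGETSRCNODES");
if ((psn.psn_buf = malloc(psn.psn_len)) == NULL)
	err(1, "malloc");
if (ioctl(dev, DIOCGETSRCNODES, &psn) == -1)	/* fetch call */
	err(1, "DIOCGETSRCNODES");
/* psn.psn_len / sizeof(struct pf_src_node) nodes were returned */
#endif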
case DIOCCLRSRCNODES: {
struct pf_src_node *n;
struct pf_state *state;
RB_FOREACH(state, pf_state_tree_id, &tree_id) {
state->src_node = NULL;
state->nat_src_node = NULL;
}
RB_FOREACH(n, pf_src_tree, &tree_src_tracking) {
n->expire = 1;
n->states = 0;
}
pf_purge_expired_src_nodes(1);
pf_status.src_nodes = 0;
break;
}
case DIOCKILLSRCNODES: {
struct pf_src_node *sn;
struct pf_state *s;
struct pfioc_src_node_kill *psnk =
(struct pfioc_src_node_kill *)addr;
int killed = 0;
RB_FOREACH(sn, pf_src_tree, &tree_src_tracking) {
if (PF_MATCHA(psnk->psnk_src.neg,
&psnk->psnk_src.addr.v.a.addr,
&psnk->psnk_src.addr.v.a.mask,
&sn->addr, sn->af) &&
PF_MATCHA(psnk->psnk_dst.neg,
&psnk->psnk_dst.addr.v.a.addr,
&psnk->psnk_dst.addr.v.a.mask,
&sn->raddr, sn->af)) {
/* Handle state to src_node linkage */
if (sn->states != 0) {
RB_FOREACH(s, pf_state_tree_id,
&tree_id) {
if (s->src_node == sn)
s->src_node = NULL;
if (s->nat_src_node == sn)
s->nat_src_node = NULL;
}
sn->states = 0;
}
sn->expire = 1;
killed++;
}
}
if (killed > 0)
pf_purge_expired_src_nodes(1);
psnk->psnk_af = killed;
break;
}
case DIOCSETHOSTID: {
u_int32_t *hostid = (u_int32_t *)addr;
if (*hostid == 0)
pf_status.hostid = arc4random();
else
pf_status.hostid = *hostid;
break;
}
case DIOCOSFPFLUSH:
pf_osfp_flush();
break;
case DIOCIGETIFACES: {
struct pfioc_iface *io = (struct pfioc_iface *)addr;
if (io->pfiio_esize != sizeof(struct pfi_kif)) {
error = ENODEV;
break;
}
error = pfi_get_ifaces(io->pfiio_name, io->pfiio_buffer,
&io->pfiio_size);
break;
}
case DIOCSETIFFLAG: {
struct pfioc_iface *io = (struct pfioc_iface *)addr;
error = pfi_set_flags(io->pfiio_name, io->pfiio_flags);
break;
}
case DIOCCLRIFFLAG: {
struct pfioc_iface *io = (struct pfioc_iface *)addr;
error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags);
break;
}
default:
error = ENODEV;
break;
}
fail:
#ifdef __FreeBSD__
PF_UNLOCK();
if (flags & FWRITE)
sx_xunlock(&pf_consistency_lock);
else
sx_sunlock(&pf_consistency_lock);
#else
splx(s);
/* XXX: Lock order? */
if (flags & FWRITE)
rw_exit_write(&pf_consistency_lock);
else
rw_exit_read(&pf_consistency_lock);
#endif
return (error);
}
#ifdef __FreeBSD__
/*
* XXX - Check for version mismatch!!!
*/
static void
pf_clear_states(void)
{
struct pf_state *state;
RB_FOREACH(state, pf_state_tree_id, &tree_id) {
state->timeout = PFTM_PURGE;
#if NPFSYNC
/* don't send out individual delete messages */
state->sync_flags = PFSTATE_NOSYNC;
#endif
pf_unlink_state(state);
}
#if 0 /* NPFSYNC */
/*
 * XXX This is called on module unload; we do not want to sync
 * that over?
 */
pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
#endif
}
static int
pf_clear_tables(void)
{
struct pfioc_table io;
int error;
bzero(&io, sizeof(io));
error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel,
io.pfrio_flags);
return (error);
}
static void
pf_clear_srcnodes(void)
{
struct pf_src_node *n;
struct pf_state *state;
RB_FOREACH(state, pf_state_tree_id, &tree_id) {
state->src_node = NULL;
state->nat_src_node = NULL;
}
RB_FOREACH(n, pf_src_tree, &tree_src_tracking) {
n->expire = 1;
n->states = 0;
}
}
/*
* XXX - Check for version mismatch!!!
*/
/*
* Duplicate pfctl -Fa operation to get rid of as much as we can.
*/
static int
shutdown_pf(void)
{
int error = 0;
u_int32_t t[5];
char nn = '\0';
pf_status.running = 0;
do {
if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
!= 0) {
DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
break;
}
if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
!= 0) {
DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
break; /* XXX: rollback? */
}
if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
!= 0) {
DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
break; /* XXX: rollback? */
}
if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
!= 0) {
DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
break; /* XXX: rollback? */
}
if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
!= 0) {
DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
break; /* XXX: rollback? */
}
/* XXX: these should always succeed here */
pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn);
pf_commit_rules(t[1], PF_RULESET_FILTER, &nn);
pf_commit_rules(t[2], PF_RULESET_NAT, &nn);
pf_commit_rules(t[3], PF_RULESET_BINAT, &nn);
pf_commit_rules(t[4], PF_RULESET_RDR, &nn);
if ((error = pf_clear_tables()) != 0)
break;
#ifdef ALTQ
if ((error = pf_begin_altq(&t[0])) != 0) {
DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n"));
break;
}
pf_commit_altq(t[0]);
#endif
pf_clear_states();
pf_clear_srcnodes();
/* status does not use malloc'd mem, so no cleanup is needed */
/* fingerprints and interfaces have their own cleanup code */
} while (0);
return (error);
}
static int
pf_check_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
/*
* XXX Wed Jul 9 22:03:16 2003 UTC
* OpenBSD has changed its byte ordering convention for ip_len/ip_off
* in its network stack. OpenBSD's stack used to convert
* ip_len/ip_off to host byte order first, as FreeBSD does.
* This is no longer true, so we must convert back to network
* byte order before handing the packet to pf.
*/
struct ip *h = NULL;
int chk;
if ((*m)->m_pkthdr.len >= (int)sizeof(struct ip)) {
/* if the packet is shorter than an IP header, leave it to pf to handle. */
h = mtod(*m, struct ip *);
HTONS(h->ip_len);
HTONS(h->ip_off);
}
chk = pf_test(PF_IN, ifp, m, NULL, inp);
if (chk && *m) {
m_freem(*m);
*m = NULL;
}
if (*m != NULL) {
/* pf_test can change ip header location */
h = mtod(*m, struct ip *);
NTOHS(h->ip_len);
NTOHS(h->ip_off);
}
return (chk);
}
static int
pf_check_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
/*
* XXX Wed Jul 9 22:03:16 2003 UTC
* OpenBSD has changed its byte ordering convention for ip_len/ip_off
* in its network stack. OpenBSD's stack used to convert
* ip_len/ip_off to host byte order first, as FreeBSD does.
* This is no longer true, so we must convert back to network
* byte order before handing the packet to pf.
*/
struct ip *h = NULL;
int chk;
/* We need a proper checksum before we start (cf. OpenBSD ip_output) */
if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(*m);
(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
if ((*m)->m_pkthdr.len >= (int)sizeof(*h)) {
/* if the packet is shorter than an IP header, leave it to pf to handle. */
h = mtod(*m, struct ip *);
HTONS(h->ip_len);
HTONS(h->ip_off);
}
chk = pf_test(PF_OUT, ifp, m, NULL, inp);
if (chk && *m) {
m_freem(*m);
*m = NULL;
}
if (*m != NULL) {
/* pf_test can change ip header location */
h = mtod(*m, struct ip *);
NTOHS(h->ip_len);
NTOHS(h->ip_off);
}
return (chk);
}
#ifdef INET6
static int
pf_check6_in(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
+ INIT_VNET_NET(curvnet);
+
/*
* IPv6 is not affected by ip_len/ip_off byte order changes.
*/
int chk;
/*
* For loopback traffic IPv6 uses the real interface in order to
* support scoped addresses. To support stateful filtering we
* change this to lo0, as is already the case for IPv4.
*/
chk = pf_test6(PF_IN, (*m)->m_flags & M_LOOP ? &V_loif[0] : ifp, m,
NULL, inp);
if (chk && *m) {
m_freem(*m);
*m = NULL;
}
return (chk);
}
static int
pf_check6_out(void *arg, struct mbuf **m, struct ifnet *ifp, int dir,
struct inpcb *inp)
{
/*
* IPv6 is not affected by the ip_len/ip_off byte order changes.
*/
int chk;
/* We need a proper checksum before we start (cf. OpenBSD ip_output) */
if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(*m);
(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
chk = pf_test6(PF_OUT, ifp, m, NULL, inp);
if (chk && *m) {
m_freem(*m);
*m = NULL;
}
return (chk);
}
#endif /* INET6 */
static int
hook_pf(void)
{
struct pfil_head *pfh_inet;
#ifdef INET6
struct pfil_head *pfh_inet6;
#endif
PF_ASSERT(MA_NOTOWNED);
if (pf_pfil_hooked)
return (0);
pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
if (pfh_inet == NULL)
return (ESRCH); /* XXX */
pfil_add_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet);
pfil_add_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet);
#ifdef INET6
pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
if (pfh_inet6 == NULL) {
pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
pfh_inet);
pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
pfh_inet);
return (ESRCH); /* XXX */
}
pfil_add_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK, pfh_inet6);
pfil_add_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK, pfh_inet6);
#endif
pf_pfil_hooked = 1;
return (0);
}
static int
dehook_pf(void)
{
struct pfil_head *pfh_inet;
#ifdef INET6
struct pfil_head *pfh_inet6;
#endif
PF_ASSERT(MA_NOTOWNED);
if (pf_pfil_hooked == 0)
return (0);
pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET);
if (pfh_inet == NULL)
return (ESRCH); /* XXX */
pfil_remove_hook(pf_check_in, NULL, PFIL_IN | PFIL_WAITOK,
pfh_inet);
pfil_remove_hook(pf_check_out, NULL, PFIL_OUT | PFIL_WAITOK,
pfh_inet);
#ifdef INET6
pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
if (pfh_inet6 == NULL)
return (ESRCH); /* XXX */
pfil_remove_hook(pf_check6_in, NULL, PFIL_IN | PFIL_WAITOK,
pfh_inet6);
pfil_remove_hook(pf_check6_out, NULL, PFIL_OUT | PFIL_WAITOK,
pfh_inet6);
#endif
pf_pfil_hooked = 0;
return (0);
}
static int
pf_load(void)
{
init_zone_var();
init_pf_mutex();
pf_dev = make_dev(&pf_cdevsw, 0, 0, 0, 0600, PF_NAME);
if (pfattach() < 0) {
destroy_dev(pf_dev);
destroy_pf_mutex();
return (ENOMEM);
}
return (0);
}
static int
pf_unload(void)
{
int error = 0;
PF_LOCK();
pf_status.running = 0;
PF_UNLOCK();
error = dehook_pf();
if (error) {
/*
* Should not happen!
* XXX Due to error code ESRCH, kldunload will show
* a message like 'No such process'.
*/
printf("%s : pfil unregisteration fail\n", __FUNCTION__);
return error;
}
PF_LOCK();
shutdown_pf();
pf_end_threads = 1;
while (pf_end_threads < 2) {
wakeup_one(pf_purge_thread);
msleep(pf_purge_thread, &pf_task_mtx, 0, "pftmo", hz);
}
pfi_cleanup();
pf_osfp_flush();
pf_osfp_cleanup();
cleanup_pf_zone();
PF_UNLOCK();
destroy_dev(pf_dev);
destroy_pf_mutex();
return (error);
}
static int
pf_modevent(module_t mod, int type, void *data)
{
int error = 0;
switch (type) {
case MOD_LOAD:
error = pf_load();
break;
case MOD_UNLOAD:
error = pf_unload();
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t pf_mod = {
"pf",
pf_modevent,
0
};
DECLARE_MODULE(pf, pf_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST);
MODULE_VERSION(pf, PF_MODVER);
#endif /* __FreeBSD__ */
Index: head/sys/contrib/pf/net/pf_subr.c
===================================================================
--- head/sys/contrib/pf/net/pf_subr.c (revision 183549)
+++ head/sys/contrib/pf/net/pf_subr.c (revision 183550)
@@ -1,169 +1,170 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/time.h>
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/if_ether.h>
#include <net/pfvar.h>
/*
* Following is where TCP initial sequence number generation occurs.
*
* There are two places where we must use initial sequence numbers:
* 1. In SYN-ACK packets.
* 2. In SYN packets.
*
* All ISNs for SYN-ACK packets are generated by the syncache. See
* tcp_syncache.c for details.
*
* The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
* depends on this property. In addition, these ISNs should be
* unguessable so as to prevent connection hijacking. To satisfy
* the requirements of this situation, the algorithm outlined in
* RFC 1948 is used, with only small modifications.
*
* Implementation details:
*
* Time is based off the system timer, and is corrected so that it
* increases by one megabyte per second. This allows for proper
* recycling on high speed LANs while still leaving over an hour
* before rollover.
*
* As reading the *exact* system time is too expensive to be done
* whenever setting up a TCP connection, we increment the time
* offset in two ways. First, a small random positive increment
* is added to isn_offset for each connection that is set up.
* Second, the function tcp_isn_tick fires once per clock tick
* and increments isn_offset as necessary so that sequence numbers
* are incremented at approximately ISN_BYTES_PER_SECOND. The
* random positive increments serve only to ensure that the same
* exact sequence number is never sent out twice (as could otherwise
* happen when a port is recycled in less than the system tick
* interval.)
*
* net.inet.tcp.isn_reseed_interval controls the number of seconds
* between seeding of isn_secret. This is normally set to zero,
* as reseeding should not be necessary.
*
* Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
* isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
* general, this means holding an exclusive (write) lock.
*/
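/*
 * Worked numbers for the rollover claim above: consuming the 32-bit
 * sequence space at ISN_BYTES_PER_SECOND (2^20 bytes/s) wraps it in
 * 2^32 / 2^20 = 4096 seconds, i.e. roughly 68 minutes, which is the
 * "over an hour" quoted above.
 */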
#define ISN_BYTES_PER_SECOND 1048576
#define ISN_STATIC_INCREMENT 4096
#define ISN_RANDOM_INCREMENT (4096 - 1)
static u_char isn_secret[32];
static int isn_last_reseed;
static u_int32_t isn_offset;
static MD5_CTX isn_ctx;
u_int32_t
pf_new_isn(struct pf_state *s)
{
+ INIT_VNET_INET(curvnet);
u_int32_t md5_buffer[4];
u_int32_t new_isn;
struct pf_state_host *src, *dst;
/* Seed if this is the first use, reseed if requested. */
if (V_isn_last_reseed == 0) {
read_random(&V_isn_secret, sizeof(V_isn_secret));
V_isn_last_reseed = ticks;
}
if (s->direction == PF_IN) {
src = &s->ext;
dst = &s->gwy;
} else {
src = &s->lan;
dst = &s->ext;
}
/* Compute the md5 hash and return the ISN. */
MD5Init(&V_isn_ctx);
MD5Update(&V_isn_ctx, (u_char *) &dst->port, sizeof(u_short));
MD5Update(&V_isn_ctx, (u_char *) &src->port, sizeof(u_short));
#ifdef INET6
if (s->af == AF_INET6) {
MD5Update(&V_isn_ctx, (u_char *) &dst->addr,
sizeof(struct in6_addr));
MD5Update(&V_isn_ctx, (u_char *) &src->addr,
sizeof(struct in6_addr));
} else
#endif
{
MD5Update(&V_isn_ctx, (u_char *) &dst->addr,
sizeof(struct in_addr));
MD5Update(&V_isn_ctx, (u_char *) &src->addr,
sizeof(struct in_addr));
}
MD5Update(&V_isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
MD5Final((u_char *) &md5_buffer, &V_isn_ctx);
new_isn = (tcp_seq) md5_buffer[0];
V_isn_offset += ISN_STATIC_INCREMENT +
(arc4random() & ISN_RANDOM_INCREMENT);
new_isn += V_isn_offset;
return (new_isn);
}
Index: head/sys/contrib/pf/net/pfvar.h
===================================================================
--- head/sys/contrib/pf/net/pfvar.h (revision 183549)
+++ head/sys/contrib/pf/net/pfvar.h (revision 183550)
@@ -1,1859 +1,1866 @@
/* $FreeBSD$ */
/* $OpenBSD: pfvar.h,v 1.244 2007/02/23 21:31:51 deraadt Exp $ */
/*
* Copyright (c) 2001 Daniel Hartmeier
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef _NET_PFVAR_H_
#define _NET_PFVAR_H_
#include <sys/param.h>
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/tree.h>
#ifdef __FreeBSD__
#include <sys/lock.h>
#include <sys/sx.h>
#else
#include <sys/rwlock.h>
#endif
#include <net/radix.h>
#include <net/route.h>
#ifdef __FreeBSD__
#include <net/if_clone.h>
#include <net/pf_mtag.h>
#include <vm/uma.h>
#else
#include <netinet/ip_ipsp.h>
#endif
#ifdef __FreeBSD__
#include <netinet/in.h>
#endif
#include <netinet/tcp_fsm.h>
struct ip;
struct ip6_hdr;
#ifdef __FreeBSD__
struct inpcb;
#endif
#define PF_TCPS_PROXY_SRC ((TCP_NSTATES)+0)
#define PF_TCPS_PROXY_DST ((TCP_NSTATES)+1)
#define PF_MD5_DIGEST_LENGTH 16
#ifdef MD5_DIGEST_LENGTH
#if PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH
#error "PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH"
#endif
#endif
enum { PF_INOUT, PF_IN, PF_OUT };
enum { PF_LAN_EXT, PF_EXT_GWY, PF_ID };
enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT,
PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP };
enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT,
PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX };
enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT,
PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG };
enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY };
enum { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL,
PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER,
PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET };
enum { PF_GET_NONE, PF_GET_CLR_CNTR };
/*
* Note about PFTM_*: real indices into pf_rule.timeout[] come before
* PFTM_MAX, special cases afterwards. See pf_state_expires().
*/
enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED,
PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED,
PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE,
PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY,
PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE,
PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL,
PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE,
PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED,
PFTM_UNTIL_PACKET };
/* PFTM default values */
#define PFTM_TCP_FIRST_PACKET_VAL 120 /* First TCP packet */
#define PFTM_TCP_OPENING_VAL 30 /* No response yet */
#define PFTM_TCP_ESTABLISHED_VAL (24 * 60 * 60) /* Established */
#define PFTM_TCP_CLOSING_VAL (15 * 60) /* Half closed */
#define PFTM_TCP_FIN_WAIT_VAL 45 /* Got both FINs */
#define PFTM_TCP_CLOSED_VAL 90 /* Got a RST */
#define PFTM_UDP_FIRST_PACKET_VAL 60 /* First UDP packet */
#define PFTM_UDP_SINGLE_VAL 30 /* Unidirectional */
#define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */
#define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */
#define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */
#define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */
#define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */
#define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional */
#define PFTM_FRAG_VAL 30 /* Fragment expire */
#define PFTM_INTERVAL_VAL 10 /* Expire interval */
#define PFTM_SRC_NODE_VAL 0 /* Source tracking */
#define PFTM_TS_DIFF_VAL 30 /* Allowed TS diff */
enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO };
enum { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS,
PF_LIMIT_TABLES, PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX };
#define PF_POOL_IDMASK 0x0f
enum { PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM,
PF_POOL_SRCHASH, PF_POOL_ROUNDROBIN };
enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL,
PF_ADDR_TABLE, PF_ADDR_RTLABEL, PF_ADDR_URPFFAILED };
#define PF_POOL_TYPEMASK 0x0f
#define PF_POOL_STICKYADDR 0x20
#define PF_WSCALE_FLAG 0x80
#define PF_WSCALE_MASK 0x0f
#define PF_LOG 0x01
#define PF_LOG_ALL 0x02
#define PF_LOG_SOCKET_LOOKUP 0x04
struct pf_addr {
union {
struct in_addr v4;
struct in6_addr v6;
u_int8_t addr8[16];
u_int16_t addr16[8];
u_int32_t addr32[4];
} pfa; /* 128-bit address */
#define v4 pfa.v4
#define v6 pfa.v6
#define addr8 pfa.addr8
#define addr16 pfa.addr16
#define addr32 pfa.addr32
};
#define PF_TABLE_NAME_SIZE 32
#define PFI_AFLAG_NETWORK 0x01
#define PFI_AFLAG_BROADCAST 0x02
#define PFI_AFLAG_PEER 0x04
#define PFI_AFLAG_MODEMASK 0x07
#define PFI_AFLAG_NOALIAS 0x08
struct pf_addr_wrap {
union {
struct {
struct pf_addr addr;
struct pf_addr mask;
} a;
char ifname[IFNAMSIZ];
char tblname[PF_TABLE_NAME_SIZE];
#ifdef __FreeBSD__
#define RTLABEL_LEN 32
#endif
char rtlabelname[RTLABEL_LEN];
u_int32_t rtlabel;
} v;
union {
struct pfi_dynaddr *dyn;
struct pfr_ktable *tbl;
int dyncnt;
int tblcnt;
} p;
u_int8_t type; /* PF_ADDR_* */
u_int8_t iflags; /* PFI_AFLAG_* */
};
#ifdef _KERNEL
struct pfi_dynaddr {
TAILQ_ENTRY(pfi_dynaddr) entry;
struct pf_addr pfid_addr4;
struct pf_addr pfid_mask4;
struct pf_addr pfid_addr6;
struct pf_addr pfid_mask6;
struct pfr_ktable *pfid_kt;
struct pfi_kif *pfid_kif;
void *pfid_hook_cookie;
int pfid_net; /* mask or 128 */
int pfid_acnt4; /* address count IPv4 */
int pfid_acnt6; /* address count IPv6 */
sa_family_t pfid_af; /* rule af */
u_int8_t pfid_iflags; /* PFI_AFLAG_* */
};
/*
* Address manipulation macros
*/
#ifdef __FreeBSD__
#define splsoftnet() splnet()
#define HTONL(x) (x) = htonl((__uint32_t)(x))
#define HTONS(x) (x) = htons((__uint16_t)(x))
#define NTOHL(x) (x) = ntohl((__uint32_t)(x))
#define NTOHS(x) (x) = ntohs((__uint16_t)(x))
#define PF_NAME "pf"
#define PR_NOWAIT M_NOWAIT
#define pool_get(p, f) uma_zalloc(*(p), (f))
#define pool_put(p, o) uma_zfree(*(p), (o))
#define UMA_CREATE(var, type, desc) \
var = uma_zcreate(desc, sizeof(type), \
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); \
if (var == NULL) break
#define UMA_DESTROY(var) \
if (var) uma_zdestroy(var)
extern struct mtx pf_task_mtx;
#define PF_ASSERT(h) mtx_assert(&pf_task_mtx, (h))
#define PF_LOCK() do { \
PF_ASSERT(MA_NOTOWNED); \
mtx_lock(&pf_task_mtx); \
} while(0)
#define PF_UNLOCK() do { \
PF_ASSERT(MA_OWNED); \
mtx_unlock(&pf_task_mtx); \
} while(0)
#define PF_COPYIN(uaddr, kaddr, len, r) do { \
PF_UNLOCK(); \
r = copyin((uaddr), (kaddr), (len)); \
PF_LOCK(); \
} while(0)
#define PF_COPYOUT(kaddr, uaddr, len, r) do { \
PF_UNLOCK(); \
r = copyout((kaddr), (uaddr), (len)); \
PF_LOCK(); \
} while(0)
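/*
 * PF_COPYIN/PF_COPYOUT exist because copyin(9)/copyout(9) may sleep
 * and so must not run while pf_task_mtx is held; the macros drop and
 * retake the lock around the copy.  Minimal sketch of the calling
 * pattern (cf. the DIOCX* handlers in pf_ioctl.c); "uaddr" and
 * "kbuf" are placeholder names.  Anything read under the lock before
 * the copy may be stale afterwards, since the lock was released.
 */
#if 0
int error;
PF_LOCK();
PF_COPYIN(uaddr, &kbuf, sizeof(kbuf), error);
if (error == 0) {
	/* revalidate any state cached across the copy */
}
PF_UNLOCK();
#endif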
extern void init_pf_mutex(void);
extern void destroy_pf_mutex(void);
#define PF_MODVER 1
#define PFLOG_MODVER 1
#define PFSYNC_MODVER 1
#define PFLOG_MINVER 1
#define PFLOG_PREFVER PFLOG_MODVER
#define PFLOG_MAXVER 1
#define PFSYNC_MINVER 1
#define PFSYNC_PREFVER PFSYNC_MODVER
#define PFSYNC_MAXVER 1
#endif /* __FreeBSD__ */
#ifdef INET
#ifndef INET6
#define PF_INET_ONLY
#endif /* ! INET6 */
#endif /* INET */
#ifdef INET6
#ifndef INET
#define PF_INET6_ONLY
#endif /* ! INET */
#endif /* INET6 */
#ifdef INET
#ifdef INET6
#define PF_INET_INET6
#endif /* INET6 */
#endif /* INET */
#else
#define PF_INET_INET6
#endif /* _KERNEL */
/* Both IPv4 and IPv6 */
#ifdef PF_INET_INET6
#define PF_AEQ(a, b, c) \
((c == AF_INET && (a)->addr32[0] == (b)->addr32[0]) || \
((a)->addr32[3] == (b)->addr32[3] && \
(a)->addr32[2] == (b)->addr32[2] && \
(a)->addr32[1] == (b)->addr32[1] && \
(a)->addr32[0] == (b)->addr32[0]))
#define PF_ANEQ(a, b, c) \
((c == AF_INET && (a)->addr32[0] != (b)->addr32[0]) || \
((a)->addr32[3] != (b)->addr32[3] || \
(a)->addr32[2] != (b)->addr32[2] || \
(a)->addr32[1] != (b)->addr32[1] || \
(a)->addr32[0] != (b)->addr32[0]))
#define PF_AZERO(a, c) \
((c == AF_INET && !(a)->addr32[0]) || \
(!(a)->addr32[0] && !(a)->addr32[1] && \
!(a)->addr32[2] && !(a)->addr32[3]))
#define PF_MATCHA(n, a, m, b, f) \
pf_match_addr(n, a, m, b, f)
#define PF_ACPY(a, b, f) \
pf_addrcpy(a, b, f)
#define PF_AINC(a, f) \
pf_addr_inc(a, f)
#define PF_POOLMASK(a, b, c, d, f) \
pf_poolmask(a, b, c, d, f)
#else
/* Just IPv6 */
#ifdef PF_INET6_ONLY
#define PF_AEQ(a, b, c) \
((a)->addr32[3] == (b)->addr32[3] && \
(a)->addr32[2] == (b)->addr32[2] && \
(a)->addr32[1] == (b)->addr32[1] && \
(a)->addr32[0] == (b)->addr32[0])
#define PF_ANEQ(a, b, c) \
((a)->addr32[3] != (b)->addr32[3] || \
(a)->addr32[2] != (b)->addr32[2] || \
(a)->addr32[1] != (b)->addr32[1] || \
(a)->addr32[0] != (b)->addr32[0])
#define PF_AZERO(a, c) \
(!(a)->addr32[0] && \
!(a)->addr32[1] && \
!(a)->addr32[2] && \
!(a)->addr32[3])
#define PF_MATCHA(n, a, m, b, f) \
pf_match_addr(n, a, m, b, f)
#define PF_ACPY(a, b, f) \
pf_addrcpy(a, b, f)
#define PF_AINC(a, f) \
pf_addr_inc(a, f)
#define PF_POOLMASK(a, b, c, d, f) \
pf_poolmask(a, b, c, d, f)
#else
/* Just IPv4 */
#ifdef PF_INET_ONLY
#define PF_AEQ(a, b, c) \
((a)->addr32[0] == (b)->addr32[0])
#define PF_ANEQ(a, b, c) \
((a)->addr32[0] != (b)->addr32[0])
#define PF_AZERO(a, c) \
(!(a)->addr32[0])
#define PF_MATCHA(n, a, m, b, f) \
pf_match_addr(n, a, m, b, f)
#define PF_ACPY(a, b, f) \
(a)->v4.s_addr = (b)->v4.s_addr
#define PF_AINC(a, f) \
do { \
(a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \
} while (0)
#define PF_POOLMASK(a, b, c, d, f) \
do { \
(a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \
(((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \
} while (0)
#endif /* PF_INET_ONLY */
#endif /* PF_INET6_ONLY */
#endif /* PF_INET_INET6 */
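/*
 * Usage sketch for the address macros above; the address family is
 * always passed explicitly because a struct pf_addr does not record
 * whether 4 or 16 of its bytes are significant.  Illustrative only.
 */
#if 0
static void
pf_addr_macro_example(struct pf_addr *a, struct pf_addr *b)
{
	if (PF_AEQ(a, b, AF_INET))	/* compares addr32[0] only */
		return;
	if (PF_AEQ(a, b, AF_INET6))	/* compares all 128 bits */
		return;
	PF_ACPY(a, b, AF_INET6);	/* family-aware copy */
}
#endif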
#define PF_MISMATCHAW(aw, x, af, neg, ifp) \
( \
(((aw)->type == PF_ADDR_NOROUTE && \
pf_routable((x), (af), NULL)) || \
(((aw)->type == PF_ADDR_URPFFAILED && (ifp) != NULL && \
pf_routable((x), (af), (ifp))) || \
((aw)->type == PF_ADDR_RTLABEL && \
!pf_rtlabel_match((x), (af), (aw))) || \
((aw)->type == PF_ADDR_TABLE && \
!pfr_match_addr((aw)->p.tbl, (x), (af))) || \
((aw)->type == PF_ADDR_DYNIFTL && \
!pfi_match_addr((aw)->p.dyn, (x), (af))) || \
((aw)->type == PF_ADDR_ADDRMASK && \
!PF_AZERO(&(aw)->v.a.mask, (af)) && \
!PF_MATCHA(0, &(aw)->v.a.addr, \
&(aw)->v.a.mask, (x), (af))))) != \
(neg) \
)
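/*
 * Reading aid for PF_MISMATCHAW: the inner expression evaluates to 1
 * when (x) fails to match the address wrap (aw), and the final
 * "!= (neg)" XORs in the rule's negation flag.  For a plain rule
 * (neg == 0) the macro is true on a mismatch; for a "!" rule
 * (neg == 1) it is true on a match.
 */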
struct pf_rule_uid {
uid_t uid[2];
u_int8_t op;
};
struct pf_rule_gid {
uid_t gid[2];
u_int8_t op;
};
struct pf_rule_addr {
struct pf_addr_wrap addr;
u_int16_t port[2];
u_int8_t neg;
u_int8_t port_op;
};
struct pf_pooladdr {
struct pf_addr_wrap addr;
TAILQ_ENTRY(pf_pooladdr) entries;
char ifname[IFNAMSIZ];
struct pfi_kif *kif;
};
TAILQ_HEAD(pf_palist, pf_pooladdr);
struct pf_poolhashkey {
union {
u_int8_t key8[16];
u_int16_t key16[8];
u_int32_t key32[4];
} pfk; /* 128-bit hash key */
#define key8 pfk.key8
#define key16 pfk.key16
#define key32 pfk.key32
};
struct pf_pool {
struct pf_palist list;
struct pf_pooladdr *cur;
struct pf_poolhashkey key;
struct pf_addr counter;
int tblidx;
u_int16_t proxy_port[2];
u_int8_t port_op;
u_int8_t opts;
};
/* A packed Operating System description for fingerprinting */
typedef u_int32_t pf_osfp_t;
#define PF_OSFP_ANY ((pf_osfp_t)0)
#define PF_OSFP_UNKNOWN ((pf_osfp_t)-1)
#define PF_OSFP_NOMATCH ((pf_osfp_t)-2)
struct pf_osfp_entry {
SLIST_ENTRY(pf_osfp_entry) fp_entry;
pf_osfp_t fp_os;
int fp_enflags;
#define PF_OSFP_EXPANDED 0x001 /* expanded entry */
#define PF_OSFP_GENERIC 0x002 /* generic signature */
#define PF_OSFP_NODETAIL 0x004 /* no p0f details */
#define PF_OSFP_LEN 32
char fp_class_nm[PF_OSFP_LEN];
char fp_version_nm[PF_OSFP_LEN];
char fp_subtype_nm[PF_OSFP_LEN];
};
#define PF_OSFP_ENTRY_EQ(a, b) \
((a)->fp_os == (b)->fp_os && \
memcmp((a)->fp_class_nm, (b)->fp_class_nm, PF_OSFP_LEN) == 0 && \
memcmp((a)->fp_version_nm, (b)->fp_version_nm, PF_OSFP_LEN) == 0 && \
memcmp((a)->fp_subtype_nm, (b)->fp_subtype_nm, PF_OSFP_LEN) == 0)
/* handle pf_osfp_t packing */
#define _FP_RESERVED_BIT 1 /* For the special negative #defines */
#define _FP_UNUSED_BITS 1
#define _FP_CLASS_BITS 10 /* OS Class (Windows, Linux) */
#define _FP_VERSION_BITS 10 /* OS version (95, 98, NT, 2.4.54, 3.2) */
#define _FP_SUBTYPE_BITS 10 /* patch level (NT SP4, SP3, ECN patch) */
#define PF_OSFP_UNPACK(osfp, class, version, subtype) do { \
(class) = ((osfp) >> (_FP_VERSION_BITS+_FP_SUBTYPE_BITS)) & \
((1 << _FP_CLASS_BITS) - 1); \
(version) = ((osfp) >> _FP_SUBTYPE_BITS) & \
((1 << _FP_VERSION_BITS) - 1);\
(subtype) = (osfp) & ((1 << _FP_SUBTYPE_BITS) - 1); \
} while(0)
#define PF_OSFP_PACK(osfp, class, version, subtype) do { \
(osfp) = ((class) & ((1 << _FP_CLASS_BITS) - 1)) << (_FP_VERSION_BITS \
+ _FP_SUBTYPE_BITS); \
(osfp) |= ((version) & ((1 << _FP_VERSION_BITS) - 1)) << \
_FP_SUBTYPE_BITS; \
(osfp) |= (subtype) & ((1 << _FP_SUBTYPE_BITS) - 1); \
} while(0)
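/*
 * Worked packing example (illustrative values): with 10 bits per
 * field, class 7, version 5, subtype 3 pack as
 * (7 << 20) | (5 << 10) | 3 == 0x00701403, and PF_OSFP_UNPACK
 * recovers each field by shifting and masking with ((1 << 10) - 1).
 */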
/* the fingerprint of an OS's TCP SYN packet */
typedef u_int64_t pf_tcpopts_t;
struct pf_os_fingerprint {
SLIST_HEAD(pf_osfp_enlist, pf_osfp_entry) fp_oses; /* list of matches */
pf_tcpopts_t fp_tcpopts; /* packed TCP options */
u_int16_t fp_wsize; /* TCP window size */
u_int16_t fp_psize; /* ip->ip_len */
u_int16_t fp_mss; /* TCP MSS */
u_int16_t fp_flags;
#define PF_OSFP_WSIZE_MOD 0x0001 /* Window modulus */
#define PF_OSFP_WSIZE_DC 0x0002 /* Window don't care */
#define PF_OSFP_WSIZE_MSS 0x0004 /* Window multiple of MSS */
#define PF_OSFP_WSIZE_MTU 0x0008 /* Window multiple of MTU */
#define PF_OSFP_PSIZE_MOD 0x0010 /* packet size modulus */
#define PF_OSFP_PSIZE_DC 0x0020 /* packet size don't care */
#define PF_OSFP_WSCALE 0x0040 /* TCP window scaling */
#define PF_OSFP_WSCALE_MOD 0x0080 /* TCP window scale modulus */
#define PF_OSFP_WSCALE_DC 0x0100 /* TCP window scale don't care */
#define PF_OSFP_MSS 0x0200 /* TCP MSS */
#define PF_OSFP_MSS_MOD 0x0400 /* TCP MSS modulus */
#define PF_OSFP_MSS_DC 0x0800 /* TCP MSS don't care */
#define PF_OSFP_DF 0x1000 /* IPv4 don't fragment bit */
#define PF_OSFP_TS0 0x2000 /* Zero timestamp */
#define PF_OSFP_INET6 0x4000 /* IPv6 */
u_int8_t fp_optcnt; /* TCP option count */
u_int8_t fp_wscale; /* TCP window scaling */
u_int8_t fp_ttl; /* IPv4 TTL */
#define PF_OSFP_MAXTTL_OFFSET 40
/* TCP options packing */
#define PF_OSFP_TCPOPT_NOP 0x0 /* TCP NOP option */
#define PF_OSFP_TCPOPT_WSCALE 0x1 /* TCP window scaling option */
#define PF_OSFP_TCPOPT_MSS 0x2 /* TCP max segment size opt */
#define PF_OSFP_TCPOPT_SACK 0x3 /* TCP SACK OK option */
#define PF_OSFP_TCPOPT_TS 0x4 /* TCP timestamp option */
#define PF_OSFP_TCPOPT_BITS 3 /* bits used by each option */
#define PF_OSFP_MAX_OPTS \
((sizeof(((struct pf_os_fingerprint *)0)->fp_tcpopts) * 8) \
/ PF_OSFP_TCPOPT_BITS)
SLIST_ENTRY(pf_os_fingerprint) fp_next;
};
struct pf_osfp_ioctl {
struct pf_osfp_entry fp_os;
pf_tcpopts_t fp_tcpopts; /* packed TCP options */
u_int16_t fp_wsize; /* TCP window size */
u_int16_t fp_psize; /* ip->ip_len */
u_int16_t fp_mss; /* TCP MSS */
u_int16_t fp_flags;
u_int8_t fp_optcnt; /* TCP option count */
u_int8_t fp_wscale; /* TCP window scaling */
u_int8_t fp_ttl; /* IPv4 TTL */
int fp_getnum; /* DIOCOSFPGET number */
};
union pf_rule_ptr {
struct pf_rule *ptr;
u_int32_t nr;
};
#define PF_ANCHOR_NAME_SIZE 64
struct pf_rule {
struct pf_rule_addr src;
struct pf_rule_addr dst;
#define PF_SKIP_IFP 0
#define PF_SKIP_DIR 1
#define PF_SKIP_AF 2
#define PF_SKIP_PROTO 3
#define PF_SKIP_SRC_ADDR 4
#define PF_SKIP_SRC_PORT 5
#define PF_SKIP_DST_ADDR 6
#define PF_SKIP_DST_PORT 7
#define PF_SKIP_COUNT 8
union pf_rule_ptr skip[PF_SKIP_COUNT];
#define PF_RULE_LABEL_SIZE 64
char label[PF_RULE_LABEL_SIZE];
#define PF_QNAME_SIZE 64
char ifname[IFNAMSIZ];
char qname[PF_QNAME_SIZE];
char pqname[PF_QNAME_SIZE];
#define PF_TAG_NAME_SIZE 64
char tagname[PF_TAG_NAME_SIZE];
char match_tagname[PF_TAG_NAME_SIZE];
char overload_tblname[PF_TABLE_NAME_SIZE];
TAILQ_ENTRY(pf_rule) entries;
struct pf_pool rpool;
u_int64_t evaluations;
u_int64_t packets[2];
u_int64_t bytes[2];
struct pfi_kif *kif;
struct pf_anchor *anchor;
struct pfr_ktable *overload_tbl;
pf_osfp_t os_fingerprint;
int rtableid;
u_int32_t timeout[PFTM_MAX];
u_int32_t states;
u_int32_t max_states;
u_int32_t src_nodes;
u_int32_t max_src_nodes;
u_int32_t max_src_states;
u_int32_t spare1; /* netgraph */
u_int32_t max_src_conn;
struct {
u_int32_t limit;
u_int32_t seconds;
} max_src_conn_rate;
u_int32_t qid;
u_int32_t pqid;
u_int32_t rt_listid;
u_int32_t nr;
u_int32_t prob;
uid_t cuid;
pid_t cpid;
u_int16_t return_icmp;
u_int16_t return_icmp6;
u_int16_t max_mss;
u_int16_t tag;
u_int16_t match_tag;
u_int16_t spare2; /* netgraph */
struct pf_rule_uid uid;
struct pf_rule_gid gid;
u_int32_t rule_flag;
u_int8_t action;
u_int8_t direction;
u_int8_t log;
u_int8_t logif;
u_int8_t quick;
u_int8_t ifnot;
u_int8_t match_tag_not;
u_int8_t natpass;
#define PF_STATE_NORMAL 0x1
#define PF_STATE_MODULATE 0x2
#define PF_STATE_SYNPROXY 0x3
u_int8_t keep_state;
sa_family_t af;
u_int8_t proto;
u_int8_t type;
u_int8_t code;
u_int8_t flags;
u_int8_t flagset;
u_int8_t min_ttl;
u_int8_t allow_opts;
u_int8_t rt;
u_int8_t return_ttl;
u_int8_t tos;
u_int8_t anchor_relative;
u_int8_t anchor_wildcard;
#define PF_FLUSH 0x01
#define PF_FLUSH_GLOBAL 0x02
u_int8_t flush;
};
/* rule flags */
#define PFRULE_DROP 0x0000
#define PFRULE_RETURNRST 0x0001
#define PFRULE_FRAGMENT 0x0002
#define PFRULE_RETURNICMP 0x0004
#define PFRULE_RETURN 0x0008
#define PFRULE_NOSYNC 0x0010
#define PFRULE_SRCTRACK 0x0020 /* track source states */
#define PFRULE_RULESRCTRACK 0x0040 /* per rule */
/* scrub flags */
#define PFRULE_NODF 0x0100
#define PFRULE_FRAGCROP 0x0200 /* non-buffering frag cache */
#define PFRULE_FRAGDROP 0x0400 /* drop funny fragments */
#define PFRULE_RANDOMID 0x0800
#define PFRULE_REASSEMBLE_TCP 0x1000
/* rule flags again */
#define PFRULE_IFBOUND 0x00010000 /* if-bound */
#define PFSTATE_HIWAT 10000 /* default state table size */
#define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */
#define PFSTATE_ADAPT_END 12000 /* default adaptive timeout end */
struct pf_threshold {
u_int32_t limit;
#define PF_THRESHOLD_MULT 1000
#define PF_THRESHOLD_MAX (0xffffffff / PF_THRESHOLD_MULT)
u_int32_t seconds;
u_int32_t count;
u_int32_t last;
};
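/*
 * The counter is aged linearly when source nodes are exported (see
 * DIOCGETSRCNODES in pf_ioctl.c): with count 90, seconds 30 and 10
 * seconds elapsed since "last", userland sees 90 - 90 * 10 / 30 = 60,
 * so an idle estimate decays to zero over one full interval.
 */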
struct pf_src_node {
RB_ENTRY(pf_src_node) entry;
struct pf_addr addr;
struct pf_addr raddr;
union pf_rule_ptr rule;
struct pfi_kif *kif;
u_int64_t bytes[2];
u_int64_t packets[2];
u_int32_t states;
u_int32_t conn;
struct pf_threshold conn_rate;
u_int32_t creation;
u_int32_t expire;
sa_family_t af;
u_int8_t ruletype;
};
#define PFSNODE_HIWAT 10000 /* default source node table size */
struct pf_state_scrub {
struct timeval pfss_last; /* time received last packet */
u_int32_t pfss_tsecr; /* last echoed timestamp */
u_int32_t pfss_tsval; /* largest timestamp */
u_int32_t pfss_tsval0; /* original timestamp */
u_int16_t pfss_flags;
#define PFSS_TIMESTAMP 0x0001 /* modulate timestamp */
#define PFSS_PAWS 0x0010 /* stricter PAWS checks */
#define PFSS_PAWS_IDLED 0x0020 /* was idle too long. no PAWS */
#define PFSS_DATA_TS 0x0040 /* timestamp on data packets */
#define PFSS_DATA_NOTS 0x0080 /* no timestamp on data packets */
u_int8_t pfss_ttl; /* stashed TTL */
u_int8_t pad;
u_int32_t pfss_ts_mod; /* timestamp modulation */
};
struct pf_state_host {
struct pf_addr addr;
u_int16_t port;
u_int16_t pad;
};
struct pf_state_peer {
u_int32_t seqlo; /* Max sequence number sent */
u_int32_t seqhi; /* Max the other end ACKd + win */
u_int32_t seqdiff; /* Sequence number modulator */
u_int16_t max_win; /* largest window (pre scaling) */
u_int8_t state; /* active state level */
u_int8_t wscale; /* window scaling factor */
u_int16_t mss; /* Maximum segment size option */
u_int8_t tcp_est; /* Did we reach TCPS_ESTABLISHED */
struct pf_state_scrub *scrub; /* state is scrubbed */
u_int8_t pad[3];
};
TAILQ_HEAD(pf_state_queue, pf_state);
/* keep synced with struct pf_state, used in RB_FIND */
struct pf_state_cmp {
u_int64_t id;
u_int32_t creatorid;
struct pf_state_host lan;
struct pf_state_host gwy;
struct pf_state_host ext;
sa_family_t af;
u_int8_t proto;
u_int8_t direction;
u_int8_t pad;
};
struct pf_state {
u_int64_t id;
u_int32_t creatorid;
struct pf_state_host lan;
struct pf_state_host gwy;
struct pf_state_host ext;
sa_family_t af;
u_int8_t proto;
u_int8_t direction;
#ifdef __FreeBSD__
u_int8_t local_flags;
#define PFSTATE_EXPIRING 0x01
#else
u_int8_t pad;
#endif
u_int8_t log;
u_int8_t allow_opts;
u_int8_t timeout;
u_int8_t sync_flags;
#define PFSTATE_NOSYNC 0x01
#define PFSTATE_FROMSYNC 0x02
#define PFSTATE_STALE 0x04
union {
struct {
RB_ENTRY(pf_state) entry_lan_ext;
RB_ENTRY(pf_state) entry_ext_gwy;
RB_ENTRY(pf_state) entry_id;
TAILQ_ENTRY(pf_state) entry_list;
struct pfi_kif *kif;
} s;
char ifname[IFNAMSIZ];
} u;
struct pf_state_peer src;
struct pf_state_peer dst;
union pf_rule_ptr rule;
union pf_rule_ptr anchor;
union pf_rule_ptr nat_rule;
struct pf_addr rt_addr;
struct pfi_kif *rt_kif;
struct pf_src_node *src_node;
struct pf_src_node *nat_src_node;
u_int64_t packets[2];
u_int64_t bytes[2];
u_int32_t creation;
u_int32_t expire;
u_int32_t pfsync_time;
u_int16_t tag;
};
TAILQ_HEAD(pf_rulequeue, pf_rule);
struct pf_anchor;
struct pf_ruleset {
struct {
struct pf_rulequeue queues[2];
struct {
struct pf_rulequeue *ptr;
struct pf_rule **ptr_array;
u_int32_t rcount;
u_int32_t ticket;
int open;
} active, inactive;
} rules[PF_RULESET_MAX];
struct pf_anchor *anchor;
u_int32_t tticket;
int tables;
int topen;
};
RB_HEAD(pf_anchor_global, pf_anchor);
RB_HEAD(pf_anchor_node, pf_anchor);
struct pf_anchor {
RB_ENTRY(pf_anchor) entry_global;
RB_ENTRY(pf_anchor) entry_node;
struct pf_anchor *parent;
struct pf_anchor_node children;
char name[PF_ANCHOR_NAME_SIZE];
char path[MAXPATHLEN];
struct pf_ruleset ruleset;
int refcnt; /* anchor rules */
int match;
};
RB_PROTOTYPE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare);
RB_PROTOTYPE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare);
#define PF_RESERVED_ANCHOR "_pf"
#define PFR_TFLAG_PERSIST 0x00000001
#define PFR_TFLAG_CONST 0x00000002
#define PFR_TFLAG_ACTIVE 0x00000004
#define PFR_TFLAG_INACTIVE 0x00000008
#define PFR_TFLAG_REFERENCED 0x00000010
#define PFR_TFLAG_REFDANCHOR 0x00000020
#define PFR_TFLAG_USRMASK 0x00000003
#define PFR_TFLAG_SETMASK 0x0000003C
#define PFR_TFLAG_ALLMASK 0x0000003F
struct pfr_table {
char pfrt_anchor[MAXPATHLEN];
char pfrt_name[PF_TABLE_NAME_SIZE];
u_int32_t pfrt_flags;
u_int8_t pfrt_fback;
};
enum { PFR_FB_NONE, PFR_FB_MATCH, PFR_FB_ADDED, PFR_FB_DELETED,
PFR_FB_CHANGED, PFR_FB_CLEARED, PFR_FB_DUPLICATE,
PFR_FB_NOTMATCH, PFR_FB_CONFLICT, PFR_FB_MAX };
struct pfr_addr {
union {
struct in_addr _pfra_ip4addr;
struct in6_addr _pfra_ip6addr;
} pfra_u;
u_int8_t pfra_af;
u_int8_t pfra_net;
u_int8_t pfra_not;
u_int8_t pfra_fback;
};
#define pfra_ip4addr pfra_u._pfra_ip4addr
#define pfra_ip6addr pfra_u._pfra_ip6addr
enum { PFR_DIR_IN, PFR_DIR_OUT, PFR_DIR_MAX };
enum { PFR_OP_BLOCK, PFR_OP_PASS, PFR_OP_ADDR_MAX, PFR_OP_TABLE_MAX };
#define PFR_OP_XPASS PFR_OP_ADDR_MAX
struct pfr_astats {
struct pfr_addr pfras_a;
u_int64_t pfras_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX];
u_int64_t pfras_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX];
long pfras_tzero;
};
enum { PFR_REFCNT_RULE, PFR_REFCNT_ANCHOR, PFR_REFCNT_MAX };
struct pfr_tstats {
struct pfr_table pfrts_t;
u_int64_t pfrts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX];
u_int64_t pfrts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX];
u_int64_t pfrts_match;
u_int64_t pfrts_nomatch;
long pfrts_tzero;
int pfrts_cnt;
int pfrts_refcnt[PFR_REFCNT_MAX];
};
#define pfrts_name pfrts_t.pfrt_name
#define pfrts_flags pfrts_t.pfrt_flags
#ifndef _SOCKADDR_UNION_DEFINED
#define _SOCKADDR_UNION_DEFINED
union sockaddr_union {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
#endif /* _SOCKADDR_UNION_DEFINED */
SLIST_HEAD(pfr_kentryworkq, pfr_kentry);
struct pfr_kentry {
struct radix_node pfrke_node[2];
union sockaddr_union pfrke_sa;
u_int64_t pfrke_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX];
u_int64_t pfrke_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX];
SLIST_ENTRY(pfr_kentry) pfrke_workq;
long pfrke_tzero;
u_int8_t pfrke_af;
u_int8_t pfrke_net;
u_int8_t pfrke_not;
u_int8_t pfrke_mark;
u_int8_t pfrke_intrpool;
};
SLIST_HEAD(pfr_ktableworkq, pfr_ktable);
RB_HEAD(pfr_ktablehead, pfr_ktable);
struct pfr_ktable {
struct pfr_tstats pfrkt_ts;
RB_ENTRY(pfr_ktable) pfrkt_tree;
SLIST_ENTRY(pfr_ktable) pfrkt_workq;
struct radix_node_head *pfrkt_ip4;
struct radix_node_head *pfrkt_ip6;
struct pfr_ktable *pfrkt_shadow;
struct pfr_ktable *pfrkt_root;
struct pf_ruleset *pfrkt_rs;
long pfrkt_larg;
int pfrkt_nflags;
};
#define pfrkt_t pfrkt_ts.pfrts_t
#define pfrkt_name pfrkt_t.pfrt_name
#define pfrkt_anchor pfrkt_t.pfrt_anchor
#define pfrkt_ruleset pfrkt_t.pfrt_ruleset
#define pfrkt_flags pfrkt_t.pfrt_flags
#define pfrkt_cnt pfrkt_ts.pfrts_cnt
#define pfrkt_refcnt pfrkt_ts.pfrts_refcnt
#define pfrkt_packets pfrkt_ts.pfrts_packets
#define pfrkt_bytes pfrkt_ts.pfrts_bytes
#define pfrkt_match pfrkt_ts.pfrts_match
#define pfrkt_nomatch pfrkt_ts.pfrts_nomatch
#define pfrkt_tzero pfrkt_ts.pfrts_tzero
RB_HEAD(pf_state_tree_lan_ext, pf_state);
RB_PROTOTYPE(pf_state_tree_lan_ext, pf_state,
u.s.entry_lan_ext, pf_state_compare_lan_ext);
RB_HEAD(pf_state_tree_ext_gwy, pf_state);
RB_PROTOTYPE(pf_state_tree_ext_gwy, pf_state,
u.s.entry_ext_gwy, pf_state_compare_ext_gwy);
TAILQ_HEAD(pfi_statehead, pfi_kif);
RB_HEAD(pfi_ifhead, pfi_kif);
/* keep synced with pfi_kif, used in RB_FIND */
struct pfi_kif_cmp {
char pfik_name[IFNAMSIZ];
};
struct pfi_kif {
char pfik_name[IFNAMSIZ];
RB_ENTRY(pfi_kif) pfik_tree;
u_int64_t pfik_packets[2][2][2];
u_int64_t pfik_bytes[2][2][2];
u_int32_t pfik_tzero;
int pfik_flags;
struct pf_state_tree_lan_ext pfik_lan_ext;
struct pf_state_tree_ext_gwy pfik_ext_gwy;
TAILQ_ENTRY(pfi_kif) pfik_w_states;
#ifndef __FreeBSD__
void *pfik_ah_cookie;
#endif
struct ifnet *pfik_ifp;
struct ifg_group *pfik_group;
int pfik_states;
int pfik_rules;
TAILQ_HEAD(, pfi_dynaddr) pfik_dynaddrs;
};
enum pfi_kif_refs {
PFI_KIF_REF_NONE,
PFI_KIF_REF_STATE,
PFI_KIF_REF_RULE
};
#define PFI_IFLAG_SKIP 0x0100 /* skip filtering on interface */
/* XXX: revisit */
#define PFI_IFLAG_SETABLE_MASK 0x0100 /* setable via DIOC{SET,CLR}IFFLAG */
#define PFI_IFLAG_PLACEHOLDER 0x8000 /* placeholder group/interface */
struct pf_pdesc {
struct {
int done;
uid_t uid;
gid_t gid;
pid_t pid;
} lookup;
u_int64_t tot_len; /* Make Mickey money */
union {
struct tcphdr *tcp;
struct udphdr *udp;
struct icmp *icmp;
#ifdef INET6
struct icmp6_hdr *icmp6;
#endif /* INET6 */
void *any;
} hdr;
struct pf_addr baddr; /* address before translation */
struct pf_addr naddr; /* address after translation */
struct pf_rule *nat_rule; /* nat/rdr rule applied to packet */
struct pf_addr *src;
struct pf_addr *dst;
struct ether_header
*eh;
struct pf_mtag *pf_mtag;
u_int16_t *ip_sum;
u_int32_t p_len; /* total length of payload */
u_int16_t flags; /* Let SCRUB trigger behavior in
* state code. Easier than tags */
#define PFDESC_TCP_NORM 0x0001 /* TCP shall be statefully scrubbed */
#define PFDESC_IP_REAS 0x0002 /* IP frags would've been reassembled */
sa_family_t af;
u_int8_t proto;
u_int8_t tos;
};
/* flags for RDR options */
#define PF_DPORT_RANGE 0x01 /* Dest port uses range */
#define PF_RPORT_RANGE 0x02 /* RDR'ed port uses range */
/* Reasons code for passing/dropping a packet */
#define PFRES_MATCH 0 /* Explicit match of a rule */
#define PFRES_BADOFF 1 /* Bad offset for pull_hdr */
#define PFRES_FRAG 2 /* Dropping following fragment */
#define PFRES_SHORT 3 /* Dropping short packet */
#define PFRES_NORM 4 /* Dropping by normalizer */
#define PFRES_MEMORY 5 /* Dropped due to lacking mem */
#define PFRES_TS 6 /* Bad TCP Timestamp (RFC1323) */
#define PFRES_CONGEST 7 /* Congestion (of ipintrq) */
#define PFRES_IPOPTIONS 8 /* IP option */
#define PFRES_PROTCKSUM 9 /* Protocol checksum invalid */
#define PFRES_BADSTATE 10 /* State mismatch */
#define PFRES_STATEINS 11 /* State insertion failure */
#define PFRES_MAXSTATES 12 /* State limit */
#define PFRES_SRCLIMIT 13 /* Source node/conn limit */
#define PFRES_SYNPROXY 14 /* SYN proxy */
#define PFRES_MAX 15 /* total+1 */
#define PFRES_NAMES { \
"match", \
"bad-offset", \
"fragment", \
"short", \
"normalize", \
"memory", \
"bad-timestamp", \
"congestion", \
"ip-option", \
"proto-cksum", \
"state-mismatch", \
"state-insert", \
"state-limit", \
"src-limit", \
"synproxy", \
NULL \
}
/* Counters for other things we want to keep track of */
#define LCNT_STATES 0 /* states */
#define LCNT_SRCSTATES 1 /* max-src-states */
#define LCNT_SRCNODES 2 /* max-src-nodes */
#define LCNT_SRCCONN 3 /* max-src-conn */
#define LCNT_SRCCONNRATE 4 /* max-src-conn-rate */
#define LCNT_OVERLOAD_TABLE 5 /* entry added to overload table */
#define LCNT_OVERLOAD_FLUSH 6 /* state entries flushed */
#define LCNT_MAX 7 /* total+1 */
#define LCNT_NAMES { \
"max states per rule", \
"max-src-states", \
"max-src-nodes", \
"max-src-conn", \
"max-src-conn-rate", \
"overload table insertion", \
"overload flush states", \
NULL \
}
/* UDP state enumeration */
#define PFUDPS_NO_TRAFFIC 0
#define PFUDPS_SINGLE 1
#define PFUDPS_MULTIPLE 2
#define PFUDPS_NSTATES 3 /* number of state levels */
#define PFUDPS_NAMES { \
"NO_TRAFFIC", \
"SINGLE", \
"MULTIPLE", \
NULL \
}
/* Other protocol state enumeration */
#define PFOTHERS_NO_TRAFFIC 0
#define PFOTHERS_SINGLE 1
#define PFOTHERS_MULTIPLE 2
#define PFOTHERS_NSTATES 3 /* number of state levels */
#define PFOTHERS_NAMES { \
"NO_TRAFFIC", \
"SINGLE", \
"MULTIPLE", \
NULL \
}
#define FCNT_STATE_SEARCH 0
#define FCNT_STATE_INSERT 1
#define FCNT_STATE_REMOVALS 2
#define FCNT_MAX 3
#define SCNT_SRC_NODE_SEARCH 0
#define SCNT_SRC_NODE_INSERT 1
#define SCNT_SRC_NODE_REMOVALS 2
#define SCNT_MAX 3
#define ACTION_SET(a, x) \
do { \
if ((a) != NULL) \
*(a) = (x); \
} while (0)
#define REASON_SET(a, x) \
do { \
if ((a) != NULL) \
*(a) = (x); \
if (x < PFRES_MAX) \
pf_status.counters[x]++; \
} while (0)
struct pf_status {
u_int64_t counters[PFRES_MAX];
u_int64_t lcounters[LCNT_MAX]; /* limit counters */
u_int64_t fcounters[FCNT_MAX];
u_int64_t scounters[SCNT_MAX];
u_int64_t pcounters[2][2][3];
u_int64_t bcounters[2][2];
u_int64_t stateid;
u_int32_t running;
u_int32_t states;
u_int32_t src_nodes;
u_int32_t since;
u_int32_t debug;
u_int32_t hostid;
char ifname[IFNAMSIZ];
u_int8_t pf_chksum[PF_MD5_DIGEST_LENGTH];
};
struct cbq_opts {
u_int minburst;
u_int maxburst;
u_int pktsize;
u_int maxpktsize;
u_int ns_per_byte;
u_int maxidle;
int minidle;
u_int offtime;
int flags;
};
struct priq_opts {
int flags;
};
struct hfsc_opts {
/* real-time service curve */
u_int rtsc_m1; /* slope of the 1st segment in bps */
u_int rtsc_d; /* the x-projection of m1 in msec */
u_int rtsc_m2; /* slope of the 2nd segment in bps */
/* link-sharing service curve */
u_int lssc_m1;
u_int lssc_d;
u_int lssc_m2;
/* upper-limit service curve */
u_int ulsc_m1;
u_int ulsc_d;
u_int ulsc_m2;
int flags;
};
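/*
 * Illustrative note (not in the original header): each (m1, d, m2) triple
 * above describes a two-segment service curve in the HFSC sense: service
 * accrues at slope m1 (bps) for the first d milliseconds, then at slope
 * m2 from that point on.
 */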
struct pf_altq {
char ifname[IFNAMSIZ];
void *altq_disc; /* discipline-specific state */
TAILQ_ENTRY(pf_altq) entries;
/* scheduler spec */
u_int8_t scheduler; /* scheduler type */
u_int16_t tbrsize; /* tokenbucket regulator size */
u_int32_t ifbandwidth; /* interface bandwidth */
/* queue spec */
char qname[PF_QNAME_SIZE]; /* queue name */
char parent[PF_QNAME_SIZE]; /* parent name */
u_int32_t parent_qid; /* parent queue id */
u_int32_t bandwidth; /* queue bandwidth */
u_int8_t priority; /* priority */
#ifdef __FreeBSD__
u_int8_t local_flags; /* dynamic interface */
#define PFALTQ_FLAG_IF_REMOVED 0x01
#endif
u_int16_t qlimit; /* queue size limit */
u_int16_t flags; /* misc flags */
union {
struct cbq_opts cbq_opts;
struct priq_opts priq_opts;
struct hfsc_opts hfsc_opts;
} pq_u;
u_int32_t qid; /* return value */
};
#ifndef __FreeBSD__
#define PF_TAG_GENERATED 0x01
#define PF_TAG_FRAGCACHE 0x02
#define PF_TAG_TRANSLATE_LOCALHOST 0x04
struct pf_mtag {
void *hdr; /* saved hdr pos in mbuf, for ECN */
u_int rtableid; /* alternate routing table id */
u_int32_t qid; /* queue id */
u_int16_t tag; /* tag id */
u_int8_t flags;
u_int8_t routed;
sa_family_t af; /* for ECN */
};
#endif
struct pf_tag {
u_int16_t tag; /* tag id */
};
struct pf_tagname {
TAILQ_ENTRY(pf_tagname) entries;
char name[PF_TAG_NAME_SIZE];
u_int16_t tag;
int ref;
};
#define PFFRAG_FRENT_HIWAT 5000 /* Number of fragment entries */
#define PFFRAG_FRAG_HIWAT 1000 /* Number of fragmented packets */
#define PFFRAG_FRCENT_HIWAT 50000 /* Number of fragment cache entries */
#define PFFRAG_FRCACHE_HIWAT 10000 /* Number of fragment descriptors */
#define PFR_KTABLE_HIWAT 1000 /* Number of tables */
#define PFR_KENTRY_HIWAT 200000 /* Number of table entries */
#define PFR_KENTRY_HIWAT_SMALL 100000 /* Number of table entries (tiny hosts) */
/*
* ioctl parameter structures
*/
struct pfioc_pooladdr {
u_int32_t action;
u_int32_t ticket;
u_int32_t nr;
u_int32_t r_num;
u_int8_t r_action;
u_int8_t r_last;
u_int8_t af;
char anchor[MAXPATHLEN];
struct pf_pooladdr addr;
};
struct pfioc_rule {
u_int32_t action;
u_int32_t ticket;
u_int32_t pool_ticket;
u_int32_t nr;
char anchor[MAXPATHLEN];
char anchor_call[MAXPATHLEN];
struct pf_rule rule;
};
struct pfioc_natlook {
struct pf_addr saddr;
struct pf_addr daddr;
struct pf_addr rsaddr;
struct pf_addr rdaddr;
u_int16_t sport;
u_int16_t dport;
u_int16_t rsport;
u_int16_t rdport;
sa_family_t af;
u_int8_t proto;
u_int8_t direction;
};
struct pfioc_state {
u_int32_t nr;
struct pf_state state;
};
struct pfioc_src_node_kill {
/* XXX returns the number of src nodes killed in psnk_af */
sa_family_t psnk_af;
struct pf_rule_addr psnk_src;
struct pf_rule_addr psnk_dst;
};
struct pfioc_state_kill {
/* XXX returns the number of states killed in psk_af */
sa_family_t psk_af;
int psk_proto;
struct pf_rule_addr psk_src;
struct pf_rule_addr psk_dst;
char psk_ifname[IFNAMSIZ];
};
struct pfioc_states {
int ps_len;
union {
caddr_t psu_buf;
struct pf_state *psu_states;
} ps_u;
#define ps_buf ps_u.psu_buf
#define ps_states ps_u.psu_states
};
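/*
 * Illustrative note (not in the original header): DIOCGETSTATES is
 * conventionally issued twice, first with ps_len set to zero so the kernel
 * reports the buffer size required, then again with ps_buf pointing at a
 * buffer of that size to receive the pf_state array.
 */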
struct pfioc_src_nodes {
int psn_len;
union {
caddr_t psu_buf;
struct pf_src_node *psu_src_nodes;
} psn_u;
#define psn_buf psn_u.psu_buf
#define psn_src_nodes psn_u.psu_src_nodes
};
struct pfioc_if {
char ifname[IFNAMSIZ];
};
struct pfioc_tm {
int timeout;
int seconds;
};
struct pfioc_limit {
int index;
unsigned limit;
};
struct pfioc_altq {
u_int32_t action;
u_int32_t ticket;
u_int32_t nr;
struct pf_altq altq;
};
struct pfioc_qstats {
u_int32_t ticket;
u_int32_t nr;
void *buf;
int nbytes;
u_int8_t scheduler;
};
struct pfioc_ruleset {
u_int32_t nr;
char path[MAXPATHLEN];
char name[PF_ANCHOR_NAME_SIZE];
};
#define PF_RULESET_ALTQ (PF_RULESET_MAX)
#define PF_RULESET_TABLE (PF_RULESET_MAX+1)
struct pfioc_trans {
int size; /* number of elements */
int esize; /* size of each element in bytes */
struct pfioc_trans_e {
int rs_num;
char anchor[MAXPATHLEN];
u_int32_t ticket;
} *array;
};
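/*
 * Illustrative note (not in the original header): ruleset loads are
 * transactional; DIOCXBEGIN fills in one ticket per element of *array,
 * the new rules and tables are staged under those tickets, and
 * DIOCXCOMMIT (or DIOCXROLLBACK) then applies or discards the whole set
 * at once.
 */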
#define PFR_FLAG_ATOMIC 0x00000001
#define PFR_FLAG_DUMMY 0x00000002
#define PFR_FLAG_FEEDBACK 0x00000004
#define PFR_FLAG_CLSTATS 0x00000008
#define PFR_FLAG_ADDRSTOO 0x00000010
#define PFR_FLAG_REPLACE 0x00000020
#define PFR_FLAG_ALLRSETS 0x00000040
#define PFR_FLAG_ALLMASK 0x0000007F
#ifdef _KERNEL
#define PFR_FLAG_USERIOCTL 0x10000000
#endif
struct pfioc_table {
struct pfr_table pfrio_table;
void *pfrio_buffer;
int pfrio_esize;
int pfrio_size;
int pfrio_size2;
int pfrio_nadd;
int pfrio_ndel;
int pfrio_nchange;
int pfrio_flags;
u_int32_t pfrio_ticket;
};
#define pfrio_exists pfrio_nadd
#define pfrio_nzero pfrio_nadd
#define pfrio_nmatch pfrio_nadd
#define pfrio_naddr pfrio_size2
#define pfrio_setflag pfrio_size2
#define pfrio_clrflag pfrio_nadd
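/*
 * Illustrative note (not in the original header): the aliases above reuse
 * a few scratch members of struct pfioc_table whose meaning depends on
 * which DIOCR* ioctl is issued, giving each use a readable name without
 * growing the structure.
 */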
struct pfioc_iface {
char pfiio_name[IFNAMSIZ];
void *pfiio_buffer;
int pfiio_esize;
int pfiio_size;
int pfiio_nzero;
int pfiio_flags;
};
/*
* ioctl operations
*/
#define DIOCSTART _IO ('D', 1)
#define DIOCSTOP _IO ('D', 2)
#define DIOCADDRULE _IOWR('D', 4, struct pfioc_rule)
#define DIOCGETRULES _IOWR('D', 6, struct pfioc_rule)
#define DIOCGETRULE _IOWR('D', 7, struct pfioc_rule)
/* XXX cut 8 - 17 */
#define DIOCCLRSTATES _IOWR('D', 18, struct pfioc_state_kill)
#define DIOCGETSTATE _IOWR('D', 19, struct pfioc_state)
#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if)
#define DIOCGETSTATUS _IOWR('D', 21, struct pf_status)
#define DIOCCLRSTATUS _IO ('D', 22)
#define DIOCNATLOOK _IOWR('D', 23, struct pfioc_natlook)
#define DIOCSETDEBUG _IOWR('D', 24, u_int32_t)
#define DIOCGETSTATES _IOWR('D', 25, struct pfioc_states)
#define DIOCCHANGERULE _IOWR('D', 26, struct pfioc_rule)
/* XXX cut 27 - 28 */
#define DIOCSETTIMEOUT _IOWR('D', 29, struct pfioc_tm)
#define DIOCGETTIMEOUT _IOWR('D', 30, struct pfioc_tm)
#define DIOCADDSTATE _IOWR('D', 37, struct pfioc_state)
#define DIOCCLRRULECTRS _IO ('D', 38)
#define DIOCGETLIMIT _IOWR('D', 39, struct pfioc_limit)
#define DIOCSETLIMIT _IOWR('D', 40, struct pfioc_limit)
#define DIOCKILLSTATES _IOWR('D', 41, struct pfioc_state_kill)
#define DIOCSTARTALTQ _IO ('D', 42)
#define DIOCSTOPALTQ _IO ('D', 43)
#define DIOCADDALTQ _IOWR('D', 45, struct pfioc_altq)
#define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq)
#define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq)
#define DIOCCHANGEALTQ _IOWR('D', 49, struct pfioc_altq)
#define DIOCGETQSTATS _IOWR('D', 50, struct pfioc_qstats)
#define DIOCBEGINADDRS _IOWR('D', 51, struct pfioc_pooladdr)
#define DIOCADDADDR _IOWR('D', 52, struct pfioc_pooladdr)
#define DIOCGETADDRS _IOWR('D', 53, struct pfioc_pooladdr)
#define DIOCGETADDR _IOWR('D', 54, struct pfioc_pooladdr)
#define DIOCCHANGEADDR _IOWR('D', 55, struct pfioc_pooladdr)
/* XXX cut 56 - 57 */
#define DIOCGETRULESETS _IOWR('D', 58, struct pfioc_ruleset)
#define DIOCGETRULESET _IOWR('D', 59, struct pfioc_ruleset)
#define DIOCRCLRTABLES _IOWR('D', 60, struct pfioc_table)
#define DIOCRADDTABLES _IOWR('D', 61, struct pfioc_table)
#define DIOCRDELTABLES _IOWR('D', 62, struct pfioc_table)
#define DIOCRGETTABLES _IOWR('D', 63, struct pfioc_table)
#define DIOCRGETTSTATS _IOWR('D', 64, struct pfioc_table)
#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table)
#define DIOCRCLRADDRS _IOWR('D', 66, struct pfioc_table)
#define DIOCRADDADDRS _IOWR('D', 67, struct pfioc_table)
#define DIOCRDELADDRS _IOWR('D', 68, struct pfioc_table)
#define DIOCRSETADDRS _IOWR('D', 69, struct pfioc_table)
#define DIOCRGETADDRS _IOWR('D', 70, struct pfioc_table)
#define DIOCRGETASTATS _IOWR('D', 71, struct pfioc_table)
#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table)
#define DIOCRTSTADDRS _IOWR('D', 73, struct pfioc_table)
#define DIOCRSETTFLAGS _IOWR('D', 74, struct pfioc_table)
#define DIOCRINADEFINE _IOWR('D', 77, struct pfioc_table)
#define DIOCOSFPFLUSH _IO('D', 78)
#define DIOCOSFPADD _IOWR('D', 79, struct pf_osfp_ioctl)
#define DIOCOSFPGET _IOWR('D', 80, struct pf_osfp_ioctl)
#define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans)
#define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans)
#define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans)
#define DIOCGETSRCNODES _IOWR('D', 84, struct pfioc_src_nodes)
#define DIOCCLRSRCNODES _IO('D', 85)
#define DIOCSETHOSTID _IOWR('D', 86, u_int32_t)
#define DIOCIGETIFACES _IOWR('D', 87, struct pfioc_iface)
#define DIOCSETIFFLAG _IOWR('D', 89, struct pfioc_iface)
#define DIOCCLRIFFLAG _IOWR('D', 90, struct pfioc_iface)
#define DIOCKILLSRCNODES _IOWR('D', 91, struct pfioc_src_node_kill)
#ifdef __FreeBSD__
struct pf_ifspeed {
char ifname[IFNAMSIZ];
u_int32_t baudrate;
};
#define DIOCGIFSPEED _IOWR('D', 92, struct pf_ifspeed)
#endif
#ifdef _KERNEL
RB_HEAD(pf_src_tree, pf_src_node);
RB_PROTOTYPE(pf_src_tree, pf_src_node, entry, pf_src_compare);
extern struct pf_src_tree tree_src_tracking;
RB_HEAD(pf_state_tree_id, pf_state);
RB_PROTOTYPE(pf_state_tree_id, pf_state,
entry_id, pf_state_compare_id);
extern struct pf_state_tree_id tree_id;
extern struct pf_state_queue state_list;
TAILQ_HEAD(pf_poolqueue, pf_pool);
extern struct pf_poolqueue pf_pools[2];
TAILQ_HEAD(pf_altqqueue, pf_altq);
extern struct pf_altqqueue pf_altqs[2];
extern struct pf_palist pf_pabuf;
extern u_int32_t ticket_altqs_active;
extern u_int32_t ticket_altqs_inactive;
extern int altqs_inactive_open;
extern u_int32_t ticket_pabuf;
extern struct pf_altqqueue *pf_altqs_active;
extern struct pf_altqqueue *pf_altqs_inactive;
extern struct pf_poolqueue *pf_pools_active;
extern struct pf_poolqueue *pf_pools_inactive;
extern int pf_tbladdr_setup(struct pf_ruleset *,
struct pf_addr_wrap *);
extern void pf_tbladdr_remove(struct pf_addr_wrap *);
extern void pf_tbladdr_copyout(struct pf_addr_wrap *);
extern void pf_calc_skip_steps(struct pf_rulequeue *);
#ifdef __FreeBSD__
#ifdef ALTQ
extern void pf_altq_ifnet_event(struct ifnet *, int);
#endif
extern uma_zone_t pf_src_tree_pl, pf_rule_pl;
extern uma_zone_t pf_state_pl, pf_altq_pl, pf_pooladdr_pl;
extern uma_zone_t pfr_ktable_pl, pfr_kentry_pl, pfr_kentry_pl2;
extern uma_zone_t pf_cache_pl, pf_cent_pl;
extern uma_zone_t pf_state_scrub_pl;
extern uma_zone_t pfi_addr_pl;
#else
extern struct pool pf_src_tree_pl, pf_rule_pl;
extern struct pool pf_state_pl, pf_altq_pl, pf_pooladdr_pl;
extern struct pool pf_state_scrub_pl;
#endif
extern void pf_purge_thread(void *);
extern void pf_purge_expired_src_nodes(int);
extern void pf_purge_expired_states(u_int32_t);
extern void pf_unlink_state(struct pf_state *);
extern void pf_free_state(struct pf_state *);
extern int pf_insert_state(struct pfi_kif *,
struct pf_state *);
extern int pf_insert_src_node(struct pf_src_node **,
struct pf_rule *, struct pf_addr *,
sa_family_t);
void pf_src_tree_remove_state(struct pf_state *);
extern struct pf_state *pf_find_state_byid(struct pf_state_cmp *);
extern struct pf_state *pf_find_state_all(struct pf_state_cmp *key,
u_int8_t tree, int *more);
extern void pf_print_state(struct pf_state *);
extern void pf_print_flags(u_int8_t);
extern u_int16_t pf_cksum_fixup(u_int16_t, u_int16_t, u_int16_t,
u_int8_t);
extern struct ifnet *sync_ifp;
extern struct pf_rule pf_default_rule;
extern void pf_addrcpy(struct pf_addr *, struct pf_addr *,
u_int8_t);
void pf_rm_rule(struct pf_rulequeue *,
struct pf_rule *);
#ifdef INET
#ifdef __FreeBSD__
int pf_test(int, struct ifnet *, struct mbuf **, struct ether_header *,
struct inpcb *);
#else
int pf_test(int, struct ifnet *, struct mbuf **, struct ether_header *);
#endif
#endif /* INET */
#ifdef INET6
#ifdef __FreeBSD__
int pf_test6(int, struct ifnet *, struct mbuf **, struct ether_header *,
struct inpcb *);
#else
int pf_test6(int, struct ifnet *, struct mbuf **, struct ether_header *);
#endif
void pf_poolmask(struct pf_addr *, struct pf_addr*,
struct pf_addr *, struct pf_addr *, u_int8_t);
void pf_addr_inc(struct pf_addr *, sa_family_t);
#endif /* INET6 */
#ifdef __FreeBSD__
u_int32_t pf_new_isn(struct pf_state *);
#endif
void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *,
sa_family_t);
void pf_change_a(void *, u_int16_t *, u_int32_t, u_int8_t);
int pflog_packet(struct pfi_kif *, struct mbuf *, sa_family_t, u_int8_t,
u_int8_t, struct pf_rule *, struct pf_rule *, struct pf_ruleset *,
struct pf_pdesc *);
int pf_match_addr(u_int8_t, struct pf_addr *, struct pf_addr *,
struct pf_addr *, sa_family_t);
int pf_match(u_int8_t, u_int32_t, u_int32_t, u_int32_t);
int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t);
int pf_match_uid(u_int8_t, uid_t, uid_t, uid_t);
int pf_match_gid(u_int8_t, gid_t, gid_t, gid_t);
void pf_normalize_init(void);
int pf_normalize_ip(struct mbuf **, int, struct pfi_kif *, u_short *,
struct pf_pdesc *);
int pf_normalize_ip6(struct mbuf **, int, struct pfi_kif *, u_short *,
struct pf_pdesc *);
int pf_normalize_tcp(int, struct pfi_kif *, struct mbuf *, int, int, void *,
struct pf_pdesc *);
void pf_normalize_tcp_cleanup(struct pf_state *);
int pf_normalize_tcp_init(struct mbuf *, int, struct pf_pdesc *,
struct tcphdr *, struct pf_state_peer *, struct pf_state_peer *);
int pf_normalize_tcp_stateful(struct mbuf *, int, struct pf_pdesc *,
u_short *, struct tcphdr *, struct pf_state *,
struct pf_state_peer *, struct pf_state_peer *, int *);
u_int32_t
pf_state_expires(const struct pf_state *);
void pf_purge_expired_fragments(void);
int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *);
int pf_rtlabel_match(struct pf_addr *, sa_family_t, struct pf_addr_wrap *);
#ifdef __FreeBSD__
int pf_socket_lookup(int, struct pf_pdesc *, struct inpcb *);
#else
int pf_socket_lookup(int, struct pf_pdesc *);
#endif
void pfr_initialize(void);
int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t);
void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t,
u_int64_t, int, int, int);
int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *,
struct pf_addr **, struct pf_addr **, sa_family_t);
void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *);
struct pfr_ktable *
pfr_attach_table(struct pf_ruleset *, char *);
void pfr_detach_table(struct pfr_ktable *);
int pfr_clr_tables(struct pfr_table *, int *, int);
int pfr_add_tables(struct pfr_table *, int, int *, int);
int pfr_del_tables(struct pfr_table *, int, int *, int);
int pfr_get_tables(struct pfr_table *, struct pfr_table *, int *, int);
int pfr_get_tstats(struct pfr_table *, struct pfr_tstats *, int *, int);
int pfr_clr_tstats(struct pfr_table *, int, int *, int);
int pfr_set_tflags(struct pfr_table *, int, int, int, int *, int *, int);
int pfr_clr_addrs(struct pfr_table *, int *, int);
int pfr_insert_kentry(struct pfr_ktable *, struct pfr_addr *, long);
int pfr_add_addrs(struct pfr_table *, struct pfr_addr *, int, int *,
int);
int pfr_del_addrs(struct pfr_table *, struct pfr_addr *, int, int *,
int);
int pfr_set_addrs(struct pfr_table *, struct pfr_addr *, int, int *,
int *, int *, int *, int, u_int32_t);
int pfr_get_addrs(struct pfr_table *, struct pfr_addr *, int *, int);
int pfr_get_astats(struct pfr_table *, struct pfr_astats *, int *, int);
int pfr_clr_astats(struct pfr_table *, struct pfr_addr *, int, int *,
int);
int pfr_tst_addrs(struct pfr_table *, struct pfr_addr *, int, int *,
int);
int pfr_ina_begin(struct pfr_table *, u_int32_t *, int *, int);
int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int);
int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int);
int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *,
int *, u_int32_t, int);
extern struct pfi_statehead pfi_statehead;
extern struct pfi_kif *pfi_all;
void pfi_initialize(void);
#ifdef __FreeBSD__
void pfi_cleanup(void);
#endif
struct pfi_kif *pfi_kif_get(const char *);
void pfi_kif_ref(struct pfi_kif *, enum pfi_kif_refs);
void pfi_kif_unref(struct pfi_kif *, enum pfi_kif_refs);
int pfi_kif_match(struct pfi_kif *, struct pfi_kif *);
void pfi_attach_ifnet(struct ifnet *);
void pfi_detach_ifnet(struct ifnet *);
void pfi_attach_ifgroup(struct ifg_group *);
void pfi_detach_ifgroup(struct ifg_group *);
void pfi_group_change(const char *);
int pfi_match_addr(struct pfi_dynaddr *, struct pf_addr *,
sa_family_t);
int pfi_dynaddr_setup(struct pf_addr_wrap *, sa_family_t);
void pfi_dynaddr_remove(struct pf_addr_wrap *);
void pfi_dynaddr_copyout(struct pf_addr_wrap *);
void pfi_fill_oldstatus(struct pf_status *);
int pfi_clr_istats(const char *);
int pfi_get_ifaces(const char *, struct pfi_kif *, int *);
int pfi_set_flags(const char *, int);
int pfi_clear_flags(const char *, int);
u_int16_t pf_tagname2tag(char *);
void pf_tag2tagname(u_int16_t, char *);
void pf_tag_ref(u_int16_t);
void pf_tag_unref(u_int16_t);
int pf_tag_packet(struct mbuf *, struct pf_mtag *, int, int);
u_int32_t pf_qname2qid(char *);
void pf_qid2qname(u_int32_t, char *);
void pf_qid_unref(u_int32_t);
#ifndef __FreeBSD__
struct pf_mtag *pf_find_mtag(struct mbuf *);
struct pf_mtag *pf_get_mtag(struct mbuf *);
#endif
extern struct pf_status pf_status;
#ifdef __FreeBSD__
extern uma_zone_t pf_frent_pl, pf_frag_pl;
extern struct sx pf_consistency_lock;
#else
extern struct pool pf_frent_pl, pf_frag_pl;
extern struct rwlock pf_consistency_lock;
#endif
struct pf_pool_limit {
void *pp;
unsigned limit;
};
extern struct pf_pool_limit pf_pool_limits[PF_LIMIT_MAX];
#ifdef __FreeBSD__
struct pf_frent {
LIST_ENTRY(pf_frent) fr_next;
struct ip *fr_ip;
struct mbuf *fr_m;
};
struct pf_frcache {
LIST_ENTRY(pf_frcache) fr_next;
uint16_t fr_off;
uint16_t fr_end;
};
struct pf_fragment {
RB_ENTRY(pf_fragment) fr_entry;
TAILQ_ENTRY(pf_fragment) frag_next;
struct in_addr fr_src;
struct in_addr fr_dst;
u_int8_t fr_p; /* protocol of this fragment */
u_int8_t fr_flags; /* status flags */
u_int16_t fr_id; /* fragment id for reassemble */
u_int16_t fr_max; /* fragment data max */
u_int32_t fr_timeout;
#define fr_queue fr_u.fru_queue
#define fr_cache fr_u.fru_cache
union {
LIST_HEAD(pf_fragq, pf_frent) fru_queue; /* buffering */
LIST_HEAD(pf_cacheq, pf_frcache) fru_cache; /* non-buf */
} fr_u;
};
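/*
 * Illustrative note (not in the original header): a pf_fragment tracks one
 * IPv4 datagram under reassembly; fr_u holds either the buffered fragment
 * mbufs (fru_queue) or, in the non-buffering fragment cache, only the byte
 * ranges already seen (fru_cache).
 */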
#endif /* __FreeBSD__ */
#endif /* _KERNEL */
extern struct pf_anchor_global pf_anchors;
extern struct pf_anchor pf_main_anchor;
#define pf_main_ruleset pf_main_anchor.ruleset
/* these ruleset functions can be linked into userland programs (pfctl) */
int pf_get_ruleset_number(u_int8_t);
void pf_init_ruleset(struct pf_ruleset *);
int pf_anchor_setup(struct pf_rule *,
const struct pf_ruleset *, const char *);
int pf_anchor_copyout(const struct pf_ruleset *,
const struct pf_rule *, struct pfioc_rule *);
void pf_anchor_remove(struct pf_rule *);
void pf_remove_if_empty_ruleset(struct pf_ruleset *);
struct pf_anchor *pf_find_anchor(const char *);
struct pf_ruleset *pf_find_ruleset(const char *);
struct pf_ruleset *pf_find_or_create_ruleset(const char *);
void pf_rs_initialize(void);
#ifndef __FreeBSD__
/* ?!? */
#ifdef _KERNEL
int pf_anchor_copyout(const struct pf_ruleset *,
const struct pf_rule *, struct pfioc_rule *);
void pf_anchor_remove(struct pf_rule *);
#endif /* _KERNEL */
#endif
/* The fingerprint functions can be linked into userland programs (tcpdump) */
int pf_osfp_add(struct pf_osfp_ioctl *);
#ifdef _KERNEL
struct pf_osfp_enlist *
pf_osfp_fingerprint(struct pf_pdesc *, struct mbuf *, int,
const struct tcphdr *);
#endif /* _KERNEL */
struct pf_osfp_enlist *
pf_osfp_fingerprint_hdr(const struct ip *, const struct ip6_hdr *,
const struct tcphdr *);
void pf_osfp_flush(void);
int pf_osfp_get(struct pf_osfp_ioctl *);
#ifdef __FreeBSD__
int pf_osfp_initialize(void);
void pf_osfp_cleanup(void);
#else
void pf_osfp_initialize(void);
#endif
int pf_osfp_match(struct pf_osfp_enlist *, pf_osfp_t);
struct pf_os_fingerprint *
pf_osfp_validate(void);
+/*
+ * Symbol translation macros
+ */
+#define INIT_VNET_PF(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_PF, struct vnet_pf, vnet_pf)
+
+#define VNET_PF(sym) VSYM(vnet_pf, sym)
#endif /* _NET_PFVAR_H_ */
Index: head/sys/contrib/rdma/rdma_cma.c
===================================================================
--- head/sys/contrib/rdma/rdma_cma.c (revision 183549)
+++ head/sys/contrib/rdma/rdma_cma.c (revision 183550)
@@ -1,2999 +1,3003 @@
/*
* Copyright (c) 2005 Voltaire Inc. All rights reserved.
* Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
* Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
*
* This Software is licensed under one of the following licenses:
*
* 1) under the terms of the "Common Public License 1.0" a copy of which is
* available from the Open Source Initiative, see
* http://www.opensource.org/licenses/cpl.php.
*
* 2) under the terms of the "The BSD License" a copy of which is
* available from the Open Source Initiative, see
* http://www.opensource.org/licenses/bsd-license.php.
*
* 3) under the terms of the "GNU General Public License (GPL) Version 2" a
* copy of which is available from the Open Source Initiative, see
* http://www.opensource.org/licenses/gpl-license.php.
*
* Licensee has the right to choose one of the above licenses.
*
* Redistributions of source code must retain the above copyright
* notice and one of the license notices.
*
* Redistributions in binary form must reproduce both the above copyright
* notice, one of the license notices in the documentation
* and/or other materials provided with the distribution.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/socket.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/priv.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
+#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <contrib/rdma/rdma_cm.h>
#include <contrib/rdma/ib_cache.h>
#include <contrib/rdma/ib_cm.h>
#include <contrib/rdma/ib_sa.h>
#include <contrib/rdma/iw_cm.h>
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
static void cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device);
static struct ib_client cma_client = {
.name = "cma",
.add = cma_add_one,
.remove = cma_remove_one
};
#ifdef IB_SUPPORTED
static struct ib_sa_client sa_client;
#endif
static struct rdma_addr_client addr_client;
static TAILQ_HEAD(, cma_device) dev_list;
static LIST_HEAD(, rdma_id_private) listen_any_list;
static struct mtx lock;
static struct taskqueue *cma_wq;
static DEFINE_KVL(sdp_ps);
static DEFINE_KVL(tcp_ps);
static DEFINE_KVL(udp_ps);
static DEFINE_KVL(ipoib_ps);
static int next_port;
struct cma_device {
struct ib_device *device;
struct mtx lock;
struct cv comp;
int refcount;
LIST_HEAD(, rdma_id_private) id_list;
TAILQ_ENTRY(cma_device) list;
};
enum cma_state {
CMA_IDLE,
CMA_ADDR_QUERY,
CMA_ADDR_RESOLVED,
CMA_ROUTE_QUERY,
CMA_ROUTE_RESOLVED,
CMA_CONNECT,
CMA_DISCONNECT,
CMA_ADDR_BOUND,
CMA_LISTEN,
CMA_DEVICE_REMOVAL,
CMA_DESTROYING
};
struct rdma_bind_list {
struct kvl *ps;
TAILQ_HEAD(, rdma_id_private) owners;
unsigned short port;
};
/*
* Device removal can occur at any time, so we need extra handling to
* serialize notifying the user of device removal with other callbacks.
* We do this by disabling removal notification while a callback is in
* progress, and reporting it after the callback completes.
*/
struct rdma_id_private {
struct rdma_cm_id id;
struct rdma_bind_list *bind_list;
struct socket *so;
TAILQ_ENTRY(rdma_id_private) node;
LIST_ENTRY(rdma_id_private) list; /* listen_any_list or cma_dev.list */
LIST_HEAD(, rdma_id_private) listen_list; /* per-device listens */
LIST_ENTRY(rdma_id_private) listen_entry;
struct cma_device *cma_dev;
#ifdef IB_SUPPORTED
LIST_HEAD(, cma_multicast) mc_list;
#endif
enum cma_state state;
struct mtx lock;
struct cv comp;
int refcount;
struct cv wait_remove;
int dev_remove;
int backlog;
int timeout_ms;
struct ib_sa_query *query;
int query_id;
union {
struct ib_cm_id *ib;
struct iw_cm_id *iw;
} cm_id;
u32 seq_num;
u32 qkey;
u32 qp_num;
u8 srq;
};
#ifdef IB_SUPPORTED
struct cma_multicast {
struct rdma_id_private *id_priv;
union {
struct ib_sa_multicast *ib;
} multicast;
struct list_head list;
void *context;
struct sockaddr addr;
u8 pad[sizeof(struct sockaddr_in6) -
sizeof(struct sockaddr)];
};
#endif
struct cma_work {
struct task task;
struct rdma_id_private *id;
enum cma_state old_state;
enum cma_state new_state;
struct rdma_cm_event event;
};
union cma_ip_addr {
struct in6_addr ip6;
struct {
__u32 pad[3];
__u32 addr;
} ip4;
};
struct cma_hdr {
u8 cma_version;
u8 ip_version; /* IP version: 7:4 */
__u16 port;
union cma_ip_addr src_addr;
union cma_ip_addr dst_addr;
};
struct sdp_hh {
u8 bsdh[16];
u8 sdp_version; /* Major version: 7:4 */
u8 ip_version; /* IP version: 7:4 */
u8 sdp_specific1[10];
__u16 port;
__u16 sdp_specific2;
union cma_ip_addr src_addr;
union cma_ip_addr dst_addr;
};
struct sdp_hah {
u8 bsdh[16];
u8 sdp_version;
};
#define CMA_VERSION 0x00
#define SDP_MAJ_VERSION 0x2
static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp)
{
int ret;
mtx_lock(&id_priv->lock);
ret = (id_priv->state == comp);
mtx_unlock(&id_priv->lock);
return ret;
}
static int cma_comp_exch(struct rdma_id_private *id_priv,
enum cma_state comp, enum cma_state exch)
{
int ret;
mtx_lock(&id_priv->lock);
if ((ret = (id_priv->state == comp)))
id_priv->state = exch;
mtx_unlock(&id_priv->lock);
return ret;
}
static enum cma_state cma_exch(struct rdma_id_private *id_priv,
enum cma_state exch)
{
enum cma_state old;
mtx_lock(&id_priv->lock);
old = id_priv->state;
id_priv->state = exch;
mtx_unlock(&id_priv->lock);
return old;
}
static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
{
return hdr->ip_version >> 4;
}
static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
{
hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
}
static inline u8 sdp_get_majv(u8 sdp_version)
{
return sdp_version >> 4;
}
static inline u8 sdp_get_ip_ver(struct sdp_hh *hh)
{
return hh->ip_version >> 4;
}
static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver)
{
hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF);
}
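/*
 * Illustrative note (not in the original source): the "7:4" annotations
 * mean the IP version lives in the high nibble, so cma_set_ip_ver(hdr, 4)
 * on a zeroed header yields ip_version == 0x40 and cma_get_ip_ver()
 * recovers 4 by shifting right.
 */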
static inline int cma_is_ud_ps(enum rdma_port_space ps)
{
return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB);
}
static void cma_attach_to_dev(struct rdma_id_private *id_priv,
struct cma_device *cma_dev)
{
mtx_lock(&cma_dev->lock);
cma_dev->refcount++;
mtx_unlock(&cma_dev->lock);
id_priv->cma_dev = cma_dev;
id_priv->id.device = cma_dev->device;
LIST_INSERT_HEAD(&cma_dev->id_list, id_priv, list);
}
static inline void cma_deref_dev(struct cma_device *cma_dev)
{
mtx_lock(&cma_dev->lock);
if (--cma_dev->refcount == 0)
cv_broadcast(&cma_dev->comp);
mtx_unlock(&cma_dev->lock);
}
static void cma_detach_from_dev(struct rdma_id_private *id_priv)
{
LIST_REMOVE(id_priv, list);
cma_deref_dev(id_priv->cma_dev);
id_priv->cma_dev = NULL;
}
#ifdef IB_SUPPORTED
static int cma_set_qkey(struct ib_device *device, u8 port_num,
enum rdma_port_space ps,
struct rdma_dev_addr *dev_addr, u32 *qkey)
{
struct ib_sa_mcmember_rec rec;
int ret = 0;
switch (ps) {
case RDMA_PS_UDP:
*qkey = RDMA_UDP_QKEY;
break;
case RDMA_PS_IPOIB:
ib_addr_get_mgid(dev_addr, &rec.mgid);
ret = ib_sa_get_mcmember_rec(device, port_num, &rec.mgid, &rec);
*qkey = be32_to_cpu(rec.qkey);
break;
default:
break;
}
return ret;
}
#endif
static int cma_acquire_dev(struct rdma_id_private *id_priv)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct cma_device *cma_dev;
union ib_gid gid;
int ret = ENODEV;
switch (rdma_node_get_transport(dev_addr->dev_type)) {
#ifdef IB_SUPPORTED
case RDMA_TRANSPORT_IB:
ib_addr_get_sgid(dev_addr, &gid);
break;
#endif
case RDMA_TRANSPORT_IWARP:
iw_addr_get_sgid(dev_addr, &gid);
break;
default:
return (ENODEV);
}
TAILQ_FOREACH(cma_dev, &dev_list, list) {
ret = ib_find_cached_gid(cma_dev->device, &gid,
&id_priv->id.port_num, NULL);
if (!ret) {
#ifdef IB_SUPPORTED
ret = cma_set_qkey(cma_dev->device,
id_priv->id.port_num,
id_priv->id.ps, dev_addr,
&id_priv->qkey);
if (!ret)
#endif
cma_attach_to_dev(id_priv, cma_dev);
break;
}
}
return ret;
}
static void cma_deref_id(struct rdma_id_private *id_priv)
{
mtx_lock(&id_priv->lock);
if (--id_priv->refcount == 0) {
cv_broadcast(&id_priv->comp);
}
mtx_unlock(&id_priv->lock);
}
static int cma_disable_remove(struct rdma_id_private *id_priv,
enum cma_state state)
{
int ret;
mtx_lock(&id_priv->lock);
if (id_priv->state == state) {
id_priv->dev_remove++;
ret = 0;
} else
ret = EINVAL;
mtx_unlock(&id_priv->lock);
return ret;
}
static void cma_enable_remove(struct rdma_id_private *id_priv)
{
mtx_lock(&id_priv->lock);
if (--id_priv->dev_remove == 0)
cv_broadcast(&id_priv->wait_remove);
mtx_unlock(&id_priv->lock);
}
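/*
 * Illustrative sketch (not in the original source): event handlers bracket
 * delivery with this pair so device removal cannot complete mid-callback:
 *
 *	if (cma_disable_remove(id_priv, CMA_CONNECT))
 *		return 0;		(removal under way; drop the event)
 *	ret = id_priv->id.event_handler(&id_priv->id, &event);
 *	cma_enable_remove(id_priv);	(may wake a wait_remove waiter)
 */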
static int cma_has_cm_dev(struct rdma_id_private *id_priv)
{
return (id_priv->id.device && id_priv->cm_id.ib);
}
struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps)
{
struct rdma_id_private *id_priv;
id_priv = malloc(sizeof *id_priv, M_DEVBUF, M_NOWAIT);
if (!id_priv)
return ERR_PTR(-ENOMEM);
bzero(id_priv, sizeof *id_priv);
id_priv->state = CMA_IDLE;
id_priv->id.context = context;
id_priv->id.event_handler = event_handler;
id_priv->id.ps = ps;
mtx_init(&id_priv->lock, "rdma_cm_id_priv", NULL, MTX_DUPOK|MTX_DEF);
cv_init(&id_priv->comp, "rdma_cm_id_priv");
id_priv->refcount = 1;
cv_init(&id_priv->wait_remove, "id priv wait remove");
LIST_INIT(&id_priv->listen_list);
arc4rand(&id_priv->seq_num, sizeof id_priv->seq_num, 0);
return &id_priv->id;
}
static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
qp_attr.qp_state = IB_QPS_INIT;
ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
return ret;
ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
if (ret)
return ret;
qp_attr.qp_state = IB_QPS_RTR;
ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
if (ret)
return ret;
qp_attr.qp_state = IB_QPS_RTS;
qp_attr.sq_psn = 0;
ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
return ret;
}
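/*
 * Illustrative note (not in the original source): UD QPs have no peer
 * handshake, so cma_init_ud_qp() walks them straight through
 * INIT -> RTR -> RTS at creation, whereas connected QPs stop at INIT here
 * and finish the transition during connection setup (see
 * cma_modify_qp_rtr()).
 */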
static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
qp_attr.qp_state = IB_QPS_INIT;
ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
if (ret)
return ret;
return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
}
int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
struct ib_qp_init_attr *qp_init_attr)
{
struct rdma_id_private *id_priv;
struct ib_qp *qp;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (id->device != pd->device)
return (EINVAL);
qp = ib_create_qp(pd, qp_init_attr);
if (IS_ERR(qp))
return PTR_ERR(qp);
if (cma_is_ud_ps(id_priv->id.ps))
ret = cma_init_ud_qp(id_priv, qp);
else
ret = cma_init_conn_qp(id_priv, qp);
if (ret)
goto err;
id->qp = qp;
id_priv->qp_num = qp->qp_num;
id_priv->srq = (qp->srq != NULL);
return 0;
err:
ib_destroy_qp(qp);
return ret;
}
void rdma_destroy_qp(struct rdma_cm_id *id)
{
ib_destroy_qp(id->qp);
}
static int cma_modify_qp_rtr(struct rdma_cm_id *id)
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
if (!id->qp)
return 0;
/* Need to update QP attributes from default values. */
qp_attr.qp_state = IB_QPS_INIT;
ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
if (ret)
return ret;
ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
if (ret)
return ret;
qp_attr.qp_state = IB_QPS_RTR;
ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
if (ret)
return ret;
return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
}
#ifdef IB_SUPPORTED
static int cma_modify_qp_rts(struct rdma_cm_id *id)
{
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
if (!id->qp)
return 0;
qp_attr.qp_state = IB_QPS_RTS;
ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
if (ret)
return ret;
return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask);
}
#endif
static int cma_modify_qp_err(struct rdma_cm_id *id)
{
struct ib_qp_attr qp_attr;
if (!id->qp)
return 0;
qp_attr.qp_state = IB_QPS_ERR;
return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE);
}
#ifdef IB_SUPPORTED
static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
struct ib_qp_attr *qp_attr, int *qp_attr_mask)
{
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
int ret;
ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
ib_addr_get_pkey(dev_addr),
&qp_attr->pkey_index);
if (ret)
return ret;
qp_attr->port_num = id_priv->id.port_num;
*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
if (cma_is_ud_ps(id_priv->id.ps)) {
qp_attr->qkey = id_priv->qkey;
*qp_attr_mask |= IB_QP_QKEY;
} else {
qp_attr->qp_access_flags = 0;
*qp_attr_mask |= IB_QP_ACCESS_FLAGS;
}
return 0;
}
#endif
int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
int *qp_attr_mask)
{
struct rdma_id_private *id_priv;
int ret = 0;
id_priv = container_of(id, struct rdma_id_private, id);
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
case RDMA_TRANSPORT_IB:
if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps))
ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
else
ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
qp_attr_mask);
if (qp_attr->qp_state == IB_QPS_RTR)
qp_attr->rq_psn = id_priv->seq_num;
break;
case RDMA_TRANSPORT_IWARP:
#endif
if (!id_priv->cm_id.iw) {
qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE;
*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
} else
ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
qp_attr_mask);
#ifdef IB_SUPPORTED
break;
default:
ret = ENOSYS;
break;
}
#endif
return ret;
}
static inline int cma_zero_addr(struct sockaddr *addr)
{
struct in6_addr *ip6;
if (addr->sa_family == AF_INET)
return in_nullhost(((struct sockaddr_in *) addr)->sin_addr);
else {
ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr;
return (ip6->s6_addr32[0] | ip6->s6_addr32[1] |
ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0;
}
}
static inline int cma_loopback_addr(struct sockaddr *addr)
{
	/* Compare in host order; INADDR_LOOPBACK is a host-order constant. */
	return ntohl(((struct sockaddr_in *)addr)->sin_addr.s_addr) ==
	    INADDR_LOOPBACK;
}
static inline int cma_any_addr(struct sockaddr *addr)
{
return cma_zero_addr(addr) || cma_loopback_addr(addr);
}
static inline __be16 cma_port(struct sockaddr *addr)
{
if (addr->sa_family == AF_INET)
return ((struct sockaddr_in *) addr)->sin_port;
else
return ((struct sockaddr_in6 *) addr)->sin6_port;
}
static inline int cma_any_port(struct sockaddr *addr)
{
return !cma_port(addr);
}
#ifdef IB_SUPPORTED
static int cma_get_net_info(void *hdr, enum rdma_port_space ps,
u8 *ip_ver, __u16 *port,
union cma_ip_addr **src, union cma_ip_addr **dst)
{
switch (ps) {
case RDMA_PS_SDP:
if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) !=
SDP_MAJ_VERSION)
return (EINVAL);
*ip_ver = sdp_get_ip_ver(hdr);
*port = ((struct sdp_hh *) hdr)->port;
*src = &((struct sdp_hh *) hdr)->src_addr;
*dst = &((struct sdp_hh *) hdr)->dst_addr;
break;
default:
if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION)
return (EINVAL);
*ip_ver = cma_get_ip_ver(hdr);
*port = ((struct cma_hdr *) hdr)->port;
*src = &((struct cma_hdr *) hdr)->src_addr;
*dst = &((struct cma_hdr *) hdr)->dst_addr;
break;
}
if (*ip_ver != 4 && *ip_ver != 6)
return (EINVAL);
return 0;
}
static void cma_save_net_info(struct rdma_addr *addr,
struct rdma_addr *listen_addr,
u8 ip_ver, __u16 port,
union cma_ip_addr *src, union cma_ip_addr *dst)
{
struct sockaddr_in *listen4, *ip4;
struct sockaddr_in6 *listen6, *ip6;
switch (ip_ver) {
case 4:
listen4 = (struct sockaddr_in *) &listen_addr->src_addr;
ip4 = (struct sockaddr_in *) &addr->src_addr;
ip4->sin_family = listen4->sin_family;
ip4->sin_addr.s_addr = dst->ip4.addr;
ip4->sin_port = listen4->sin_port;
ip4 = (struct sockaddr_in *) &addr->dst_addr;
ip4->sin_family = listen4->sin_family;
ip4->sin_addr.s_addr = src->ip4.addr;
ip4->sin_port = port;
break;
case 6:
listen6 = (struct sockaddr_in6 *) &listen_addr->src_addr;
ip6 = (struct sockaddr_in6 *) &addr->src_addr;
ip6->sin6_family = listen6->sin6_family;
ip6->sin6_addr = dst->ip6;
ip6->sin6_port = listen6->sin6_port;
ip6 = (struct sockaddr_in6 *) &addr->dst_addr;
ip6->sin6_family = listen6->sin6_family;
ip6->sin6_addr = src->ip6;
ip6->sin6_port = port;
break;
default:
break;
}
}
#endif
static inline int cma_user_data_offset(enum rdma_port_space ps)
{
switch (ps) {
case RDMA_PS_SDP:
return 0;
default:
return sizeof(struct cma_hdr);
}
}
static void cma_cancel_route(struct rdma_id_private *id_priv)
{
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
case RDMA_TRANSPORT_IB:
if (id_priv->query)
ib_sa_cancel_query(id_priv->query_id, id_priv->query);
break;
default:
break;
}
#endif
}
static inline int cma_internal_listen(struct rdma_id_private *id_priv)
{
return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev &&
cma_any_addr(&id_priv->id.route.addr.src_addr);
}
static void cma_destroy_listen(struct rdma_id_private *id_priv)
{
cma_exch(id_priv, CMA_DESTROYING);
if (id_priv->cma_dev) {
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
case RDMA_TRANSPORT_IB:
if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
ib_destroy_cm_id(id_priv->cm_id.ib);
break;
case RDMA_TRANSPORT_IWARP:
#endif
if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
iw_destroy_cm_id(id_priv->cm_id.iw);
#ifdef IB_SUPPORTED
break;
default:
break;
}
#endif
cma_detach_from_dev(id_priv);
}
LIST_REMOVE(id_priv, listen_entry);
cma_deref_id(id_priv);
mtx_lock(&id_priv->lock);
if (id_priv->refcount)
cv_wait(&id_priv->comp, &id_priv->lock);
mtx_unlock(&id_priv->lock);
free(id_priv, M_DEVBUF);
}
static void cma_cancel_listens(struct rdma_id_private *id_priv)
{
struct rdma_id_private *dev_id_priv;
mtx_lock(&lock);
LIST_REMOVE(id_priv, list);
while (!LIST_EMPTY(&id_priv->listen_list)) {
dev_id_priv = LIST_FIRST(&id_priv->listen_list);
cma_destroy_listen(dev_id_priv);
}
mtx_unlock(&lock);
}
static void cma_cancel_operation(struct rdma_id_private *id_priv,
enum cma_state state)
{
switch (state) {
case CMA_ADDR_QUERY:
rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
break;
case CMA_ROUTE_QUERY:
cma_cancel_route(id_priv);
break;
case CMA_LISTEN:
if (cma_any_addr(&id_priv->id.route.addr.src_addr) &&
!id_priv->cma_dev)
cma_cancel_listens(id_priv);
break;
default:
break;
}
}
static void cma_release_port(struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list = id_priv->bind_list;
if (!bind_list)
return;
mtx_lock(&lock);
TAILQ_REMOVE(&bind_list->owners, id_priv, node);
if (TAILQ_EMPTY(&bind_list->owners)) {
kvl_delete(bind_list->ps, bind_list->port);
free(bind_list, M_DEVBUF);
}
mtx_unlock(&lock);
if (id_priv->so)
soclose(id_priv->so);
}
#ifdef IB_SUPPORTED
static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
{
struct cma_multicast *mc;
while (!LIST_EMPTY(&id_priv->mc_list)) {
mc = LIST_FIRST(&id_priv->mc_list);
LIST_REMOVE(mc, list);
ib_sa_free_multicast(mc->multicast.ib);
free(mc, M_DEVBUF);
}
}
#endif
void rdma_destroy_id(struct rdma_cm_id *id)
{
struct rdma_id_private *id_priv;
enum cma_state state;
id_priv = container_of(id, struct rdma_id_private, id);
state = cma_exch(id_priv, CMA_DESTROYING);
cma_cancel_operation(id_priv, state);
mtx_lock(&lock);
if (id_priv->cma_dev) {
mtx_unlock(&lock);
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib))
ib_destroy_cm_id(id_priv->cm_id.ib);
break;
case RDMA_TRANSPORT_IWARP:
#endif
if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw))
iw_destroy_cm_id(id_priv->cm_id.iw);
#ifdef IB_SUPPORTED
break;
default:
break;
}
cma_leave_mc_groups(id_priv);
#endif
mtx_lock(&lock);
cma_detach_from_dev(id_priv);
}
mtx_unlock(&lock);
cma_release_port(id_priv);
cma_deref_id(id_priv);
mtx_lock(&id_priv->lock);
PANIC_IF(id_priv->refcount < 0);
if (id_priv->refcount)
cv_wait(&id_priv->comp, &id_priv->lock);
mtx_unlock(&id_priv->lock);
free(id_priv->id.route.path_rec, M_DEVBUF);
free(id_priv, M_DEVBUF);
}
#ifdef IB_SUPPORTED
static int cma_rep_recv(struct rdma_id_private *id_priv)
{
int ret;
ret = cma_modify_qp_rtr(&id_priv->id);
if (ret)
goto reject;
ret = cma_modify_qp_rts(&id_priv->id);
if (ret)
goto reject;
ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
if (ret)
goto reject;
return 0;
reject:
cma_modify_qp_err(&id_priv->id);
ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
NULL, 0, NULL, 0);
return ret;
}
static int cma_verify_rep(struct rdma_id_private *id_priv, void *data)
{
if (id_priv->id.ps == RDMA_PS_SDP &&
sdp_get_majv(((struct sdp_hah *) data)->sdp_version) !=
SDP_MAJ_VERSION)
return (EINVAL);
return 0;
}
static void cma_set_rep_event_data(struct rdma_cm_event *event,
struct ib_cm_rep_event_param *rep_data,
void *private_data)
{
event->param.conn.private_data = private_data;
event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
event->param.conn.responder_resources = rep_data->responder_resources;
event->param.conn.initiator_depth = rep_data->initiator_depth;
event->param.conn.flow_control = rep_data->flow_control;
event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
event->param.conn.srq = rep_data->srq;
event->param.conn.qp_num = rep_data->remote_qpn;
}
static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
struct rdma_id_private *id_priv = cm_id->context;
struct rdma_cm_event event;
int ret = 0;
if (cma_disable_remove(id_priv, CMA_CONNECT))
return 0;
memset(&event, 0, sizeof event);
switch (ib_event->event) {
case IB_CM_REQ_ERROR:
case IB_CM_REP_ERROR:
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = ETIMEDOUT;
break;
case IB_CM_REP_RECEIVED:
event.status = cma_verify_rep(id_priv, ib_event->private_data);
if (event.status)
event.event = RDMA_CM_EVENT_CONNECT_ERROR;
else if (id_priv->id.qp && id_priv->id.ps != RDMA_PS_SDP) {
event.status = cma_rep_recv(id_priv);
event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
RDMA_CM_EVENT_ESTABLISHED;
} else
event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
ib_event->private_data);
break;
case IB_CM_RTU_RECEIVED:
case IB_CM_USER_ESTABLISHED:
event.event = RDMA_CM_EVENT_ESTABLISHED;
break;
case IB_CM_DREQ_ERROR:
event.status = ETIMEDOUT; /* fall through */
case IB_CM_DREQ_RECEIVED:
case IB_CM_DREP_RECEIVED:
if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT))
goto out;
event.event = RDMA_CM_EVENT_DISCONNECTED;
break;
case IB_CM_TIMEWAIT_EXIT:
case IB_CM_MRA_RECEIVED:
/* ignore event */
goto out;
case IB_CM_REJ_RECEIVED:
cma_modify_qp_err(&id_priv->id);
event.status = ib_event->param.rej_rcvd.reason;
event.event = RDMA_CM_EVENT_REJECTED;
event.param.conn.private_data = ib_event->private_data;
event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
break;
default:
log(LOG_ERR, "RDMA CMA: unexpected IB CM event: %d",
ib_event->event);
goto out;
}
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
cma_exch(id_priv, CMA_DESTROYING);
cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return ret;
}
out:
cma_enable_remove(id_priv);
return ret;
}
static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
struct ib_cm_event *ib_event)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
struct rdma_route *rt;
union cma_ip_addr *src, *dst;
__u16 port;
u8 ip_ver;
if (cma_get_net_info(ib_event->private_data, listen_id->ps,
&ip_ver, &port, &src, &dst))
goto err;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
listen_id->ps);
if (IS_ERR(id))
goto err;
cma_save_net_info(&id->route.addr, &listen_id->route.addr,
ip_ver, port, src, dst);
rt = &id->route;
rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
rt->path_rec = malloc(sizeof *rt->path_rec * rt->num_paths,
M_DEVBUF, M_NOWAIT);
if (!rt->path_rec)
goto destroy_id;
rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
if (rt->num_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
ib_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
ib_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
rt->addr.dev_addr.dev_type = RDMA_NODE_IB_CA;
id_priv = container_of(id, struct rdma_id_private, id);
id_priv->state = CMA_CONNECT;
return id_priv;
destroy_id:
rdma_destroy_id(id);
err:
return NULL;
}
static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
struct ib_cm_event *ib_event)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
union cma_ip_addr *src, *dst;
__u16 port;
u8 ip_ver;
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
listen_id->ps);
if (IS_ERR(id))
return NULL;
if (cma_get_net_info(ib_event->private_data, listen_id->ps,
&ip_ver, &port, &src, &dst))
goto err;
cma_save_net_info(&id->route.addr, &listen_id->route.addr,
ip_ver, port, src, dst);
ret = rdma_translate_ip(&id->route.addr.src_addr,
&id->route.addr.dev_addr);
if (ret)
goto err;
id_priv = container_of(id, struct rdma_id_private, id);
id_priv->state = CMA_CONNECT;
return id_priv;
err:
rdma_destroy_id(id);
return NULL;
}
static void cma_set_req_event_data(struct rdma_cm_event *event,
struct ib_cm_req_event_param *req_data,
void *private_data, int offset)
{
event->param.conn.private_data = private_data + offset;
event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
event->param.conn.responder_resources = req_data->responder_resources;
event->param.conn.initiator_depth = req_data->initiator_depth;
event->param.conn.flow_control = req_data->flow_control;
event->param.conn.retry_count = req_data->retry_count;
event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
event->param.conn.srq = req_data->srq;
event->param.conn.qp_num = req_data->remote_qpn;
}
static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
int offset, ret;
listen_id = cm_id->context;
if (cma_disable_remove(listen_id, CMA_LISTEN))
return (ECONNABORTED);
memset(&event, 0, sizeof event);
offset = cma_user_data_offset(listen_id->id.ps);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
if (cma_is_ud_ps(listen_id->id.ps)) {
conn_id = cma_new_udp_id(&listen_id->id, ib_event);
event.param.ud.private_data = ib_event->private_data + offset;
event.param.ud.private_data_len =
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
} else {
conn_id = cma_new_conn_id(&listen_id->id, ib_event);
cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
ib_event->private_data, offset);
}
if (!conn_id) {
ret = ENOMEM;
goto out;
}
mtx_lock(&conn_id->lock);
conn_id->dev_remove++;
mtx_unlock(&conn_id->lock);
mtx_lock(&lock);
ret = cma_acquire_dev(conn_id);
mtx_unlock(&lock);
if (ret)
goto release_conn_id;
conn_id->cm_id.ib = cm_id;
cm_id->context = conn_id;
cm_id->cm_handler = cma_ib_handler;
ret = conn_id->id.event_handler(&conn_id->id, &event);
if (!ret)
goto out;
/* Destroy the CM ID by returning a non-zero value. */
conn_id->cm_id.ib = NULL;
release_conn_id:
cma_exch(conn_id, CMA_DESTROYING);
cma_enable_remove(conn_id);
rdma_destroy_id(&conn_id->id);
out:
cma_enable_remove(listen_id);
return ret;
}
static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr)
{
return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr)));
}
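/*
 * Illustrative note (not in the original source): the IB service ID simply
 * concatenates the port space and the 16-bit port, e.g. an RDMA_PS_TCP
 * listener on port 5000 registers service ID ((u64)RDMA_PS_TCP << 16) +
 * 5000, letting incoming REQs demultiplex back to the right listener.
 */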
static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
struct ib_cm_compare_data *compare)
{
struct cma_hdr *cma_data, *cma_mask;
struct sdp_hh *sdp_data, *sdp_mask;
__u32 ip4_addr;
struct in6_addr ip6_addr;
memset(compare, 0, sizeof *compare);
cma_data = (void *) compare->data;
cma_mask = (void *) compare->mask;
sdp_data = (void *) compare->data;
sdp_mask = (void *) compare->mask;
switch (addr->sa_family) {
case AF_INET:
ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
if (ps == RDMA_PS_SDP) {
sdp_set_ip_ver(sdp_data, 4);
sdp_set_ip_ver(sdp_mask, 0xF);
sdp_data->dst_addr.ip4.addr = ip4_addr;
sdp_mask->dst_addr.ip4.addr = ~0;
} else {
cma_set_ip_ver(cma_data, 4);
cma_set_ip_ver(cma_mask, 0xF);
cma_data->dst_addr.ip4.addr = ip4_addr;
cma_mask->dst_addr.ip4.addr = ~0;
}
break;
case AF_INET6:
ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
if (ps == RDMA_PS_SDP) {
sdp_set_ip_ver(sdp_data, 6);
sdp_set_ip_ver(sdp_mask, 0xF);
sdp_data->dst_addr.ip6 = ip6_addr;
memset(&sdp_mask->dst_addr.ip6, 0xFF,
sizeof sdp_mask->dst_addr.ip6);
} else {
cma_set_ip_ver(cma_data, 6);
cma_set_ip_ver(cma_mask, 0xF);
cma_data->dst_addr.ip6 = ip6_addr;
memset(&cma_mask->dst_addr.ip6, 0xFF,
sizeof cma_mask->dst_addr.ip6);
}
break;
default:
break;
}
}
#endif /* IB_SUPPORTED */
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
{
struct rdma_id_private *id_priv = iw_id->context;
struct rdma_cm_event event;
struct sockaddr_in *sin;
int ret = 0;
if (cma_disable_remove(id_priv, CMA_CONNECT))
return 0;
memset(&event, 0, sizeof event);
switch (iw_event->event) {
case IW_CM_EVENT_CLOSE:
event.event = RDMA_CM_EVENT_DISCONNECTED;
break;
case IW_CM_EVENT_CONNECT_REPLY:
sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
*sin = iw_event->local_addr;
sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr;
*sin = iw_event->remote_addr;
switch (iw_event->status) {
case 0:
event.event = RDMA_CM_EVENT_ESTABLISHED;
break;
case ECONNRESET:
case ECONNREFUSED:
event.event = RDMA_CM_EVENT_REJECTED;
break;
case ETIMEDOUT:
event.event = RDMA_CM_EVENT_UNREACHABLE;
break;
default:
event.event = RDMA_CM_EVENT_CONNECT_ERROR;
break;
}
break;
case IW_CM_EVENT_ESTABLISHED:
event.event = RDMA_CM_EVENT_ESTABLISHED;
break;
default:
panic("unknown event type %d", iw_event->event);
}
event.status = iw_event->status;
event.param.conn.private_data = iw_event->private_data;
event.param.conn.private_data_len = iw_event->private_data_len;
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.iw = NULL;
cma_exch(id_priv, CMA_DESTROYING);
cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return ret;
}
cma_enable_remove(id_priv);
return ret;
}
static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct iw_cm_event *iw_event)
{
struct rdma_cm_id *new_cm_id;
struct rdma_id_private *listen_id, *conn_id;
struct sockaddr_in *sin;
struct ifnet *dev;
struct rdma_cm_event event;
int ret;
struct ifaddr *ifa;
uint16_t port;
listen_id = cm_id->context;
if (cma_disable_remove(listen_id, CMA_LISTEN))
return (ECONNABORTED);
/* Create a new RDMA id for the new IW CM ID */
new_cm_id = rdma_create_id(listen_id->id.event_handler,
listen_id->id.context,
RDMA_PS_TCP);
	if (IS_ERR(new_cm_id)) {
ret = ENOMEM;
goto out;
}
conn_id = container_of(new_cm_id, struct rdma_id_private, id);
mtx_lock(&conn_id->lock);
++conn_id->dev_remove;
mtx_unlock(&conn_id->lock);
conn_id->state = CMA_CONNECT;
port = iw_event->local_addr.sin_port;
iw_event->local_addr.sin_port = 0;
ifa = ifa_ifwithaddr((struct sockaddr *)&iw_event->local_addr);
iw_event->local_addr.sin_port = port;
if (!ifa) {
ret = EADDRNOTAVAIL;
cma_enable_remove(conn_id);
rdma_destroy_id(new_cm_id);
goto out;
}
dev = ifa->ifa_ifp;
ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL);
if (ret) {
cma_enable_remove(conn_id);
rdma_destroy_id(new_cm_id);
goto out;
}
mtx_lock(&lock);
ret = cma_acquire_dev(conn_id);
mtx_unlock(&lock);
if (ret) {
cma_enable_remove(conn_id);
rdma_destroy_id(new_cm_id);
goto out;
}
conn_id->cm_id.iw = cm_id;
cm_id->context = conn_id;
cm_id->cm_handler = cma_iw_handler;
sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr;
*sin = iw_event->local_addr;
sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
*sin = iw_event->remote_addr;
conn_id->so = cm_id->so;
memset(&event, 0, sizeof event);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
event.param.conn.private_data = iw_event->private_data;
event.param.conn.private_data_len = iw_event->private_data_len;
ret = conn_id->id.event_handler(&conn_id->id, &event);
if (ret) {
/* User wants to destroy the CM ID */
conn_id->cm_id.iw = NULL;
cma_exch(conn_id, CMA_DESTROYING);
cma_enable_remove(conn_id);
rdma_destroy_id(&conn_id->id);
}
out:
cma_enable_remove(listen_id);
return ret;
}
#ifdef IB_SUPPORTED
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
__be64 svc_id;
int ret;
id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler,
id_priv);
if (IS_ERR(id_priv->cm_id.ib))
return PTR_ERR(id_priv->cm_id.ib);
addr = &id_priv->id.route.addr.src_addr;
svc_id = cma_get_service_id(id_priv->id.ps, addr);
if (cma_any_addr(addr))
ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
else {
cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
}
if (ret) {
ib_destroy_cm_id(id_priv->cm_id.ib);
id_priv->cm_id.ib = NULL;
}
return ret;
}
#endif
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
{
int ret;
struct sockaddr_in *sin;
id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, id_priv->so,
iw_conn_req_handler, id_priv);
if (IS_ERR(id_priv->cm_id.iw))
return PTR_ERR(id_priv->cm_id.iw);
sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
id_priv->cm_id.iw->local_addr = *sin;
ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
if (ret) {
iw_destroy_cm_id(id_priv->cm_id.iw);
id_priv->cm_id.iw = NULL;
}
return ret;
}
static int cma_listen_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event)
{
struct rdma_id_private *id_priv = id->context;
id->context = id_priv->id.context;
id->event_handler = id_priv->id.event_handler;
return id_priv->id.event_handler(id, event);
}
static void cma_listen_on_dev(struct rdma_id_private *id_priv,
struct cma_device *cma_dev)
{
struct rdma_id_private *dev_id_priv;
struct rdma_cm_id *id;
int ret;
id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps);
if (IS_ERR(id))
return;
dev_id_priv = container_of(id, struct rdma_id_private, id);
dev_id_priv->state = CMA_ADDR_BOUND;
memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr,
ip_addr_size(&id_priv->id.route.addr.src_addr));
dev_id_priv->so = id_priv->so; /* XXX */
cma_attach_to_dev(dev_id_priv, cma_dev);
LIST_INSERT_HEAD(&id_priv->listen_list, dev_id_priv, listen_entry);
ret = rdma_listen(id, id_priv->backlog);
if (ret)
goto err;
return;
err:
cma_destroy_listen(dev_id_priv);
}
static void cma_listen_on_all(struct rdma_id_private *id_priv)
{
struct cma_device *cma_dev;
mtx_lock(&lock);
LIST_INSERT_HEAD(&listen_any_list, id_priv, list);
TAILQ_FOREACH(cma_dev, &dev_list, list)
cma_listen_on_dev(id_priv, cma_dev);
mtx_unlock(&lock);
}
static int cma_bind_any(struct rdma_cm_id *id, sa_family_t af)
{
struct sockaddr_in addr_in;
memset(&addr_in, 0, sizeof addr_in);
addr_in.sin_family = af;
addr_in.sin_len = sizeof addr_in;
return rdma_bind_addr(id, (struct sockaddr *) &addr_in);
}
int rdma_listen(struct rdma_cm_id *id, int backlog)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (id_priv->state == CMA_IDLE) {
ret = cma_bind_any(id, AF_INET);
if (ret)
return ret;
}
if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
return (EINVAL);
id_priv->backlog = backlog;
if (id->device) {
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
ret = cma_ib_listen(id_priv);
if (ret)
goto err;
break;
case RDMA_TRANSPORT_IWARP:
#endif
ret = cma_iw_listen(id_priv, backlog);
if (ret)
goto err;
#ifdef IB_SUPPORTED
break;
default:
ret = ENOSYS;
goto err;
}
#endif
} else
cma_listen_on_all(id_priv);
return 0;
err:
id_priv->backlog = 0;
cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
return ret;
}
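/*
 * Illustrative note (not in the original source): an id bound to a
 * wildcard address has no device yet, so rdma_listen() takes the
 * cma_listen_on_all() branch, cloning a per-device listener for every
 * known cma_device and funneling their events back through
 * cma_listen_handler().
 */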
#ifdef IB_SUPPORTED
static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
void *context)
{
struct cma_work *work = context;
struct rdma_route *route;
route = &work->id->id.route;
if (!status) {
route->num_paths = 1;
*route->path_rec = *path_rec;
} else {
work->old_state = CMA_ROUTE_QUERY;
work->new_state = CMA_ADDR_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
work->event.status = status;
}
taskqueue_enqueue(cma_wq, &work->task);
}
static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
struct cma_work *work)
{
struct rdma_dev_addr *addr = &id_priv->id.route.addr.dev_addr;
struct ib_sa_path_rec path_rec;
memset(&path_rec, 0, sizeof path_rec);
ib_addr_get_sgid(addr, &path_rec.sgid);
ib_addr_get_dgid(addr, &path_rec.dgid);
path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(addr));
path_rec.numb_path = 1;
path_rec.reversible = 1;
id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
id_priv->id.port_num, &path_rec,
IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
IB_SA_PATH_REC_REVERSIBLE,
timeout_ms, M_NOWAIT,
cma_query_handler, work, &id_priv->query);
return (id_priv->query_id < 0) ? id_priv->query_id : 0;
}
#endif
static void cma_work_handler(void *context, int pending)
{
struct cma_work *work = context;
struct rdma_id_private *id_priv = work->id;
int destroy = 0;
mtx_lock(&id_priv->lock);
++id_priv->dev_remove;
mtx_unlock(&id_priv->lock);
if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
goto out;
if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
cma_exch(id_priv, CMA_DESTROYING);
destroy = 1;
}
out:
cma_enable_remove(id_priv);
cma_deref_id(id_priv);
if (destroy)
rdma_destroy_id(&id_priv->id);
free(work, M_DEVBUF);
}
#ifdef IB_SUPPORTED
static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
{
struct rdma_route *route = &id_priv->id.route;
struct cma_work *work;
int ret;
work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
if (!work)
return (ENOMEM);
bzero(work, sizeof *work);
work->id = id_priv;
TASK_INIT(&work->task, 0, cma_work_handler, work);
work->old_state = CMA_ROUTE_QUERY;
work->new_state = CMA_ROUTE_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
route->path_rec = malloc(sizeof *route->path_rec, M_DEVBUF, M_NOWAIT);
if (!route->path_rec) {
ret = ENOMEM;
goto err1;
}
ret = cma_query_ib_route(id_priv, timeout_ms, work);
if (ret)
goto err2;
return 0;
err2:
free(route->path_rec, M_DEVBUF);
route->path_rec = NULL;
err1:
free(work, M_DEVBUF);
return ret;
}
int rdma_set_ib_paths(struct rdma_cm_id *id,
struct ib_sa_path_rec *path_rec, int num_paths)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED))
return (EINVAL);
id->route.path_rec = malloc(sizeof *path_rec * num_paths, M_DEVBUF, M_NOWAIT);
if (!id->route.path_rec) {
ret = ENOMEM;
goto err;
}
memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths);
return 0;
err:
cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED);
return ret;
}
#endif
static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
{
struct cma_work *work;
work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
if (!work)
return (ENOMEM);
bzero(work, sizeof *work);
work->id = id_priv;
TASK_INIT(&work->task, 0, cma_work_handler, work);
work->old_state = CMA_ROUTE_QUERY;
work->new_state = CMA_ROUTE_RESOLVED;
work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
taskqueue_enqueue(cma_wq, &work->task);
return 0;
}
int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY))
return (EINVAL);
mtx_lock(&id_priv->lock);
id_priv->refcount++;
mtx_unlock(&id_priv->lock);
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
ret = cma_resolve_ib_route(id_priv, timeout_ms);
break;
case RDMA_TRANSPORT_IWARP:
#endif
ret = cma_resolve_iw_route(id_priv, timeout_ms);
#ifdef IB_SUPPORTED
break;
default:
ret = ENOSYS;
break;
}
#endif
if (ret)
goto err;
return 0;
err:
cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED);
cma_deref_id(id_priv);
return ret;
}
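/*
 * Note the transport split above: IB resolves a route by querying the
 * subnet administrator for a path record and completes asynchronously
 * in cma_query_handler(), while iWARP has no fabric-level path to look
 * up, so cma_resolve_iw_route() just queues a task that immediately
 * reports RDMA_CM_EVENT_ROUTE_RESOLVED.
 */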
static int cma_bind_loopback(struct rdma_id_private *id_priv)
{
struct cma_device *cma_dev;
struct ib_port_attr port_attr;
union ib_gid gid;
u16 pkey;
int ret;
u8 p;
mtx_lock(&lock);
if (TAILQ_EMPTY(&dev_list)) {
ret = ENODEV;
goto out;
}
TAILQ_FOREACH(cma_dev, &dev_list, list)
for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p)
if (!ib_query_port(cma_dev->device, p, &port_attr) &&
port_attr.state == IB_PORT_ACTIVE)
goto port_found;
p = 1;
cma_dev = TAILQ_FIRST(&dev_list);
port_found:
ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
if (ret)
goto out;
ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
if (ret)
goto out;
ib_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
id_priv->id.port_num = p;
cma_attach_to_dev(id_priv, cma_dev);
out:
mtx_unlock(&lock);
return ret;
}
static void addr_handler(int status, struct sockaddr *src_addr,
struct rdma_dev_addr *dev_addr, void *context)
{
struct rdma_id_private *id_priv = context;
struct rdma_cm_event event;
memset(&event, 0, sizeof event);
mtx_lock(&id_priv->lock);
++id_priv->dev_remove;
mtx_unlock(&id_priv->lock);
/*
* Grab mutex to block rdma_destroy_id() from removing the device while
* we're trying to acquire it.
*/
mtx_lock(&lock);
if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) {
mtx_unlock(&lock);
goto out;
}
if (!status && !id_priv->cma_dev)
status = cma_acquire_dev(id_priv);
mtx_unlock(&lock);
if (status) {
if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND))
goto out;
event.event = RDMA_CM_EVENT_ADDR_ERROR;
event.status = status;
} else {
memcpy(&id_priv->id.route.addr.src_addr, src_addr,
ip_addr_size(src_addr));
event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
}
if (id_priv->id.event_handler(&id_priv->id, &event)) {
cma_exch(id_priv, CMA_DESTROYING);
cma_enable_remove(id_priv);
cma_deref_id(id_priv);
rdma_destroy_id(&id_priv->id);
return;
}
out:
cma_enable_remove(id_priv);
cma_deref_id(id_priv);
}
static int cma_resolve_loopback(struct rdma_id_private *id_priv)
{
struct cma_work *work;
struct sockaddr_in *src_in, *dst_in;
union ib_gid gid;
int ret;
work = malloc(sizeof *work, M_DEVBUF, M_NOWAIT);
if (!work)
return (ENOMEM);
bzero(work, sizeof *work);
if (!id_priv->cma_dev) {
ret = cma_bind_loopback(id_priv);
if (ret)
goto err;
}
ib_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
ib_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
if (cma_zero_addr(&id_priv->id.route.addr.src_addr)) {
src_in = (struct sockaddr_in *)&id_priv->id.route.addr.src_addr;
dst_in = (struct sockaddr_in *)&id_priv->id.route.addr.dst_addr;
src_in->sin_family = dst_in->sin_family;
src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr;
}
work->id = id_priv;
TASK_INIT(&work->task, 0, cma_work_handler, work);
work->old_state = CMA_ADDR_QUERY;
work->new_state = CMA_ADDR_RESOLVED;
work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
taskqueue_enqueue(cma_wq, &work->task);
return 0;
err:
free(work, M_DEVBUF);
return ret;
}
static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
struct sockaddr *dst_addr)
{
if (src_addr && src_addr->sa_family)
return rdma_bind_addr(id, src_addr);
else
return cma_bind_any(id, dst_addr->sa_family);
}
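/*
 * Typical active-side sequence (a sketch; real consumers drive this
 * from their event handler): rdma_resolve_addr() below starts address
 * resolution, RDMA_CM_EVENT_ADDR_RESOLVED triggers a call to
 * rdma_resolve_route(), and RDMA_CM_EVENT_ROUTE_RESOLVED finally
 * allows rdma_connect().
 */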
int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
struct sockaddr *dst_addr, int timeout_ms)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (id_priv->state == CMA_IDLE) {
ret = cma_bind_addr(id, src_addr, dst_addr);
if (ret)
return ret;
}
if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY))
return (EINVAL);
mtx_lock(&id_priv->lock);
id_priv->refcount++;
mtx_unlock(&id_priv->lock);
memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
if (cma_any_addr(dst_addr))
ret = cma_resolve_loopback(id_priv);
else
ret = rdma_resolve_ip(&addr_client, &id->route.addr.src_addr,
dst_addr, &id->route.addr.dev_addr,
timeout_ms, addr_handler, id_priv);
if (ret)
goto err;
return 0;
err:
cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND);
cma_deref_id(id_priv);
return ret;
}
static void cma_bind_port(struct rdma_bind_list *bind_list,
struct rdma_id_private *id_priv)
{
struct sockaddr_in *sin;
sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
sin->sin_port = htons(bind_list->port);
id_priv->bind_list = bind_list;
TAILQ_INSERT_HEAD(&bind_list->owners, id_priv, node);
}
static int cma_alloc_port(struct kvl *ps, struct rdma_id_private *id_priv,
unsigned short snum)
{
struct rdma_bind_list *bind_list;
int port, ret;
bind_list = malloc(sizeof *bind_list, M_DEVBUF, M_NOWAIT);
if (!bind_list)
return (ENOMEM);
bzero(bind_list, sizeof *bind_list);
do {
ret = kvl_alloc_above(ps, bind_list, snum, &port);
} while (ret == EAGAIN);
if (ret)
goto err1;
if (port != snum) {
ret = EADDRNOTAVAIL;
goto err2;
}
bind_list->ps = ps;
bind_list->port = (unsigned short) port;
cma_bind_port(bind_list, id_priv);
return 0;
err2:
kvl_delete(ps, port);
err1:
free(bind_list, M_DEVBUF);
return ret;
}
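/*
 * cma_alloc_any_port() below draws ephemeral ports from the same range
 * the native stack advertises through net.inet.ip.portrange.first/last:
 * the scan wraps back to V_ipport_firstauto after passing
 * V_ipport_lastauto, and a full unsuccessful wrap ends in
 * EADDRNOTAVAIL.
 */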
static int cma_alloc_any_port(struct kvl *ps, struct rdma_id_private *id_priv)
{
+ INIT_VNET_INET(curvnet);
struct rdma_bind_list *bind_list;
int port, ret;
bind_list = malloc(sizeof *bind_list, M_DEVBUF, M_NOWAIT);
if (!bind_list)
return (ENOMEM);
bzero(bind_list, sizeof *bind_list);
retry:
do {
ret = kvl_alloc_above(ps, bind_list, next_port, &port);
} while (ret == EAGAIN);
if (ret)
goto err1;
if (port > V_ipport_lastauto) {
if (next_port != V_ipport_firstauto) {
kvl_delete(ps, port);
next_port = V_ipport_firstauto;
goto retry;
}
ret = EADDRNOTAVAIL;
goto err2;
}
if (port == V_ipport_lastauto)
next_port = V_ipport_firstauto;
else
next_port = port + 1;
bind_list->ps = ps;
bind_list->port = (unsigned short) port;
cma_bind_port(bind_list, id_priv);
return 0;
err2:
kvl_delete(ps, port);
err1:
free(bind_list, M_DEVBUF);
return ret;
}
static int cma_use_port(struct kvl *ps, struct rdma_id_private *id_priv)
{
+ INIT_VNET_INET(curvnet);
struct rdma_id_private *cur_id;
struct sockaddr_in *sin, *cur_sin;
struct rdma_bind_list *bind_list;
unsigned short snum;
sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr;
snum = ntohs(sin->sin_port);
if (snum <= V_ipport_reservedhigh && snum >= V_ipport_reservedlow &&
priv_check(curthread, PRIV_NETINET_RESERVEDPORT))
return (EACCES);
bind_list = kvl_lookup(ps, snum);
if (!bind_list)
return cma_alloc_port(ps, id_priv, snum);
/*
* We don't support binding to any address if anyone is bound to
* a specific address on the same port.
*/
if (cma_any_addr(&id_priv->id.route.addr.src_addr))
return (EADDRNOTAVAIL);
TAILQ_FOREACH(cur_id, &bind_list->owners, node) {
if (cma_any_addr(&cur_id->id.route.addr.src_addr))
return (EADDRNOTAVAIL);
cur_sin = (struct sockaddr_in *)&cur_id->id.route.addr.src_addr;
if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr)
return (EADDRINUSE);
}
cma_bind_port(bind_list, id_priv);
return 0;
}
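/*
 * For RDMA_PS_TCP, cma_get_tcp_port() below additionally creates and
 * binds a native TCP socket on the same address, reserving the port in
 * the host stack so it cannot also be handed to an ordinary TCP
 * consumer while the RDMA id holds it.
 */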
static int cma_get_tcp_port(struct rdma_id_private *id_priv)
{
int ret;
struct socket *so;
ret = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
curthread->td_ucred, curthread);
if (ret) {
printf("%s socreate err %d\n", __FUNCTION__, ret);
return ret;
}
ret = sobind(so, (struct sockaddr *)&id_priv->id.route.addr.src_addr,
curthread);
if (ret) {
soclose(so);
return ret;
}
id_priv->so = so;
return 0;
}
static int cma_get_port(struct rdma_id_private *id_priv)
{
struct kvl *ps;
int ret;
switch (id_priv->id.ps) {
case RDMA_PS_SDP:
ps = &sdp_ps;
break;
case RDMA_PS_TCP:
ps = &tcp_ps;
ret = cma_get_tcp_port(id_priv); /* Synch with native stack */
if (ret)
return ret;
break;
case RDMA_PS_UDP:
ps = &udp_ps;
break;
case RDMA_PS_IPOIB:
ps = &ipoib_ps;
break;
default:
return (EPROTONOSUPPORT);
}
mtx_lock(&lock);
if (cma_any_port(&id_priv->id.route.addr.src_addr))
ret = cma_alloc_any_port(ps, id_priv);
else
ret = cma_use_port(ps, id_priv);
mtx_unlock(&lock);
return ret;
}
int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
struct rdma_id_private *id_priv;
int ret;
if (addr->sa_family != AF_INET)
return (EAFNOSUPPORT);
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
return (EINVAL);
if (!cma_any_addr(addr)) {
ret = rdma_translate_ip(addr, &id->route.addr.dev_addr);
if (ret)
goto err1;
mtx_lock(&lock);
ret = cma_acquire_dev(id_priv);
mtx_unlock(&lock);
if (ret)
goto err1;
}
memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr));
ret = cma_get_port(id_priv);
if (ret)
goto err2;
return 0;
err2:
if (!cma_any_addr(addr)) {
mtx_lock(&lock);
cma_detach_from_dev(id_priv);
mtx_unlock(&lock);
}
err1:
cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE);
return ret;
}
#ifdef IB_SUPPORTED
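/*
 * cma_format_hdr() builds the addressing header that rides at the front
 * of the IB CM private data: an SDP hello header for RDMA_PS_SDP,
 * otherwise a generic cma_hdr carrying the CMA version, IP version,
 * IPv4 source/destination addresses, and source port for the passive
 * side to parse.
 */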
static int cma_format_hdr(void *hdr, enum rdma_port_space ps,
struct rdma_route *route)
{
struct sockaddr_in *src4, *dst4;
struct cma_hdr *cma_hdr;
struct sdp_hh *sdp_hdr;
src4 = (struct sockaddr_in *) &route->addr.src_addr;
dst4 = (struct sockaddr_in *) &route->addr.dst_addr;
switch (ps) {
case RDMA_PS_SDP:
sdp_hdr = hdr;
if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION)
return (EINVAL);
sdp_set_ip_ver(sdp_hdr, 4);
sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
sdp_hdr->port = src4->sin_port;
break;
default:
cma_hdr = hdr;
cma_hdr->cma_version = CMA_VERSION;
cma_set_ip_ver(cma_hdr, 4);
cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
cma_hdr->port = src4->sin_port;
break;
}
return 0;
}
static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
struct ib_cm_event *ib_event)
{
struct rdma_id_private *id_priv = cm_id->context;
struct rdma_cm_event event;
struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
int ret = 0;
if (cma_disable_remove(id_priv, CMA_CONNECT))
return 0;
memset(&event, 0, sizeof event);
switch (ib_event->event) {
case IB_CM_SIDR_REQ_ERROR:
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = ETIMEDOUT;
break;
case IB_CM_SIDR_REP_RECEIVED:
event.param.ud.private_data = ib_event->private_data;
event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
if (rep->status != IB_SIDR_SUCCESS) {
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = ib_event->param.sidr_rep_rcvd.status;
break;
}
if (id_priv->qkey != rep->qkey) {
event.event = RDMA_CM_EVENT_UNREACHABLE;
event.status = EINVAL;
break;
}
ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
id_priv->id.route.path_rec,
&event.param.ud.ah_attr);
event.param.ud.qp_num = rep->qpn;
event.param.ud.qkey = rep->qkey;
event.event = RDMA_CM_EVENT_ESTABLISHED;
event.status = 0;
break;
default:
log(LOG_ERR, "RDMA CMA: unexpected IB CM event: %d",
ib_event->event);
goto out;
}
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
/* Destroy the CM ID by returning a non-zero value. */
id_priv->cm_id.ib = NULL;
cma_exch(id_priv, CMA_DESTROYING);
cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return ret;
}
out:
cma_enable_remove(id_priv);
return ret;
}
static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct ib_cm_sidr_req_param req;
struct rdma_route *route;
int ret;
req.private_data_len = sizeof(struct cma_hdr) +
conn_param->private_data_len;
req.private_data = malloc(req.private_data_len, M_DEVBUF, M_NOWAIT);
if (!req.private_data)
return (ENOMEM);
bzero((void *)req.private_data, req.private_data_len);
if (conn_param->private_data && conn_param->private_data_len)
memcpy((caddr_t) req.private_data + sizeof(struct cma_hdr),
conn_param->private_data, conn_param->private_data_len);
route = &id_priv->id.route;
ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route);
if (ret)
goto out;
id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device,
cma_sidr_rep_handler, id_priv);
if (IS_ERR(id_priv->cm_id.ib)) {
ret = PTR_ERR(id_priv->cm_id.ib);
goto out;
}
req.path = route->path_rec;
req.service_id = cma_get_service_id(id_priv->id.ps,
&route->addr.dst_addr);
req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
req.max_cm_retries = CMA_MAX_CM_RETRIES;
ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
if (ret) {
ib_destroy_cm_id(id_priv->cm_id.ib);
id_priv->cm_id.ib = NULL;
}
out:
free(req.private_data, M_DEVBUF);
return ret;
}
static int cma_connect_ib(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct ib_cm_req_param req;
struct rdma_route *route;
void *private_data;
int offset, ret;
memset(&req, 0, sizeof req);
offset = cma_user_data_offset(id_priv->id.ps);
req.private_data_len = offset + conn_param->private_data_len;
private_data = malloc(req.private_data_len, M_DEVBUF, M_NOWAIT);
if (!private_data)
return (ENOMEM);
bzero(private_data, req.private_data_len);
if (conn_param->private_data && conn_param->private_data_len)
memcpy(private_data + offset, conn_param->private_data,
conn_param->private_data_len);
id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler,
id_priv);
if (IS_ERR(id_priv->cm_id.ib)) {
ret = PTR_ERR(id_priv->cm_id.ib);
goto out;
}
route = &id_priv->id.route;
ret = cma_format_hdr(private_data, id_priv->id.ps, route);
if (ret)
goto out;
req.private_data = private_data;
req.primary_path = &route->path_rec[0];
if (route->num_paths == 2)
req.alternate_path = &route->path_rec[1];
req.service_id = cma_get_service_id(id_priv->id.ps,
&route->addr.dst_addr);
req.qp_num = id_priv->qp_num;
req.qp_type = IB_QPT_RC;
req.starting_psn = id_priv->seq_num;
req.responder_resources = conn_param->responder_resources;
req.initiator_depth = conn_param->initiator_depth;
req.flow_control = conn_param->flow_control;
req.retry_count = conn_param->retry_count;
req.rnr_retry_count = conn_param->rnr_retry_count;
req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
req.max_cm_retries = CMA_MAX_CM_RETRIES;
req.srq = id_priv->srq ? 1 : 0;
ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
out:
if (ret && !IS_ERR(id_priv->cm_id.ib)) {
ib_destroy_cm_id(id_priv->cm_id.ib);
id_priv->cm_id.ib = NULL;
}
free(private_data, M_DEVBUF);
return ret;
}
#endif
static int cma_connect_iw(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct iw_cm_id *cm_id;
struct sockaddr_in* sin;
int ret;
struct iw_cm_conn_param iw_param;
cm_id = iw_create_cm_id(id_priv->id.device, id_priv->so,
cma_iw_handler, id_priv);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
goto out;
}
id_priv->cm_id.iw = cm_id;
sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr;
cm_id->local_addr = *sin;
sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
cm_id->remote_addr = *sin;
ret = cma_modify_qp_rtr(&id_priv->id);
if (ret)
goto out;
iw_param.ord = conn_param->initiator_depth;
iw_param.ird = conn_param->responder_resources;
iw_param.private_data = conn_param->private_data;
iw_param.private_data_len = conn_param->private_data_len;
if (id_priv->id.qp)
iw_param.qpn = id_priv->qp_num;
else
iw_param.qpn = conn_param->qp_num;
ret = iw_cm_connect(cm_id, &iw_param);
out:
if (ret && !IS_ERR(cm_id)) {
iw_destroy_cm_id(cm_id);
id_priv->cm_id.iw = NULL;
}
return ret;
}
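/*
 * rdma_connect() dispatches on transport and service type: IB UD port
 * spaces use the SIDR request/reply exchange (cma_resolve_ib_udp()),
 * IB connected mode sends a full CM REQ (cma_connect_ib()), and iWARP
 * goes through iw_cm_connect() (cma_connect_iw()).
 */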
int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT))
return (EINVAL);
if (!id->qp) {
id_priv->qp_num = conn_param->qp_num;
id_priv->srq = conn_param->srq;
}
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
if (cma_is_ud_ps(id->ps))
ret = cma_resolve_ib_udp(id_priv, conn_param);
else
ret = cma_connect_ib(id_priv, conn_param);
break;
case RDMA_TRANSPORT_IWARP:
#endif
ret = cma_connect_iw(id_priv, conn_param);
#ifdef IB_SUPPORTED
break;
default:
ret = ENOSYS;
break;
}
#endif
if (ret)
goto err;
return 0;
err:
cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED);
return ret;
}
#ifdef IB_SUPPORTED
static int cma_accept_ib(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct ib_cm_rep_param rep;
struct ib_qp_attr qp_attr;
int qp_attr_mask, ret;
if (id_priv->id.qp) {
ret = cma_modify_qp_rtr(&id_priv->id);
if (ret)
goto out;
qp_attr.qp_state = IB_QPS_RTS;
ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr,
&qp_attr_mask);
if (ret)
goto out;
qp_attr.max_rd_atomic = conn_param->initiator_depth;
ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
if (ret)
goto out;
}
memset(&rep, 0, sizeof rep);
rep.qp_num = id_priv->qp_num;
rep.starting_psn = id_priv->seq_num;
rep.private_data = conn_param->private_data;
rep.private_data_len = conn_param->private_data_len;
rep.responder_resources = conn_param->responder_resources;
rep.initiator_depth = conn_param->initiator_depth;
rep.target_ack_delay = CMA_CM_RESPONSE_TIMEOUT;
rep.failover_accepted = 0;
rep.flow_control = conn_param->flow_control;
rep.rnr_retry_count = conn_param->rnr_retry_count;
rep.srq = id_priv->srq ? 1 : 0;
ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
out:
return ret;
}
#endif
static int cma_accept_iw(struct rdma_id_private *id_priv,
struct rdma_conn_param *conn_param)
{
struct iw_cm_conn_param iw_param;
int ret;
ret = cma_modify_qp_rtr(&id_priv->id);
if (ret)
return ret;
iw_param.ord = conn_param->initiator_depth;
iw_param.ird = conn_param->responder_resources;
iw_param.private_data = conn_param->private_data;
iw_param.private_data_len = conn_param->private_data_len;
if (id_priv->id.qp) {
iw_param.qpn = id_priv->qp_num;
} else
iw_param.qpn = conn_param->qp_num;
return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
}
#ifdef IB_SUPPORTED
static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
enum ib_cm_sidr_status status,
const void *private_data, int private_data_len)
{
struct ib_cm_sidr_rep_param rep;
memset(&rep, 0, sizeof rep);
rep.status = status;
if (status == IB_SIDR_SUCCESS) {
rep.qp_num = id_priv->qp_num;
rep.qkey = id_priv->qkey;
}
rep.private_data = private_data;
rep.private_data_len = private_data_len;
return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
}
#endif
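/*
 * rdma_accept() is normally called from the event handler while the id
 * sits in CMA_CONNECT.  On IB a NULL conn_param falls through to
 * cma_rep_recv(), which presumably completes a connection whose reply
 * has already arrived rather than building a new one; on any failure
 * the connection is rejected and the QP is forced into the error state.
 */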
int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp(id_priv, CMA_CONNECT))
return (EINVAL);
if (!id->qp && conn_param) {
id_priv->qp_num = conn_param->qp_num;
id_priv->srq = conn_param->srq;
}
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
if (cma_is_ud_ps(id->ps))
ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
conn_param->private_data,
conn_param->private_data_len);
else if (conn_param)
ret = cma_accept_ib(id_priv, conn_param);
else
ret = cma_rep_recv(id_priv);
break;
case RDMA_TRANSPORT_IWARP:
#endif
ret = cma_accept_iw(id_priv, conn_param);
#ifdef IB_SUPPORTED
break;
default:
ret = ENOSYS;
break;
}
#endif
if (ret)
goto reject;
return 0;
reject:
cma_modify_qp_err(id);
rdma_reject(id, NULL, 0);
return ret;
}
int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_has_cm_dev(id_priv))
return (EINVAL);
#ifdef IB_SUPPORTED
switch (id->device->node_type) {
case RDMA_NODE_IB_CA:
ret = ib_cm_notify(id_priv->cm_id.ib, event);
break;
default:
#endif
ret = 0;
#ifdef IB_SUPPORTED
break;
}
#endif
return ret;
}
int rdma_reject(struct rdma_cm_id *id, const void *private_data,
u8 private_data_len)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_has_cm_dev(id_priv))
return (EINVAL);
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
if (cma_is_ud_ps(id->ps))
ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT,
private_data, private_data_len);
else
ret = ib_send_cm_rej(id_priv->cm_id.ib,
IB_CM_REJ_CONSUMER_DEFINED, NULL,
0, private_data, private_data_len);
break;
case RDMA_TRANSPORT_IWARP:
#endif
ret = iw_cm_reject(id_priv->cm_id.iw,
private_data, private_data_len);
#ifdef IB_SUPPORTED
break;
default:
ret = ENOSYS;
break;
}
#endif
return ret;
}
int rdma_disconnect(struct rdma_cm_id *id)
{
struct rdma_id_private *id_priv;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_has_cm_dev(id_priv))
return (EINVAL);
#ifdef IB_SUPPORTED
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
ret = cma_modify_qp_err(id);
if (ret)
goto out;
/* Initiate or respond to a disconnect. */
if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
break;
case RDMA_TRANSPORT_IWARP:
#endif
ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
#ifdef IB_SUPPORTED
break;
default:
ret = EINVAL;
break;
}
out:
#endif
return ret;
}
#ifdef IB_SUPPORTED
static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
{
struct rdma_id_private *id_priv;
struct cma_multicast *mc = multicast->context;
struct rdma_cm_event event;
int ret;
id_priv = mc->id_priv;
if (cma_disable_remove(id_priv, CMA_ADDR_BOUND) &&
cma_disable_remove(id_priv, CMA_ADDR_RESOLVED))
return 0;
if (!status && id_priv->id.qp)
status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
multicast->rec.mlid);
memset(&event, 0, sizeof event);
event.status = status;
event.param.ud.private_data = mc->context;
if (!status) {
event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
ib_init_ah_from_mcmember(id_priv->id.device,
id_priv->id.port_num, &multicast->rec,
&event.param.ud.ah_attr);
event.param.ud.qp_num = 0xFFFFFF;
event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
} else
event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
ret = id_priv->id.event_handler(&id_priv->id, &event);
if (ret) {
cma_exch(id_priv, CMA_DESTROYING);
cma_enable_remove(id_priv);
rdma_destroy_id(&id_priv->id);
return 0;
}
cma_enable_remove(id_priv);
return 0;
}
static void cma_set_mgid(struct rdma_id_private *id_priv,
struct sockaddr *addr, union ib_gid *mgid)
{
unsigned char mc_map[MAX_ADDR_LEN];
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
struct sockaddr_in *sin = (struct sockaddr_in *) addr;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
if (cma_any_addr(addr)) {
memset(mgid, 0, sizeof *mgid);
} else if ((addr->sa_family == AF_INET6) &&
((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFF10A01B) ==
0xFF10A01B)) {
/* IPv6 address is an SA assigned MGID. */
memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
} else {
ip_ib_mc_map(sin->sin_addr.s_addr, mc_map);
if (id_priv->id.ps == RDMA_PS_UDP)
mc_map[7] = 0x01; /* Use RDMA CM signature */
mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8;
mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr);
*mgid = *(union ib_gid *) (mc_map + 4);
}
}
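/*
 * For IPv4 groups cma_set_mgid() above derives the MGID from the
 * standard IP-over-IB multicast mapping (ip_ib_mc_map()) and patches in
 * the partition key; RDMA_PS_UDP groups additionally get the RDMA CM
 * signature byte (0x01) so they cannot collide with plain IPoIB groups
 * on the same IP address.
 */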
static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
struct cma_multicast *mc)
{
struct ib_sa_mcmember_rec rec;
struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
ib_sa_comp_mask comp_mask;
int ret;
ib_addr_get_mgid(dev_addr, &rec.mgid);
ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
&rec.mgid, &rec);
if (ret)
return ret;
cma_set_mgid(id_priv, &mc->addr, &rec.mgid);
if (id_priv->id.ps == RDMA_PS_UDP)
rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
ib_addr_get_sgid(dev_addr, &rec.port_gid);
rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
rec.join_state = 1;
comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
IB_SA_MCMEMBER_REC_FLOW_LABEL |
IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
id_priv->id.port_num, &rec,
comp_mask, M_NOWAIT,
cma_ib_mc_handler, mc);
if (IS_ERR(mc->multicast.ib))
return PTR_ERR(mc->multicast.ib);
return 0;
}
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
void *context)
{
struct rdma_id_private *id_priv;
struct cma_multicast *mc;
int ret;
id_priv = container_of(id, struct rdma_id_private, id);
if (!cma_comp(id_priv, CMA_ADDR_BOUND) &&
!cma_comp(id_priv, CMA_ADDR_RESOLVED))
return (EINVAL);
mc = malloc(sizeof *mc, M_DEVBUF, M_NOWAIT);
if (!mc)
return (ENOMEM);
memcpy(&mc->addr, addr, ip_addr_size(addr));
mc->context = context;
mc->id_priv = id_priv;
mtx_lock(&id_priv->lock);
LIST_INSERT_HEAD(&id_priv->mc_list, mc, list);
mtx_unlock(&id_priv->lock);
switch (rdma_node_get_transport(id->device->node_type)) {
case RDMA_TRANSPORT_IB:
ret = cma_join_ib_multicast(id_priv, mc);
break;
default:
ret = ENOSYS;
break;
}
if (ret) {
mtx_lock(&id_priv->lock);
list_del(&mc->list);
mtx_unlock(&id_priv->lock);
free(mc, M_DEVBUF);
}
return ret;
}
void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
{
struct rdma_id_private *id_priv;
struct cma_multicast *mc;
id_priv = container_of(id, struct rdma_id_private, id);
mtx_lock(&id_priv->lock);
LIST_FOREACH(mc, &id_priv->mc_list, list) {
if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) {
list_del(&mc->list);
mtx_unlock(&id_priv->lock);
if (id->qp)
ib_detach_mcast(id->qp,
&mc->multicast.ib->rec.mgid,
mc->multicast.ib->rec.mlid);
ib_sa_free_multicast(mc->multicast.ib, M_DEVBUF);
free(mc, M_DEVBUF);
return;
}
}
mtx_unlock(&id_priv->lock);
}
#endif
static void cma_add_one(struct ib_device *device)
{
struct cma_device *cma_dev;
struct rdma_id_private *id_priv;
cma_dev = malloc(sizeof *cma_dev, M_DEVBUF, M_NOWAIT|M_ZERO);
if (!cma_dev)
return;
cma_dev->device = device;
cv_init(&cma_dev->comp, "cma_device");
mtx_init(&cma_dev->lock, "cma_device", NULL, MTX_DUPOK|MTX_DEF);
cma_dev->refcount = 1;
LIST_INIT(&cma_dev->id_list);
ib_set_client_data(device, &cma_client, cma_dev);
mtx_lock(&lock);
TAILQ_INSERT_TAIL(&dev_list, cma_dev, list);
LIST_FOREACH(id_priv, &listen_any_list, list)
cma_listen_on_dev(id_priv, cma_dev);
mtx_unlock(&lock);
}
static int cma_remove_id_dev(struct rdma_id_private *id_priv)
{
struct rdma_cm_event event;
enum cma_state state;
/* Record that we want to remove the device */
state = cma_exch(id_priv, CMA_DEVICE_REMOVAL);
if (state == CMA_DESTROYING)
return 0;
cma_cancel_operation(id_priv, state);
mtx_lock(&id_priv->lock);
PANIC_IF(id_priv->dev_remove < 0);
if (id_priv->dev_remove)
cv_wait(&id_priv->wait_remove, &id_priv->lock);
mtx_unlock(&id_priv->lock);
/* Check for destruction from another callback. */
if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL))
return 0;
memset(&event, 0, sizeof event);
event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
return id_priv->id.event_handler(&id_priv->id, &event);
}
static void cma_process_remove(struct cma_device *cma_dev)
{
struct rdma_id_private *id_priv;
int ret;
mtx_lock(&lock);
while (!LIST_EMPTY(&cma_dev->id_list)) {
id_priv = LIST_FIRST(&cma_dev->id_list);
if (cma_internal_listen(id_priv)) {
cma_destroy_listen(id_priv);
continue;
}
LIST_REMOVE(id_priv, list);
mtx_lock(&id_priv->lock);
id_priv->refcount++;
mtx_unlock(&id_priv->lock);
mtx_unlock(&lock);
ret = cma_remove_id_dev(id_priv);
cma_deref_id(id_priv);
if (ret)
rdma_destroy_id(&id_priv->id);
mtx_lock(&lock);
}
mtx_unlock(&lock);
cma_deref_dev(cma_dev);
mtx_lock(&cma_dev->lock);
PANIC_IF(cma_dev->refcount < 0);
if (cma_dev->refcount)
cv_wait(&cma_dev->comp, &cma_dev->lock);
mtx_unlock(&cma_dev->lock);
}
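/*
 * Hot-unplug path: cma_remove_one() detaches the cma_device, and
 * cma_process_remove() above walks its id list, destroying internal
 * listens outright and delivering RDMA_CM_EVENT_DEVICE_REMOVAL to
 * everything else, then sleeps on the device refcount until the last
 * id lets go.
 */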
static void cma_remove_one(struct ib_device *device)
{
struct cma_device *cma_dev;
cma_dev = ib_get_client_data(device, &cma_client);
if (!cma_dev)
return;
mtx_lock(&lock);
TAILQ_REMOVE(&dev_list, cma_dev, list);
mtx_unlock(&lock);
cma_process_remove(cma_dev);
free(cma_dev, M_DEVBUF);
}
static int cma_init(void)
{
+ INIT_VNET_INET(curvnet);
int ret;
LIST_INIT(&listen_any_list);
TAILQ_INIT(&dev_list);
mtx_init(&lock, "cma_device list", NULL, MTX_DEF);
arc4rand(&next_port, sizeof next_port, 0);
next_port = ((unsigned int) next_port %
(V_ipport_lastauto - V_ipport_firstauto)) +
V_ipport_firstauto;
cma_wq = taskqueue_create("rdma_cm", M_NOWAIT, taskqueue_thread_enqueue,
&cma_wq);
if (!cma_wq)
return (ENOMEM);
taskqueue_start_threads(&cma_wq, 1, PI_NET, "cma_wq thread");
#ifdef IB_SUPPORTED
ib_sa_register_client(&sa_client);
#endif
rdma_addr_register_client(&addr_client);
ret = ib_register_client(&cma_client);
if (ret)
goto err;
return 0;
err:
rdma_addr_unregister_client(&addr_client);
#ifdef IB_SUPPORTED
ib_sa_unregister_client(&sa_client);
#endif
taskqueue_free(cma_wq);
return ret;
}
static void cma_cleanup(void)
{
ib_unregister_client(&cma_client);
rdma_addr_unregister_client(&addr_client);
#ifdef IB_SUPPORTED
ib_sa_unregister_client(&sa_client);
#endif
taskqueue_free(cma_wq);
kvl_free(&sdp_ps);
kvl_free(&tcp_ps);
kvl_free(&udp_ps);
kvl_free(&ipoib_ps);
}
static int
cma_load(module_t mod, int cmd, void *arg)
{
int err = 0;
switch (cmd) {
case MOD_LOAD:
printf("Loading rdma_cma.\n");
cma_init();
break;
case MOD_QUIESCE:
break;
case MOD_UNLOAD:
printf("Unloading rdma_cma.\n");
cma_cleanup();
break;
case MOD_SHUTDOWN:
break;
default:
err = EOPNOTSUPP;
break;
}
return (err);
}
static moduledata_t mod_data = {
"rdma_cma",
cma_load,
0
};
MODULE_VERSION(rdma_cma, 1);
MODULE_DEPEND(rdma_cma, rdma_core, 1, 1, 1);
MODULE_DEPEND(rdma_cma, rdma_addr, 1, 1, 1);
MODULE_DEPEND(rdma_cma, rdma_iwcm, 1, 1, 1);
DECLARE_MODULE(rdma_cma, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
Index: head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c
===================================================================
--- head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c (revision 183549)
+++ head/sys/dev/cxgb/ulp/iw_cxgb/iw_cxgb.c (revision 183550)
@@ -1,290 +1,297 @@
/**************************************************************************
Copyright (c) 2007, Chelsio Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Chelsio Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/pciio.h>
#include <sys/conf.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus_dma.h>
#include <sys/rman.h>
#include <sys/ioccom.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/proc.h>
#include <sys/eventhandler.h>
#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_ifnet ifnet
#endif
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <contrib/rdma/ib_verbs.h>
#include <cxgb_include.h>
#include <ulp/iw_cxgb/iw_cxgb_wr.h>
#include <ulp/iw_cxgb/iw_cxgb_hal.h>
#include <ulp/iw_cxgb/iw_cxgb_provider.h>
#include <ulp/iw_cxgb/iw_cxgb_cm.h>
#include <ulp/iw_cxgb/iw_cxgb.h>
/*
* XXX :-/
*
*/
#define idr_init(x)
cxgb_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
static void open_rnic_dev(struct t3cdev *);
static void close_rnic_dev(struct t3cdev *);
static TAILQ_HEAD( ,iwch_dev) dev_list;
static struct mtx dev_mutex;
static eventhandler_tag event_tag;
static void
rnic_init(struct iwch_dev *rnicp)
{
CTR2(KTR_IW_CXGB, "%s iwch_dev %p", __FUNCTION__, rnicp);
idr_init(&rnicp->cqidr);
idr_init(&rnicp->qpidr);
idr_init(&rnicp->mmidr);
mtx_init(&rnicp->lock, "iwch rnic lock", NULL, MTX_DEF|MTX_DUPOK);
rnicp->attr.vendor_id = 0x168;
rnicp->attr.vendor_part_id = 7;
rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
rnicp->attr.max_wrs = (1UL << 24) - 1;
rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1;
rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */
rnicp->attr.can_resize_wq = 0;
rnicp->attr.max_rdma_reads_per_qp = 8;
rnicp->attr.max_rdma_read_resources =
rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps;
rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */
rnicp->attr.max_rdma_read_depth =
rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps;
rnicp->attr.rq_overflow_handled = 0;
rnicp->attr.can_modify_ird = 0;
rnicp->attr.can_modify_ord = 0;
rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1;
rnicp->attr.stag0_value = 1;
rnicp->attr.zbva_support = 1;
rnicp->attr.local_invalidate_fence = 1;
rnicp->attr.cq_overflow_detection = 1;
return;
}
static void
open_rnic_dev(struct t3cdev *tdev)
{
struct iwch_dev *rnicp;
static int vers_printed;
CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev);
if (!vers_printed++)
printf("Chelsio T3 RDMA Driver - version %s\n",
DRV_VERSION);
rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
if (!rnicp) {
printf("Cannot allocate ib device\n");
return;
}
rnicp->rdev.ulp = rnicp;
rnicp->rdev.t3cdev_p = tdev;
mtx_lock(&dev_mutex);
if (cxio_rdev_open(&rnicp->rdev)) {
mtx_unlock(&dev_mutex);
printf("Unable to open CXIO rdev\n");
ib_dealloc_device(&rnicp->ibdev);
return;
}
rnic_init(rnicp);
TAILQ_INSERT_TAIL(&dev_list, rnicp, entry);
mtx_unlock(&dev_mutex);
if (iwch_register_device(rnicp)) {
printf("Unable to register device\n");
close_rnic_dev(tdev);
}
#ifdef notyet
printf("Initialized device %s\n",
pci_name(rnicp->rdev.rnic_info.pdev));
#endif
return;
}
static void
close_rnic_dev(struct t3cdev *tdev)
{
struct iwch_dev *dev, *tmp;
CTR2(KTR_IW_CXGB, "%s t3cdev %p", __FUNCTION__, tdev);
mtx_lock(&dev_mutex);
TAILQ_FOREACH_SAFE(dev, &dev_list, entry, tmp) {
if (dev->rdev.t3cdev_p == tdev) {
#ifdef notyet
list_del(&dev->entry);
iwch_unregister_device(dev);
cxio_rdev_close(&dev->rdev);
idr_destroy(&dev->cqidr);
idr_destroy(&dev->qpidr);
idr_destroy(&dev->mmidr);
ib_dealloc_device(&dev->ibdev);
#endif
break;
}
}
mtx_unlock(&dev_mutex);
}
static ifaddr_event_handler_t
ifaddr_event_handler(void *arg, struct ifnet *ifp)
{
printf("%s if name %s \n", __FUNCTION__, ifp->if_xname);
if (ifp->if_capabilities & IFCAP_TOE4) {
KASSERT(T3CDEV(ifp) != NULL, ("null t3cdev ptr!"));
if (cxio_hal_find_rdev_by_t3cdev(T3CDEV(ifp)) == NULL)
open_rnic_dev(T3CDEV(ifp));
}
return 0;
}
static int
iwch_init_module(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
int err;
struct ifnet *ifp;
printf("%s enter\n", __FUNCTION__);
TAILQ_INIT(&dev_list);
mtx_init(&dev_mutex, "iwch dev_list lock", NULL, MTX_DEF);
err = cxio_hal_init();
if (err)
return err;
err = iwch_cm_init();
if (err)
return err;
cxio_register_ev_cb(iwch_ev_dispatch);
/* Register for ifaddr events to dynamically add TOE devs */
event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_event_handler,
NULL, EVENTHANDLER_PRI_ANY);
/* Register existing TOE interfaces by walking the ifnet chain */
IFNET_RLOCK();
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
- (void)ifaddr_event_handler(NULL, ifp);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter); /* XXX CURVNET_SET_QUIET() ? */
+ INIT_VNET_NET(vnet_iter);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+ (void)ifaddr_event_handler(NULL, ifp);
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IFNET_RUNLOCK();
return 0;
}
static void
iwch_exit_module(void)
{
EVENTHANDLER_DEREGISTER(ifaddr_event, event_tag);
cxio_unregister_ev_cb(iwch_ev_dispatch);
iwch_cm_term();
cxio_hal_exit();
}
static int
iwch_load(module_t mod, int cmd, void *arg)
{
int err = 0;
switch (cmd) {
case MOD_LOAD:
printf("Loading iw_cxgb.\n");
iwch_init_module();
break;
case MOD_QUIESCE:
break;
case MOD_UNLOAD:
printf("Unloading iw_cxgb.\n");
iwch_exit_module();
break;
case MOD_SHUTDOWN:
break;
default:
err = EOPNOTSUPP;
break;
}
return (err);
}
static moduledata_t mod_data = {
"iw_cxgb",
iwch_load,
0
};
MODULE_VERSION(iw_cxgb, 1);
DECLARE_MODULE(iw_cxgb, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
MODULE_DEPEND(iw_cxgb, rdma_core, 1, 1, 1);
MODULE_DEPEND(iw_cxgb, if_cxgb, 1, 1, 1);
MODULE_DEPEND(iw_cxgb, t3_tom, 1, 1, 1);
Index: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
===================================================================
--- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 183549)
+++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c (revision 183550)
@@ -1,4471 +1,4475 @@
/**************************************************************************
Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Neither the name of the Chelsio Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>
#if __FreeBSD_version >= 800044
#include <sys/vimage.h>
#else
#define V_tcp_do_autosndbuf tcp_do_autosndbuf
#define V_tcp_autosndbuf_max tcp_autosndbuf_max
#define V_tcp_do_rfc1323 tcp_do_rfc1323
#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
#define V_tcpstat tcpstat
#endif
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <cxgb_osdep.h>
#include <sys/mbufq.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_timer.h>
#include <net/route.h>
#include <t3cdev.h>
#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <sys/mvec.h>
#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_l2t.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_tcp_offload.h>
/*
* For ULP connections HW may add headers, e.g., for digests, that aren't part
* of the messages sent by the host but that are part of the TCP payload and
* therefore consume TCP sequence space. Tx connection parameters that
* operate in TCP sequence space are affected by the HW additions and need to
* compensate for them to accurately track TCP sequence numbers. This array
* contains the compensating extra lengths for ULP packets. It is indexed by
* a packet's ULP submode.
*/
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
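/*
 * Example, assuming the usual header/data-digest encoding of the ULP
 * submode bits: a packet with one 4-byte digest enabled consumes
 * t3_ulp_extra_len[1] == 4 extra bytes of TCP sequence space, and one
 * with both digests consumes t3_ulp_extra_len[3] == 8.
 */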
#ifdef notyet
/*
* This sk_buff holds a fake header-only TCP segment that we use whenever we
* need to exploit SW TCP functionality that expects TCP headers, such as
* tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
* CPUs without locking.
*/
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif
/*
* Size of WRs in bytes. Note that we assume all devices we are handling have
* the same WR size.
*/
static unsigned int wrlen __read_mostly;
/*
* The number of WRs needed for an mbuf chain depends on the number of
* fragments in the chain and whether it has any payload in its main body.
* This maps the length of the gather list represented by an mbuf chain
* into the # of necessary WRs.
*/
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
/*
* Max receive window supported by HW in bytes. Only a small part of it can
* be set through option0, the rest needs to be set through RX_DATA_ACK.
*/
#define MAX_RCV_WND ((1U << 27) - 1)
/*
* Min receive window. We want it to be large enough to accommodate receive
* coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
*/
#define MIN_RCV_WND (24 * 1024U)
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0
#define TCP_TIMEWAIT 1
#define TCP_CLOSE 2
#define TCP_DROP 3
extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;
static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);
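/*
 * SBAPPEND() is sbappendstream_locked() bracketed by debugging walks of
 * the mbuf chains: the KASSERTs verify that every mbuf is either plain
 * or an EXT_EXTREF external buffer and that no m_next pointer has been
 * poisoned, both before and after the append.
 */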
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
struct mbuf *m;
m = sb->sb_mb;
while (m) {
KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
!(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
m->m_next, m->m_nextpkt, m->m_flags));
m = m->m_next;
}
m = n;
while (m) {
KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
!(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
m->m_next, m->m_nextpkt, m->m_flags));
m = m->m_next;
}
KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
sbappendstream_locked(sb, n);
m = sb->sb_mb;
while (m) {
KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
m->m_next, m->m_nextpkt, m->m_flags));
m = m->m_next;
}
}
static inline int
is_t3a(const struct toedev *dev)
{
return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}
static void
dump_toepcb(struct toepcb *toep)
{
DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
toep->tp_mtu_idx, toep->tp_tid);
DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
toep->tp_mss_clamp, toep->tp_flags);
}
#ifndef RTALLOC2_DEFINED
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
struct rtentry *rt = NULL;
if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
RT_UNLOCK(rt);
return (rt);
}
#endif
/*
* Determine whether to send a CPL message now or defer it. A message is
* deferred if the connection is in SYN_SENT since we don't know the TID yet.
* For connections in other states the message is sent immediately.
* If through_l2t is set the message is subject to ARP processing, otherwise
* it is sent directly.
*/
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
struct tcpcb *tp = toep->tp_tp;
if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
inp_wlock(tp->t_inpcb);
mbufq_tail(&toep->out_of_order_queue, m); // defer
inp_wunlock(tp->t_inpcb);
} else if (through_l2t)
l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
else
cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
}
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
return (cntrl);
}
/*
* Populate a TID_RELEASE WR. The mbuf must already be properly sized.
*/
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
struct cpl_tid_release *req;
m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
m->m_pkthdr.len = m->m_len = sizeof(*req);
req = mtod(m, struct cpl_tid_release *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
+ INIT_VNET_INET(so->so_vnet);
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
struct tx_data_wr *req;
struct sockbuf *snd;
inp_lock_assert(tp->t_inpcb);
snd = so_sockbuf_snd(so);
req = mtod(m, struct tx_data_wr *);
m->m_len = sizeof(*req);
req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
/* len includes the length of any HW ULP additions */
req->len = htonl(len);
req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
/* V_TX_ULP_SUBMODE sets both the mode and submode */
req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
V_TX_URG(/* skb_urgent(skb) */ 0 ) |
V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
(tail ? 0 : 1))));
req->sndseq = htonl(tp->snd_nxt);
if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
V_TX_CPU_IDX(toep->tp_qset));
/* Sendbuffer is in units of 32KB. */
if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
else {
req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
}
toep->tp_flags |= TP_DATASENT;
}
}
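/*
 * The >> 15 above converts bytes into the 32KB units the TX_SNDBUF
 * field expects (32768 == 1 << 15).
 */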
#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
int
t3_push_frames(struct socket *so, int req_completion)
{
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
struct mbuf *tail, *m0, *last;
struct t3cdev *cdev;
struct tom_data *d;
int state, bytes, count, total_bytes;
bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
struct sockbuf *snd;
if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
DPRINTF("tcp state=%d\n", tp->t_state);
return (0);
}
state = so_state_get(so);
if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
DPRINTF("disconnecting\n");
return (0);
}
inp_lock_assert(tp->t_inpcb);
snd = so_sockbuf_snd(so);
sockbuf_lock(snd);
d = TOM_DATA(toep->tp_toedev);
cdev = d->cdev;
last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
total_bytes = 0;
DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
KASSERT(tail, ("sbdrop error"));
last = tail = tail->m_next;
}
if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
sockbuf_unlock(snd);
return (0);
}
toep->tp_m_last = NULL;
while (toep->tp_wr_avail && (tail != NULL)) {
count = bytes = 0;
segp = segs;
if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
sockbuf_unlock(snd);
return (0);
}
/*
* If the data in tail fits inline, then
* build an immediate-data WR.
*/
if (tail->m_len <= IMM_LEN) {
count = 1;
bytes = tail->m_len;
last = tail;
tail = tail->m_next;
m_set_sgl(m0, NULL);
m_set_sgllen(m0, 0);
make_tx_data_wr(so, m0, bytes, tail);
m_append(m0, bytes, mtod(last, caddr_t));
KASSERT(!m0->m_next, ("bad append"));
} else {
while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
&& (tail != NULL) && (count < TX_MAX_SEGS-1)) {
bytes += tail->m_len;
last = tail;
count++;
/*
* technically an abuse to be using this for a VA
* but less gross than defining my own structure
* or calling pmap_kextract from here :-|
*/
segp->ds_addr = (bus_addr_t)tail->m_data;
segp->ds_len = tail->m_len;
DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
count, mbuf_wrs[count], tail->m_data, tail->m_len);
segp++;
tail = tail->m_next;
}
DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
toep->tp_wr_avail, count, mbuf_wrs[count], tail);
m_set_sgl(m0, segs);
m_set_sgllen(m0, count);
make_tx_data_wr(so, m0, bytes, tail);
}
m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
if (tail) {
snd->sb_sndptr = tail;
toep->tp_m_last = NULL;
} else
toep->tp_m_last = snd->sb_sndptr = last;
DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
snd->sb_sndptroff += bytes;
total_bytes += bytes;
toep->tp_write_seq += bytes;
CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
" tail=%p sndptr=%p sndptroff=%d",
toep->tp_wr_avail, count, mbuf_wrs[count],
tail, snd->sb_sndptr, snd->sb_sndptroff);
if (tail)
CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
" tp_m_last=%p tailbuf=%p snd_una=0x%08x",
total_bytes, toep->tp_m_last, tail->m_data,
tp->snd_una);
else
CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
" tp_m_last=%p snd_una=0x%08x",
total_bytes, toep->tp_m_last, tp->snd_una);
#ifdef KTR
{
int i;
i = 0;
while (i < count && m_get_sgllen(m0)) {
if ((count - i) >= 3) {
CTR6(KTR_TOM,
"t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
" len=%d pa=0x%zx len=%d",
segs[i].ds_addr, segs[i].ds_len,
segs[i + 1].ds_addr, segs[i + 1].ds_len,
segs[i + 2].ds_addr, segs[i + 2].ds_len);
i += 3;
} else if ((count - i) == 2) {
CTR4(KTR_TOM,
"t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
" len=%d",
segs[i].ds_addr, segs[i].ds_len,
segs[i + 1].ds_addr, segs[i + 1].ds_len);
i += 2;
} else {
CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
segs[i].ds_addr, segs[i].ds_len);
i++;
}
}
}
#endif
/*
* remember credits used
*/
m0->m_pkthdr.csum_data = mbuf_wrs[count];
m0->m_pkthdr.len = bytes;
toep->tp_wr_avail -= mbuf_wrs[count];
toep->tp_wr_unacked += mbuf_wrs[count];
if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
struct work_request_hdr *wr = cplhdr(m0);
wr->wr_hi |= htonl(F_WR_COMPL);
toep->tp_wr_unacked = 0;
}
KASSERT((m0->m_pkthdr.csum_data > 0) &&
(m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
m0->m_pkthdr.csum_data));
m0->m_type = MT_DONTFREE;
enqueue_wr(toep, m0);
DPRINTF("sending offload tx with %d bytes in %d segments\n",
bytes, count);
l2t_send(cdev, m0, toep->tp_l2t);
}
sockbuf_unlock(snd);
return (total_bytes);
}
/*
* Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
* under any circumstances. We take the easy way out and always queue the
* message to the write_queue. We could optimize the case where the queue
* is already empty, though the optimization is probably not worth it.
*/
static void
close_conn(struct socket *so)
{
struct mbuf *m;
struct cpl_close_con_req *req;
struct tom_data *d;
struct inpcb *inp = so_sotoinpcb(so);
struct tcpcb *tp;
struct toepcb *toep;
unsigned int tid;
inp_wlock(inp);
tp = so_sototcpcb(so);
toep = tp->t_toe;
if (tp->t_state != TCPS_SYN_SENT)
t3_push_frames(so, 1);
if (toep->tp_flags & TP_FIN_SENT) {
inp_wunlock(inp);
return;
}
tid = toep->tp_tid;
d = TOM_DATA(toep->tp_toedev);
m = m_gethdr_nofail(sizeof(*req));
m_set_priority(m, CPL_PRIORITY_DATA);
m_set_sgl(m, NULL);
m_set_sgllen(m, 0);
toep->tp_flags |= TP_FIN_SENT;
req = mtod(m, struct cpl_close_con_req *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
req->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
req->rsvd = 0;
inp_wunlock(inp);
/*
* XXX - need to defer shutdown while there is still data in the queue
*
*/
CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
cxgb_ofld_send(d->cdev, m);
}
/*
* Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
* and send it along.
*/
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
struct cpl_abort_req *req = cplhdr(m);
req->cmd = CPL_ABORT_NO_RST;
cxgb_ofld_send(cdev, m);
}
/*
* Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we
* are permitted to return without sending the message in case we cannot
* allocate an mbuf. Returns the number of credits sent.
*/
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
struct mbuf *m;
struct cpl_rx_data_ack *req;
struct toepcb *toep = tp->t_toe;
struct toedev *tdev = toep->tp_toedev;
m = m_gethdr_nofail(sizeof(*req));
DPRINTF("returning %u credits to HW\n", credits);
req = mtod(m, struct cpl_rx_data_ack *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
return (credits);
}
/*
* Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
* This is only used in DDP mode, so we take the opportunity to also set the
* DACK mode and flush any Rx credits.
*/
void
t3_send_rx_modulate(struct toepcb *toep)
{
struct mbuf *m;
struct cpl_rx_data_ack *req;
m = m_gethdr_nofail(sizeof(*req));
req = mtod(m, struct cpl_rx_data_ack *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
m->m_pkthdr.len = m->m_len = sizeof(*req);
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
V_RX_DACK_MODE(1) |
V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
toep->tp_rcv_wup = toep->tp_copied_seq;
}
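/*
 * Both builders above follow the driver's common recipe for CPLs sent
 * as immediate data: point at the start of the mbuf, fill in the
 * generic work request header, stamp in the opcode/TID, then hand the
 * mbuf to the offload send path. A sketch (cpl_foo, CPL_FOO and prio
 * are placeholders, not real definitions):
 */
#if 0
req = mtod(m, struct cpl_foo *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_FOO, toep->tp_tid));
/* ... opcode-specific fields ... */
m_set_priority(m, mkprio(prio, toep));
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
#endif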
/*
* Handle receipt of an urgent pointer.
*/
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
struct tcpcb *tp = so_sototcpcb(so);
urg_seq--; /* initially points past the urgent data, per BSD */
if (tp->urg_data && !after(urg_seq, tp->urg_seq))
return; /* duplicate pointer */
sk_send_sigurg(sk);
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
tom_eat_skb(sk, skb, 0);
}
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = urg_seq;
#endif
}
/*
* Returns true if a socket cannot accept new Rx data.
*/
static inline int
so_no_receive(const struct socket *so)
{
return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}
/*
* Process an urgent data notification.
*/
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
struct cpl_rx_urg_notify *hdr = cplhdr(m);
struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
VALIDATE_SOCK(so);
if (!so_no_receive(so))
handle_urg_ptr(so, ntohl(hdr->seq));
m_freem(m);
}
/*
* Handler for RX_URG_NOTIFY CPL messages.
*/
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
rx_urg_notify(toep, m);
return (0);
}
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
return (toep->tp_ulp_mode ||
(toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}
/*
* Set of states for which we should return RX credits.
*/
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
/*
* Called after some received data has been read. It returns RX credits
* to the HW for the amount of data processed.
*/
void
t3_cleanup_rbuf(struct tcpcb *tp, int copied)
{
struct toepcb *toep = tp->t_toe;
struct socket *so;
struct toedev *dev;
int dack_mode, must_send, read;
u32 thres, credits, dack = 0;
struct sockbuf *rcv;
so = inp_inpcbtosocket(tp->t_inpcb);
rcv = so_sockbuf_rcv(so);
if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
(tp->t_state == TCPS_FIN_WAIT_2))) {
if (copied) {
sockbuf_lock(rcv);
toep->tp_copied_seq += copied;
sockbuf_unlock(rcv);
}
return;
}
inp_lock_assert(tp->t_inpcb);
sockbuf_lock(rcv);
if (copied)
toep->tp_copied_seq += copied;
else {
read = toep->tp_enqueued_bytes - rcv->sb_cc;
toep->tp_copied_seq += read;
}
credits = toep->tp_copied_seq - toep->tp_rcv_wup;
toep->tp_enqueued_bytes = rcv->sb_cc;
sockbuf_unlock(rcv);
if (credits > rcv->sb_mbmax) {
log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
toep->tp_copied_seq, toep->tp_rcv_wup, credits);
credits = rcv->sb_mbmax;
}
/*
* XXX this won't accurately reflect credit return - we need
* to look at the difference between the amount that has been
* put in the recv sockbuf and what is there now
*/
if (__predict_false(!credits))
return;
dev = toep->tp_toedev;
thres = TOM_TUNABLE(dev, rx_credit_thres);
if (__predict_false(thres == 0))
return;
if (is_delack_mode_valid(dev, toep)) {
dack_mode = TOM_TUNABLE(dev, delack);
if (__predict_false(dack_mode != toep->tp_delack_mode)) {
u32 r = tp->rcv_nxt - toep->tp_delack_seq;
if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
dack = F_RX_DACK_CHANGE |
V_RX_DACK_MODE(dack_mode);
}
} else
dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
/*
* For coalescing to work effectively ensure the receive window has
* at least 16KB left.
*/
must_send = credits + 16384 >= tp->rcv_wnd;
if (must_send || credits >= thres)
toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}
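/*
 * Worked example of the return policy above (illustrative numbers):
 * with a 64KB receive window, reading 50KB leaves credits = 51200 and
 * 51200 + 16384 >= 65536, so must_send forces an immediate RX_DATA_ACK
 * regardless of rx_credit_thres; reading only 4KB instead accumulates
 * credits until they cross the tunable threshold.
 */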
static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
struct socket *so;
DPRINTF("cxgb_toe_disconnect\n");
so = inp_inpcbtosocket(tp->t_inpcb);
close_conn(so);
return (0);
}
static int
cxgb_toe_reset(struct tcpcb *tp)
{
struct toepcb *toep = tp->t_toe;
t3_send_reset(toep);
/*
* unhook from socket
*/
tp->t_flags &= ~TF_TOE;
toep->tp_tp = NULL;
tp->t_toe = NULL;
return (0);
}
static int
cxgb_toe_send(struct tcpcb *tp)
{
struct socket *so;
DPRINTF("cxgb_toe_send\n");
dump_toepcb(tp->t_toe);
so = inp_inpcbtosocket(tp->t_inpcb);
t3_push_frames(so, 1);
return (0);
}
static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
inp_lock_assert(tp->t_inpcb);
t3_cleanup_rbuf(tp, 0);
return (0);
}
static void
cxgb_toe_detach(struct tcpcb *tp)
{
struct toepcb *toep;
/*
* XXX how do we handle teardown in the SYN_SENT state?
*
*/
inp_lock_assert(tp->t_inpcb);
toep = tp->t_toe;
toep->tp_tp = NULL;
/*
* unhook from socket
*/
tp->t_flags &= ~TF_TOE;
tp->t_toe = NULL;
}
static struct toe_usrreqs cxgb_toe_usrreqs = {
.tu_disconnect = cxgb_toe_disconnect,
.tu_reset = cxgb_toe_reset,
.tu_send = cxgb_toe_send,
.tu_rcvd = cxgb_toe_rcvd,
.tu_detach = cxgb_toe_detach,
.tu_syncache_event = handle_syncache_event,
};
static void
__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
uint64_t mask, uint64_t val, int no_reply)
{
struct cpl_set_tcb_field *req;
CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
toep->tp_tid, word, mask, val);
req = mtod(m, struct cpl_set_tcb_field *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
req->reply = V_NO_REPLY(no_reply);
req->cpu_idx = 0;
req->word = htons(word);
req->mask = htobe64(mask);
req->val = htobe64(val);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
send_or_defer(toep, m, 0);
}
static void
t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
{
struct mbuf *m;
struct tcpcb *tp;
if (toep == NULL)
return;
tp = toep->tp_tp;
if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
printf("not setting field\n");
return;
}
m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
__set_tcb_field(toep, m, word, mask, val, 1);
}
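/*
 * A SET_TCB_FIELD behaves like a masked store on the selected 64-bit
 * TCB word; conceptually (illustration only):
 *
 * new_word = (old_word & ~mask) | (val & mask);
 *
 * which is why set_tcb_tflag() below can update a single flag bit
 * without disturbing its neighbours.
 */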
/*
* Set one of the t_flags bits in the TCB.
*/
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{
t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}
/*
* Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
*/
static void
t3_set_nagle(struct toepcb *toep)
{
struct tcpcb *tp = toep->tp_tp;
set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}
/*
* Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
*/
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{
set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}
void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}
void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{
set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}
/*
* Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
*/
static void
t3_set_tos(struct toepcb *toep)
{
int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
V_TCB_TOS(tos));
}
/*
* In DDP mode, TP fails to schedule a timer to push RX data to the host when
* DDP is disabled (data is delivered to freelist). [Note that the peer should
* set the PSH bit in the last segment, which would trigger delivery.]
* We work around the issue by setting a DDP buffer in a partially placed state,
* which guarantees that TP will schedule a timer.
*/
#define TP_DDP_TIMER_WORKAROUND_MASK\
(V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
(V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
32))
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
if (on) {
t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
V_TF_DDP_OFF(0));
} else
t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_MASK,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_VAL);
}
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
tag_color);
}
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
unsigned int len)
{
if (buf_idx == 0)
t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
else
t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
int cong_algo;
for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
if (!strcmp(name, t3_cong_ops[cong_algo].name))
break;
if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
return (EINVAL);
#endif
return (0);
}
int
t3_get_tcb(struct toepcb *toep)
{
struct cpl_get_tcb *req;
struct tcpcb *tp = toep->tp_tp;
struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
if (!m)
return (ENOMEM);
inp_lock_assert(tp->t_inpcb);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
req = mtod(m, struct cpl_get_tcb *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
req->cpuno = htons(toep->tp_qset);
req->rsvd = 0;
if (tp->t_state == TCPS_SYN_SENT)
mbufq_tail(&toep->out_of_order_queue, m); // defer
else
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
return (0);
}
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{
toepcb_hold(toep);
cxgb_insert_tid(d->cdev, d->client, toep, tid);
}
/**
* find_best_mtu - find the entry in the MTU table closest to an MTU
* @d: TOM state
* @mtu: the target MTU
*
* Returns the index of the value in the MTU table that is closest to but
* does not exceed the target MTU.
*/
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
int i = 0;
while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
++i;
return (i);
}
static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
unsigned int idx;
#ifdef notyet
struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
#endif
if (tp) {
tp->t_maxseg = pmtu - 40;
if (tp->t_maxseg < td->mtus[0] - 40)
tp->t_maxseg = td->mtus[0] - 40;
idx = find_best_mtu(td, tp->t_maxseg + 40);
tp->t_maxseg = td->mtus[idx] - 40;
} else
idx = find_best_mtu(td, pmtu);
return (idx);
}
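/*
 * Example (hypothetical two-entry MTU table {1500, 9000}): a path MTU
 * of 1500 gives t_maxseg 1460 after the 40 bytes of TCP/IP header
 * overhead, and find_best_mtu() returns index 0; a 4000-byte path MTU
 * also maps to index 0, so t_maxseg is clamped back down to 1460.
 */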
static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
struct toepcb *toep = cxgb_free_atid(cdev, tid);
if (toep)
toepcb_release(toep);
}
/*
* Release resources held by an offload connection (TID, L2T entry, etc.)
*/
static void
t3_release_offload_resources(struct toepcb *toep)
{
struct tcpcb *tp = toep->tp_tp;
struct toedev *tdev = toep->tp_toedev;
struct t3cdev *cdev;
struct socket *so;
unsigned int tid = toep->tp_tid;
struct sockbuf *rcv;
CTR0(KTR_TOM, "t3_release_offload_resources");
if (!tdev)
return;
cdev = TOEP_T3C_DEV(toep);
if (!cdev)
return;
toep->tp_qset = 0;
t3_release_ddp_resources(toep);
#ifdef CTRL_SKB_CACHE
kfree_skb(CTRL_SKB_CACHE(tp));
CTRL_SKB_CACHE(tp) = NULL;
#endif
if (toep->tp_wr_avail != toep->tp_wr_max) {
purge_wr_queue(toep);
reset_wr_list(toep);
}
if (toep->tp_l2t) {
l2t_release(L2DATA(cdev), toep->tp_l2t);
toep->tp_l2t = NULL;
}
toep->tp_tp = NULL;
if (tp) {
inp_lock_assert(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
rcv = so_sockbuf_rcv(so);
/*
* cancel any offloaded reads
*
*/
sockbuf_lock(rcv);
tp->t_toe = NULL;
tp->t_flags &= ~TF_TOE;
if (toep->tp_ddp_state.user_ddp_pending) {
t3_cancel_ubuf(toep, rcv);
toep->tp_ddp_state.user_ddp_pending = 0;
}
so_sorwakeup_locked(so);
}
if (toep->tp_state == TCPS_SYN_SENT) {
free_atid(cdev, tid);
#ifdef notyet
__skb_queue_purge(&tp->out_of_order_queue);
#endif
} else { // we have TID
cxgb_remove_tid(cdev, toep, tid);
toepcb_release(toep);
}
#if 0
log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}
static void
install_offload_ops(struct socket *so)
{
struct tcpcb *tp = so_sototcpcb(so);
KASSERT(tp->t_toe != NULL, ("toepcb not set"));
t3_install_socket_ops(so);
tp->t_flags |= TF_TOE;
tp->t_tu = &cxgb_toe_usrreqs;
}
/*
* Determine the receive window scaling factor given a target max
* receive window.
*/
static __inline int
select_rcv_wscale(int space)
{
+ INIT_VNET_INET(so->so_vnet);
int wscale = 0;
if (space > MAX_RCV_WND)
space = MAX_RCV_WND;
if (V_tcp_do_rfc1323)
for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
return (wscale);
}
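/*
 * Example: a 256KB target window needs three right shifts before it
 * fits in the 16-bit window field (262144 -> 131072 -> 65536 -> 32768),
 * so select_rcv_wscale(262144) returns 3 when RFC 1323 window scaling
 * is enabled.
 */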
/*
* Determine the receive window size for a socket.
*/
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct tom_data *d = TOM_DATA(dev);
unsigned int wnd;
unsigned int max_rcv_wnd;
struct sockbuf *rcv;
rcv = so_sockbuf_rcv(so);
if (V_tcp_do_autorcvbuf)
wnd = V_tcp_autorcvbuf_max;
else
wnd = rcv->sb_hiwat;
/* XXX
* For receive coalescing to work effectively we need a receive window
* that can accommodate a coalesced segment.
*/
if (wnd < MIN_RCV_WND)
wnd = MIN_RCV_WND;
/* PR 5138 */
max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
(uint32_t)d->rx_page_size * 23 :
MAX_RCV_WND);
return min(wnd, max_rcv_wnd);
}
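/*
 * Example (illustrative values): with autosizing off, an 80KB sb_hiwat
 * on a pre-T3C part with a 4KB rx_page_size is within the PR 5138
 * clamp of 4096 * 23 = 92KB and is used as-is; a larger buffer would
 * be capped at 92KB, while T3C and later parts go up to MAX_RCV_WND.
 */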
/*
* Assign offload parameters to some socket fields. This code is used by
* both active and passive opens.
*/
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
struct tcpcb *tp = so_sototcpcb(so);
struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
struct sockbuf *snd, *rcv;
#ifdef notyet
SOCK_LOCK_ASSERT(so);
#endif
snd = so_sockbuf_snd(so);
rcv = so_sockbuf_rcv(so);
log(LOG_INFO, "initializing offload socket\n");
/*
* We either need to fix push frames to work with sbcompress
* or we need to add this
*/
snd->sb_flags |= SB_NOCOALESCE;
rcv->sb_flags |= SB_NOCOALESCE;
tp->t_toe = toep;
toep->tp_tp = tp;
toep->tp_toedev = dev;
toep->tp_tid = tid;
toep->tp_l2t = e;
toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
toep->tp_wr_unacked = 0;
toep->tp_delack_mode = 0;
toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
/*
* XXX broken
*
*/
tp->rcv_wnd = select_rcv_wnd(dev, so);
toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
toep->tp_qset_idx = 0;
reset_wr_list(toep);
DPRINTF("initialization done\n");
}
/*
* The next two functions calculate the option 0 value for a socket.
*/
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
struct tcpcb *tp = so_sototcpcb(so);
int wscale = select_rcv_wscale(tp->rcv_wnd);
return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}
static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
struct tcpcb *tp = so_sototcpcb(so);
unsigned int val;
val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
return (val);
}
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
int flv_valid;
flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
return (V_FLAVORS_VALID(flv_valid) |
V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}
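/*
 * Example of option-0 composition (illustration only): a keepalive
 * socket with Nagle enabled, a 256KB receive window and MTU index 0
 * yields opt0h = V_NAGLE(1) | V_KEEP_ALIVE(1) | F_TCAM_BYPASS |
 * V_WND_SCALE(3) | V_MSS_IDX(0), the window scale matching the
 * select_rcv_wscale() example above.
 */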
#if DEBUG_WR > 1
static int
count_pending_wrs(const struct toepcb *toep)
{
const struct mbuf *m;
int n = 0;
wr_queue_walk(toep, m)
n += m->m_pkthdr.csum_data;
return (n);
}
#endif
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
unsigned int atid, const struct l2t_entry *e)
{
struct cpl_act_open_req *req;
struct inpcb *inp = so_sotoinpcb(so);
struct tcpcb *tp = inp_inpcbtotcpcb(inp);
struct toepcb *toep = tp->t_toe;
struct toedev *tdev = toep->tp_toedev;
m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
req = mtod(m, struct cpl_act_open_req *);
m->m_pkthdr.len = m->m_len = sizeof(*req);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
req->wr.wr_lo = 0;
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
req->local_port = inp->inp_lport;
req->peer_port = inp->inp_fport;
memcpy(&req->local_ip, &inp->inp_laddr, 4);
memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
V_TX_CHANNEL(e->smt_idx));
req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
req->params = 0;
req->opt2 = htonl(calc_opt2(so, tdev));
}
/*
* Convert an ACT_OPEN_RPL status to an errno.
*/
static int
act_open_rpl_status_to_errno(int status)
{
switch (status) {
case CPL_ERR_CONN_RESET:
return (ECONNREFUSED);
case CPL_ERR_ARP_MISS:
return (EHOSTUNREACH);
case CPL_ERR_CONN_TIMEDOUT:
return (ETIMEDOUT);
case CPL_ERR_TCAM_FULL:
return (ENOMEM);
case CPL_ERR_CONN_EXIST:
log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
return (EADDRINUSE);
default:
return (EIO);
}
}
static void
fail_act_open(struct toepcb *toep, int errno)
{
struct tcpcb *tp = toep->tp_tp;
t3_release_offload_resources(toep);
if (tp) {
inp_wunlock(tp->t_inpcb);
tcp_offload_drop(tp, errno);
}
#ifdef notyet
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}
/*
* Handle active open failures.
*/
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
struct cpl_act_open_rpl *rpl = cplhdr(m);
struct inpcb *inp;
if (toep->tp_tp == NULL)
goto done;
inp = toep->tp_tp->t_inpcb;
/*
* Don't handle connection retry for now
*/
#ifdef notyet
struct inet_connection_sock *icsk = inet_csk(sk);
if (rpl->status == CPL_ERR_CONN_EXIST &&
icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
icsk->icsk_retransmit_timer.function = act_open_retry_timer;
sk_reset_timer(so, &icsk->icsk_retransmit_timer,
jiffies + HZ / 2);
} else
#endif
{
inp_wlock(inp);
/*
* drops the inpcb lock
*/
fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
}
done:
m_free(m);
}
/*
* Return whether a failed active open has allocated a TID
*/
static inline int
act_open_has_tid(int status)
{
return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
status != CPL_ERR_ARP_MISS;
}
/*
* Process an ACT_OPEN_RPL CPL message.
*/
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
struct cpl_act_open_rpl *rpl = cplhdr(m);
if (cdev->type != T3A && act_open_has_tid(rpl->status))
cxgb_queue_tid_release(cdev, GET_TID(rpl));
active_open_failed(toep, m);
return (0);
}
/*
* Handle an ARP failure for an active open. XXX purge ofo queue
*
* XXX badly broken for crossed SYNs as the ATID is no longer valid.
* XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
* check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
* free the atid. Hmm.
*/
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
struct toepcb *toep = m_get_toep(m);
struct tcpcb *tp = toep->tp_tp;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
inp_wlock(inp);
if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
/*
* drops the inpcb lock
*/
fail_act_open(toep, EHOSTUNREACH);
printf("freeing %p\n", m);
m_free(m);
} else
inp_wunlock(inp);
}
#endif
/*
* Send an active open request.
*/
int
t3_connect(struct toedev *tdev, struct socket *so,
struct rtentry *rt, struct sockaddr *nam)
{
struct mbuf *m;
struct l2t_entry *e;
struct tom_data *d = TOM_DATA(tdev);
struct inpcb *inp = so_sotoinpcb(so);
struct tcpcb *tp = intotcpcb(inp);
struct toepcb *toep; /* allocated by init_offload_socket */
int atid;
toep = toepcb_alloc();
if (toep == NULL)
goto out_err;
if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
goto out_err;
e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
if (!e)
goto free_tid;
inp_lock_assert(inp);
m = m_gethdr(M_WAITOK, MT_DATA);
#if 0
m->m_toe.mt_toepcb = tp->t_toe;
set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
so_lock(so);
init_offload_socket(so, tdev, atid, e, rt, toep);
install_offload_ops(so);
mk_act_open_req(so, m, atid, e);
so_unlock(so);
soisconnecting(so);
toep = tp->t_toe;
m_set_toep(m, tp->t_toe);
toep->tp_state = TCPS_SYN_SENT;
l2t_send(d->cdev, (struct mbuf *)m, e);
if (toep->tp_ulp_mode)
t3_enable_ddp(toep, 0);
return (0);
free_tid:
printf("failing connect - free atid\n");
free_atid(d->cdev, atid);
out_err:
printf("return ENOMEM\n");
return (ENOMEM);
}
/*
* Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
* not send multiple ABORT_REQs for the same connection and also that we do
* not try to send a message after the connection has closed.
*/
static void
t3_send_reset(struct toepcb *toep)
{
struct cpl_abort_req *req;
unsigned int tid = toep->tp_tid;
int mode = CPL_ABORT_SEND_RST;
struct tcpcb *tp = toep->tp_tp;
struct toedev *tdev = toep->tp_toedev;
struct socket *so = NULL;
struct mbuf *m;
struct sockbuf *snd;
if (tp) {
inp_lock_assert(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
}
if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
tdev == NULL))
return;
toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
/* Purge the send queue so we don't send anything after an abort. */
if (so) {
snd = so_sockbuf_snd(so);
sbflush(snd);
}
if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
mode |= CPL_ABORT_POST_CLOSE_REQ;
m = m_gethdr_nofail(sizeof(*req));
m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
set_arp_failure_handler(m, abort_arp_failure);
req = mtod(m, struct cpl_abort_req *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
req->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
req->cmd = mode;
if (tp && (tp->t_state == TCPS_SYN_SENT))
mbufq_tail(&toep->out_of_order_queue, m); // defer
else
l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp;
int error, optval;
if (sopt->sopt_name == IP_OPTIONS)
return (ENOPROTOOPT);
if (sopt->sopt_name != IP_TOS)
return (EOPNOTSUPP);
error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
if (error)
return (error);
if (optval > IPTOS_PREC_CRITIC_ECP)
return (EINVAL);
inp = so_sotoinpcb(so);
inp_wlock(inp);
inp_ip_tos_set(inp, optval);
#if 0
inp->inp_ip_tos = optval;
#endif
t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
inp_wunlock(inp);
return (0);
}
static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
int err = 0;
size_t copied;
if (sopt->sopt_name != TCP_CONGESTION &&
sopt->sopt_name != TCP_NODELAY)
return (EOPNOTSUPP);
if (sopt->sopt_name == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX];
int optlen = sopt->sopt_valsize;
struct tcpcb *tp;
if (sopt->sopt_dir == SOPT_GET) {
KASSERT(0, ("unimplemented"));
return (EOPNOTSUPP);
}
if (optlen < 1)
return (EINVAL);
err = copyinstr(sopt->sopt_val, name,
min(TCP_CA_NAME_MAX - 1, optlen), &copied);
if (err)
return (err);
if (copied < 1)
return (EINVAL);
tp = so_sototcpcb(so);
/*
* XXX I need to revisit this
*/
if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef CONGESTION_CONTROL_SUPPORTED
tp->t_cong_control = strdup(name, M_CXGB);
#endif
} else
return (err);
} else {
int optval, oldval;
struct inpcb *inp;
struct tcpcb *tp;
if (sopt->sopt_dir == SOPT_GET)
return (EOPNOTSUPP);
err = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (err)
return (err);
inp = so_sotoinpcb(so);
inp_wlock(inp);
tp = inp_inpcbtotcpcb(inp);
oldval = tp->t_flags;
if (optval)
tp->t_flags |= TF_NODELAY;
else
tp->t_flags &= ~TF_NODELAY;
inp_wunlock(inp);
if (oldval != tp->t_flags && (tp->t_toe != NULL))
t3_set_nagle(tp->t_toe);
}
return (0);
}
int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
int err;
if (sopt->sopt_level != IPPROTO_TCP)
err = t3_ip_ctloutput(so, sopt);
else
err = t3_tcp_ctloutput(so, sopt);
if (err != EOPNOTSUPP)
return (err);
return (tcp_ctloutput(so, sopt));
}
/*
* Returns true if we need to explicitly request RST when we receive new data
* on an RX-closed connection.
*/
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
return (1);
}
/*
* Handles Rx data that arrives in a state where the socket isn't accepting
* new data.
*/
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{
if (need_rst_on_excess_rx(toep) &&
!(toep->tp_flags & TP_ABORT_SHUTDOWN))
t3_send_reset(toep);
m_freem(m);
}
/*
* Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
* by getting the DDP offset from the TCB.
*/
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
struct ddp_state *q = &toep->tp_ddp_state;
struct ddp_buf_state *bsp;
struct cpl_get_tcb_rpl *hdr;
unsigned int ddp_offset;
struct socket *so;
struct tcpcb *tp;
struct sockbuf *rcv;
int state;
uint64_t t;
__be64 *tcb;
tp = toep->tp_tp;
so = inp_inpcbtosocket(tp->t_inpcb);
inp_lock_assert(tp->t_inpcb);
rcv = so_sockbuf_rcv(so);
sockbuf_lock(rcv);
/* Note that we only account for CPL_GET_TCB issued by the DDP code.
* We really need a cookie in order to dispatch the RPLs.
*/
q->get_tcb_count--;
/* It is possible that a previous CPL already invalidated UBUF DDP
* and moved the cur_buf idx, and hence no further processing of this
* mbuf is required. However, the app might be sleeping on
* !q->get_tcb_count and we need to wake it up.
*/
if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
int state = so_state_get(so);
m_freem(m);
if (__predict_true((state & SS_NOFDREF) == 0))
so_sorwakeup_locked(so);
else
sockbuf_unlock(rcv);
return;
}
bsp = &q->buf_state[q->cur_buf];
hdr = cplhdr(m);
tcb = (__be64 *)(hdr + 1);
if (q->cur_buf == 0) {
t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
} else {
t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
}
ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
m->m_cur_offset = bsp->cur_offset;
bsp->cur_offset = ddp_offset;
m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
CTR5(KTR_TOM,
"tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
KASSERT(ddp_offset >= m->m_cur_offset,
("ddp_offset=%u less than cur_offset=%u",
ddp_offset, m->m_cur_offset));
#if 0
{
unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
rcv_nxt = t >> S_TCB_RCV_NXT;
rcv_nxt &= M_TCB_RCV_NXT;
t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
T3_TRACE2(TIDTB(sk),
"tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
ddp_flags, rcv_nxt - rx_hdr_offset);
T3_TRACE4(TB(q),
"tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
T3_TRACE3(TB(q),
"tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
T3_TRACE2(TB(q),
"tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
q->buf_state[0].flags, q->buf_state[1].flags);
}
#endif
if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
sockbuf_unlock(rcv);
handle_excess_rx(toep, m);
return;
}
#ifdef T3_TRACE
if ((int)m->m_pkthdr.len < 0) {
t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
}
#endif
if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
T3_TRACE0(TB(q),
"tcb_rpl_as_ddp_complete: CANCEL UBUF");
if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
printk("!cancel_ubuf");
t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
}
#endif
m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
q->cur_buf ^= 1;
} else if (bsp->flags & DDP_BF_NOFLIP) {
m->m_ddp_flags = 1; /* always a kernel buffer */
/* now HW buffer carries a user buffer */
bsp->flags &= ~DDP_BF_NOFLIP;
bsp->flags |= DDP_BF_NOCOPY;
/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
* any new data in which case we're done. If in addition the
* offset is 0, then there wasn't a completion for the kbuf
* and we need to decrement the posted count.
*/
if (m->m_pkthdr.len == 0) {
if (ddp_offset == 0) {
q->kbuf_posted--;
bsp->flags |= DDP_BF_NODATA;
}
sockbuf_unlock(rcv);
m_free(m);
return;
}
} else {
sockbuf_unlock(rcv);
/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
* but it got here way late and nobody cares anymore.
*/
m_free(m);
return;
}
m->m_ddp_gl = (unsigned char *)bsp->gl;
m->m_flags |= M_DDP;
m->m_seq = tp->rcv_nxt;
tp->rcv_nxt += m->m_pkthdr.len;
tp->t_rcvtime = ticks;
CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
m->m_seq, q->cur_buf, m->m_pkthdr.len);
if (m->m_pkthdr.len == 0) {
q->user_ddp_pending = 0;
m_free(m);
} else
SBAPPEND(rcv, m);
state = so_state_get(so);
if (__predict_true((state & SS_NOFDREF) == 0))
so_sorwakeup_locked(so);
else
sockbuf_unlock(rcv);
}
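/*
 * Note on the TCB arithmetic above: the reply carries the TCB as
 * big-endian 64-bit slots covering two 32-bit TCB words each, so the
 * field in 32-bit word W lives in slot (31 - W) / 2, in either the
 * high or the low half depending on W's parity; hence the extra
 * 32-bit shift for the BUF0 offset but not for BUF1.
 */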
/*
* Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
* in which case they are similar to DDP completions.
*/
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
/* OK if socket doesn't exist */
if (toep == NULL) {
printf("null toep in do_get_tcb_rpl\n");
return (CPL_RET_BUF_DONE);
}
inp_wlock(toep->tp_tp->t_inpcb);
tcb_rpl_as_ddp_complete(toep, m);
inp_wunlock(toep->tp_tp->t_inpcb);
return (0);
}
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_data *hdr = cplhdr(m);
unsigned int rcv_nxt = ntohl(hdr->seq);
struct sockbuf *rcv;
if (tp->rcv_nxt == rcv_nxt)
return;
inp_lock_assert(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
rcv = so_sockbuf_rcv(so);
sockbuf_lock(rcv);
q = &toep->tp_ddp_state;
bsp = &q->buf_state[q->cur_buf];
KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("rcv_nxt=0x%08x not after tp->rcv_nxt=0x%08x",
rcv_nxt, tp->rcv_nxt));
m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
#ifdef T3_TRACE
if ((int)m->m_pkthdr.len < 0) {
t3_ddp_error(so, "handle_ddp_data: neg len");
}
#endif
m->m_ddp_gl = (unsigned char *)bsp->gl;
m->m_flags |= M_DDP;
m->m_cur_offset = bsp->cur_offset;
m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
if (bsp->flags & DDP_BF_NOCOPY)
bsp->flags &= ~DDP_BF_NOCOPY;
m->m_seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
bsp->cur_offset += m->m_pkthdr.len;
if (!(bsp->flags & DDP_BF_NOFLIP))
q->cur_buf ^= 1;
/*
* For now, don't re-enable DDP after a connection fell out of DDP
* mode.
*/
q->ubuf_ddp_ready = 0;
sockbuf_unlock(rcv);
}
/*
* Process new data received for a connection.
*/
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
struct cpl_rx_data *hdr = cplhdr(m);
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
struct sockbuf *rcv;
int state;
int len = be16toh(hdr->len);
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
if (__predict_false(so_no_receive(so))) {
handle_excess_rx(toep, m);
inp_wunlock(tp->t_inpcb);
TRACE_EXIT;
return;
}
if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
handle_ddp_data(toep, m);
m->m_seq = ntohl(hdr->seq);
m->m_ulp_mode = 0; /* for iSCSI */
#if VALIDATE_SEQ
if (__predict_false(m->m_seq != tp->rcv_nxt)) {
log(LOG_ERR,
"%s: TID %u: Bad sequence number %u, expected %u\n",
toep->tp_toedev->name, toep->tp_tid, m->m_seq,
tp->rcv_nxt);
m_freem(m);
inp_wunlock(tp->t_inpcb);
return;
}
#endif
m_adj(m, sizeof(*hdr));
#ifdef URGENT_DATA_SUPPORTED
/*
* We don't handle urgent data yet
*/
if (__predict_false(hdr->urg))
handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
tp->urg_seq - tp->rcv_nxt < skb->len))
tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
tp->rcv_nxt];
#endif
if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
toep->tp_delack_mode = hdr->dack_mode;
toep->tp_delack_seq = tp->rcv_nxt;
}
CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
if (len < m->m_pkthdr.len)
m->m_pkthdr.len = m->m_len = len;
tp->rcv_nxt += m->m_pkthdr.len;
tp->t_rcvtime = ticks;
toep->tp_enqueued_bytes += m->m_pkthdr.len;
CTR2(KTR_TOM,
"new_rx_data: seq 0x%x len %u",
m->m_seq, m->m_pkthdr.len);
inp_wunlock(tp->t_inpcb);
rcv = so_sockbuf_rcv(so);
sockbuf_lock(rcv);
#if 0
if (sb_notify(rcv))
DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
#endif
SBAPPEND(rcv, m);
#ifdef notyet
/*
* We're giving too many credits to the card - but disable this check so we can keep on moving :-|
*
*/
KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
so, rcv->sb_cc, rcv->sb_mbmax));
#endif
CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
rcv->sb_cc, rcv->sb_mbcnt);
state = so_state_get(so);
if (__predict_true((state & SS_NOFDREF) == 0))
so_sorwakeup_locked(so);
else
sockbuf_unlock(rcv);
}
/*
* Handler for RX_DATA CPL messages.
*/
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
new_rx_data(toep, m);
return (0);
}
static void
new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
struct tcpcb *tp;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_data_ddp *hdr;
struct socket *so;
unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
int nomoredata = 0;
unsigned int delack_mode;
struct sockbuf *rcv;
tp = toep->tp_tp;
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
if (__predict_false(so_no_receive(so))) {
handle_excess_rx(toep, m);
inp_wunlock(tp->t_inpcb);
return;
}
q = &toep->tp_ddp_state;
hdr = cplhdr(m);
ddp_report = ntohl(hdr->u.ddp_report);
buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
bsp = &q->buf_state[buf_idx];
CTR4(KTR_TOM,
"new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
"hdr seq 0x%x len %u",
tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
ntohs(hdr->len));
CTR3(KTR_TOM,
"new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
ddp_len = ntohs(hdr->len);
rcv_nxt = ntohl(hdr->seq) + ddp_len;
delack_mode = G_DDP_DACK_MODE(ddp_report);
if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
toep->tp_delack_mode = delack_mode;
toep->tp_delack_seq = tp->rcv_nxt;
}
m->m_seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
tp->t_rcvtime = ticks;
/*
* Store the length in m->m_len. We are changing the meaning of
* m->m_len here, so we need to be very careful that nothing from now on
* interprets the length of this packet the usual way.
*/
m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
inp_wunlock(tp->t_inpcb);
CTR3(KTR_TOM,
"new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
m->m_len, rcv_nxt, m->m_seq);
/*
* Figure out where the new data was placed in the buffer and store it
* in m_cur_offset. Assumes the buffer offset starts at 0; the consumer
* needs to account for the page pod's pg_offset.
*/
end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
m->m_cur_offset = end_offset - m->m_pkthdr.len;
rcv = so_sockbuf_rcv(so);
sockbuf_lock(rcv);
m->m_ddp_gl = (unsigned char *)bsp->gl;
m->m_flags |= M_DDP;
bsp->cur_offset = end_offset;
toep->tp_enqueued_bytes += m->m_pkthdr.len;
/*
* Length is only meaningful for kbuf
*/
if (!(bsp->flags & DDP_BF_NOCOPY))
KASSERT(m->m_len <= bsp->gl->dgl_length,
("length received exceeds ddp pages: len=%d dgl_length=%d",
m->m_len, bsp->gl->dgl_length));
KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
KASSERT(m->m_next == NULL, ("m_next=%p", m->m_next));
/*
* Bit 0 of flags stores whether the DDP buffer is completed.
* Note that other parts of the code depend on this being in bit 0.
*/
if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
panic("spurious ddp completion");
} else {
m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
q->cur_buf ^= 1; /* flip buffers */
}
if (bsp->flags & DDP_BF_NOCOPY) {
m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
bsp->flags &= ~DDP_BF_NOCOPY;
}
if (ddp_report & F_DDP_PSH)
m->m_ddp_flags |= DDP_BF_PSH;
if (nomoredata)
m->m_ddp_flags |= DDP_BF_NODATA;
#ifdef notyet
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
#endif
SBAPPEND(rcv, m);
if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
(((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
so_sorwakeup_locked(so);
else
sockbuf_unlock(rcv);
}
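/*
 * Bit-0 convention used above (and in the other DDP receive paths):
 * the DDP completion state is kept in bit 0 of m_ddp_flags, so a
 * consumer can test (m->m_ddp_flags & 1) to learn whether the hardware
 * finished the buffer, and the q->cur_buf ^= 1 flip keeps cur_buf
 * tracking the buffer the hardware will fill next.
 */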
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
F_DDP_INVALID_PPOD)
/*
* Handler for RX_DATA_DDP CPL messages.
*/
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = ctx;
const struct cpl_rx_data_ddp *hdr = cplhdr(m);
VALIDATE_SOCK(so);
if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
return (CPL_RET_BUF_DONE);
}
#if 0
skb->h.th = tcphdr_skb->h.th;
#endif
new_rx_data_ddp(toep, m);
return (0);
}
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_ddp_complete *hdr;
unsigned int ddp_report, buf_idx, when, delack_mode;
int nomoredata = 0;
struct sockbuf *rcv;
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
if (__predict_false(so_no_receive(so))) {
struct inpcb *inp = so_sotoinpcb(so);
handle_excess_rx(toep, m);
inp_wunlock(inp);
return;
}
q = &toep->tp_ddp_state;
hdr = cplhdr(m);
ddp_report = ntohl(hdr->ddp_report);
buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
m->m_pkthdr.csum_data = tp->rcv_nxt;
rcv = so_sockbuf_rcv(so);
sockbuf_lock(rcv);
bsp = &q->buf_state[buf_idx];
when = bsp->cur_offset;
m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
tp->rcv_nxt += m->m_len;
tp->t_rcvtime = ticks;
delack_mode = G_DDP_DACK_MODE(ddp_report);
if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
toep->tp_delack_mode = delack_mode;
toep->tp_delack_seq = tp->rcv_nxt;
}
#ifdef notyet
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
#endif
inp_wunlock(tp->t_inpcb);
KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
CTR5(KTR_TOM,
"process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
"ddp_report 0x%x offset %u, len %u",
tp->rcv_nxt, bsp->cur_offset, ddp_report,
G_DDP_OFFSET(ddp_report), m->m_len);
m->m_cur_offset = bsp->cur_offset;
bsp->cur_offset += m->m_len;
if (!(bsp->flags & DDP_BF_NOFLIP)) {
q->cur_buf ^= 1; /* flip buffers */
if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
nomoredata = 1;
}
CTR4(KTR_TOM,
"process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
"ddp_report %u offset %u",
tp->rcv_nxt, bsp->cur_offset, ddp_report,
G_DDP_OFFSET(ddp_report));
m->m_ddp_gl = (unsigned char *)bsp->gl;
m->m_flags |= M_DDP;
m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
if (bsp->flags & DDP_BF_NOCOPY)
bsp->flags &= ~DDP_BF_NOCOPY;
if (nomoredata)
m->m_ddp_flags |= DDP_BF_NODATA;
SBAPPEND(rcv, m);
if ((so_state_get(so) & SS_NOFDREF) == 0)
so_sorwakeup_locked(so);
else
sockbuf_unlock(rcv);
}
/*
* Handler for RX_DDP_COMPLETE CPL messages.
*/
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = ctx;
VALIDATE_SOCK(so);
#if 0
skb->h.th = tcphdr_skb->h.th;
#endif
process_ddp_complete(toep, m);
return (0);
}
/*
* Move a socket to TIME_WAIT state. We need to make some adjustments to the
* socket state before calling tcp_time_wait to comply with its expectations.
*/
static void
enter_timewait(struct tcpcb *tp)
{
/*
* Bump rcv_nxt for the peer FIN. We don't do this at the time we
* process peer_close because we don't want to carry the peer FIN in
* the socket's receive queue and if we increment rcv_nxt without
* having the FIN in the receive queue we'll confuse facilities such
* as SIOCINQ.
*/
inp_wlock(tp->t_inpcb);
tp->rcv_nxt++;
tp->ts_recent_age = 0; /* defeat recycling */
tp->t_srtt = 0; /* defeat tcp_update_metrics */
inp_wunlock(tp->t_inpcb);
tcp_offload_twstart(tp);
}
/*
* For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
* function deals with the data that may be reported along with the FIN.
* Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
* perform normal FIN-related processing. In the latter case 1 indicates that
* there was an implicit RX_DDP_COMPLETE and the mbuf should not be freed, 0
* that the mbuf can be freed.
*/
static int
handle_peer_close_data(struct socket *so, struct mbuf *m)
{
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_peer_close *req = cplhdr(m);
unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
struct sockbuf *rcv;
if (tp->rcv_nxt == rcv_nxt) /* no data */
return (0);
CTR0(KTR_TOM, "handle_peer_close_data");
if (__predict_false(so_no_receive(so))) {
handle_excess_rx(toep, m);
/*
* Although we discard the data we want to process the FIN so
* that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
* PEER_CLOSE without data. In particular this PEER_CLOSE
* may be what will close the connection. We return 1 because
* handle_excess_rx() already freed the packet.
*/
return (1);
}
inp_lock_assert(tp->t_inpcb);
q = &toep->tp_ddp_state;
rcv = so_sockbuf_rcv(so);
sockbuf_lock(rcv);
bsp = &q->buf_state[q->cur_buf];
m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
m->m_ddp_gl = (unsigned char *)bsp->gl;
m->m_flags |= M_DDP;
m->m_cur_offset = bsp->cur_offset;
m->m_ddp_flags =
DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
m->m_seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
bsp->cur_offset += m->m_pkthdr.len;
if (!(bsp->flags & DDP_BF_NOFLIP))
q->cur_buf ^= 1;
#ifdef notyet
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
#endif
tp->t_rcvtime = ticks;
SBAPPEND(rcv, m);
if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
so_sorwakeup_locked(so);
else
sockbuf_unlock(rcv);
return (1);
}
/*
* Handle a peer FIN.
*/
static void
do_peer_fin(struct toepcb *toep, struct mbuf *m)
{
struct socket *so;
struct tcpcb *tp = toep->tp_tp;
int keep, action;
action = keep = 0;
CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
printf("abort_pending set\n");
goto out;
}
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
keep = handle_peer_close_data(so, m);
if (keep < 0) {
inp_wunlock(tp->t_inpcb);
return;
}
}
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
CTR1(KTR_TOM,
"waking up waiters for cantrcvmore on %p ", so);
socantrcvmore(so);
/*
* If connection is half-synchronized
* (ie NEEDSYN flag on) then delay ACK,
* so it may be piggybacked when SYN is sent.
* Otherwise, since we received a FIN then no
* more input can be expected, send ACK now.
*/
if (tp->t_flags & TF_NEEDSYN)
tp->t_flags |= TF_DELACK;
else
tp->t_flags |= TF_ACKNOW;
tp->rcv_nxt++;
}
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
tp->t_starttime = ticks;
/* FALLTHROUGH */
case TCPS_ESTABLISHED:
tp->t_state = TCPS_CLOSE_WAIT;
break;
case TCPS_FIN_WAIT_1:
tp->t_state = TCPS_CLOSING;
break;
case TCPS_FIN_WAIT_2:
/*
* If we've sent an abort_req we must have sent it too late,
* HW will send us a reply telling us so, and this peer_close
* is really the last message for this connection and needs to
* be treated as an abort_rpl, i.e., transition the connection
* to TCP_CLOSE (note that the host stack does this at the
* time of generating the RST but we must wait for HW).
* Otherwise we enter TIME_WAIT.
*/
t3_release_offload_resources(toep);
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
action = TCP_CLOSE;
} else {
action = TCP_TIMEWAIT;
}
break;
default:
log(LOG_ERR,
"%s: TID %u received PEER_CLOSE in bad state %d\n",
toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
}
inp_wunlock(tp->t_inpcb);
if (action == TCP_TIMEWAIT) {
enter_timewait(tp);
} else if (action == TCP_DROP) {
tcp_offload_drop(tp, 0);
} else if (action == TCP_CLOSE) {
tcp_offload_close(tp);
}
#ifdef notyet
/* Do not send POLL_HUP for half duplex close. */
if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(so, 1, POLL_HUP);
else
sk_wake_async(so, 1, POLL_IN);
#endif
out:
if (!keep)
m_free(m);
}
/*
* Handler for PEER_CLOSE CPL messages.
*/
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
VALIDATE_SOCK(so);
do_peer_fin(toep, m);
return (0);
}
static void
process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
{
struct cpl_close_con_rpl *rpl = cplhdr(m);
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
int action = 0;
struct sockbuf *rcv;
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
inp_wunlock(tp->t_inpcb);
goto out;
}
CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
switch (tp->t_state) {
case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
t3_release_offload_resources(toep);
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
action = TCP_CLOSE;
} else {
action = TCP_TIMEWAIT;
}
break;
case TCPS_LAST_ACK:
/*
* In this state we don't care about pending abort_rpl.
* If we've sent abort_req it was post-close and was sent too
* late, this close_con_rpl is the actual last message.
*/
t3_release_offload_resources(toep);
action = TCP_CLOSE;
break;
case TCPS_FIN_WAIT_1:
/*
* If we can't receive any more
* data, then closing user can proceed.
* Starting the timer is contrary to the
* specification, but if we don't get a FIN
* we'll hang forever.
*
* XXXjl:
* we should release the tp also, and use a
* compressed state.
*/
if (so)
rcv = so_sockbuf_rcv(so);
else
break;
if (rcv->sb_state & SBS_CANTRCVMORE) {
int timeout;
if (so)
soisdisconnected(so);
timeout = (tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : tcp_maxidle;
tcp_timer_activate(tp, TT_2MSL, timeout);
}
tp->t_state = TCPS_FIN_WAIT_2;
if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
(toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
action = TCP_DROP;
}
break;
default:
log(LOG_ERR,
"%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
toep->tp_toedev->tod_name, toep->tp_tid,
tp->t_state);
}
inp_wunlock(tp->t_inpcb);
if (action == TCP_TIMEWAIT) {
enter_timewait(tp);
} else if (action == TCP_DROP) {
tcp_offload_drop(tp, 0);
} else if (action == TCP_CLOSE) {
tcp_offload_close(tp);
}
out:
m_freem(m);
}
/*
* Handler for CLOSE_CON_RPL CPL messages.
*/
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
process_close_con_rpl(toep, m);
return (0);
}
/*
* Process abort replies. We only process these messages if we anticipate
* them as the coordination between SW and HW in this area is somewhat lacking
* and sometimes we get ABORT_RPLs after we are done with the connection that
* originated the ABORT_REQ.
*/
static void
process_abort_rpl(struct toepcb *toep, struct mbuf *m)
{
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
int needclose = 0;
#ifdef T3_TRACE
T3_TRACE1(TIDTB(sk),
"process_abort_rpl: GTS rpl pending %d",
sock_flag(sk, ABORT_RPL_PENDING));
#endif
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
/*
* XXX panic on tcpdrop
*/
if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
toep->tp_flags |= TP_ABORT_RPL_RCVD;
else {
toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
!is_t3a(toep->tp_toedev)) {
if (toep->tp_flags & TP_ABORT_REQ_RCVD)
panic("TP_ABORT_REQ_RCVD set");
t3_release_offload_resources(toep);
needclose = 1;
}
}
}
inp_wunlock(tp->t_inpcb);
if (needclose)
tcp_offload_close(tp);
m_free(m);
}
/*
* Handle an ABORT_RPL_RSS CPL message.
*/
static int
do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct cpl_abort_rpl_rss *rpl = cplhdr(m);
struct toepcb *toep;
/*
* Ignore replies to post-close aborts indicating that the abort was
* requested too late. These connections are terminated when we get
* PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
* arrives the TID is either no longer used or it has been recycled.
*/
if (rpl->status == CPL_ERR_ABORT_FAILED) {
discard:
m_free(m);
return (0);
}
toep = (struct toepcb *)ctx;
/*
* Sometimes we've already closed the socket, e.g., a post-close
* abort races with ABORT_REQ_RSS, the latter frees the socket
* expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
* but FW turns the ABORT_REQ into a regular one and so we get
* ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
*/
if (!toep)
goto discard;
if (toep->tp_tp == NULL) {
log(LOG_NOTICE, "removing tid for abort\n");
cxgb_remove_tid(cdev, toep, toep->tp_tid);
if (toep->tp_l2t)
l2t_release(L2DATA(cdev), toep->tp_l2t);
toepcb_release(toep);
goto discard;
}
log(LOG_NOTICE, "toep=%p\n", toep);
log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
toepcb_hold(toep);
process_abort_rpl(toep, m);
toepcb_release(toep);
return (0);
}
/*
* Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
* indicate whether RST should be sent in response.
*/
static int
abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
{
struct tcpcb *tp = so_sototcpcb(so);
switch (abort_reason) {
case CPL_ERR_BAD_SYN:
#if 0
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
#endif
case CPL_ERR_CONN_RESET:
// XXX need to handle SYN_RECV due to crossed SYNs
return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
case CPL_ERR_XMIT_TIMEDOUT:
case CPL_ERR_PERSIST_TIMEDOUT:
case CPL_ERR_FINWAIT2_TIMEDOUT:
case CPL_ERR_KEEPALIVE_TIMEDOUT:
#if 0
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
#endif
return (ETIMEDOUT);
default:
return (EIO);
}
}
static inline void
set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
{
struct cpl_abort_rpl *rpl = cplhdr(m);
rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
rpl->wr.wr_lo = htonl(V_WR_TID(tid));
m->m_len = m->m_pkthdr.len = sizeof(*rpl);
OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
rpl->cmd = cmd;
}
static void
send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
{
struct mbuf *reply_mbuf;
struct cpl_abort_req_rss *req = cplhdr(m);
reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
m_free(m);
}
/*
* Returns whether an ABORT_REQ_RSS message is a negative advice.
*/
static inline int
is_neg_adv_abort(unsigned int status)
{
return status == CPL_ERR_RTX_NEG_ADVICE ||
status == CPL_ERR_PERSIST_NEG_ADVICE;
}
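/*
 * Negative advice merely reports that the retransmit or persist timer
 * backed off because the peer has been unresponsive; the connection is
 * still alive, so do_abort_req() below drops such messages rather than
 * tearing the connection down.
 */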
static void
send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
{
struct mbuf *reply_mbuf;
struct cpl_abort_req_rss *req = cplhdr(m);
reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
if (!reply_mbuf) {
/* Defer the reply. Stick rst_status into req->status. */
req->status = rst_status;
t3_defer_reply(m, tdev, send_deferred_abort_rpl);
return;
}
m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
m_free(m);
/*
* XXX need to sync with ARP as for SYN_RECV connections we can send
* these messages while ARP is pending. For other connection states
* it's not a problem.
*/
cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
}
#ifdef notyet
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
CXGB_UNIMPLEMENTED();
#ifdef notyet
struct request_sock *req = child->sk_user_data;
inet_csk_reqsk_queue_removed(parent, req);
synq_remove(tcp_sk(child));
__reqsk_free(req);
child->sk_user_data = NULL;
#endif
}
/*
* Performs the actual work to abort a SYN_RECV connection.
*/
static void
do_abort_syn_rcv(struct socket *child, struct socket *parent)
{
struct tcpcb *parenttp = so_sototcpcb(parent);
struct tcpcb *childtp = so_sototcpcb(child);
/*
* If the server is still open we clean up the child connection,
* otherwise the server already did the clean up as it was purging
* its SYN queue and the skb was just sitting in its backlog.
*/
if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
cleanup_syn_rcv_conn(child, parent);
inp_wlock(childtp->t_inpcb);
t3_release_offload_resources(childtp->t_toe);
inp_wunlock(childtp->t_inpcb);
tcp_offload_close(childtp);
}
}
#endif
/*
* Handle abort requests for a SYN_RECV connection. These need extra work
* because the socket is on its parent's SYN queue.
*/
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
CXGB_UNIMPLEMENTED();
#ifdef notyet
struct socket *parent;
struct toedev *tdev = toep->tp_toedev;
struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
struct socket *oreq = so->so_incomp;
struct t3c_tid_entry *t3c_stid;
struct tid_info *t;
if (!oreq)
return -1; /* somehow we are not on the SYN queue */
t = &(T3C_DATA(cdev))->tid_maps;
t3c_stid = lookup_stid(t, oreq->ts_recent);
parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
so_lock(parent);
do_abort_syn_rcv(so, parent);
send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
so_unlock(parent);
#endif
return (0);
}
/*
* Process abort requests. If we are waiting for an ABORT_RPL we ignore this
* request except that we need to reply to it.
*/
static void
process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
{
int rst_status = CPL_ABORT_NO_RST;
const struct cpl_abort_req_rss *req = cplhdr(m);
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
int needclose = 0;
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
m_free(m);
goto skip;
}
toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
/*
* Three cases to consider:
* a) We haven't sent an abort_req; close the connection.
* b) We have sent a post-close abort_req that will get to TP too late
* and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
* be ignored and the connection should be closed now.
* c) We have sent a regular abort_req that will get to TP too late.
* That will generate an abort_rpl with status 0, wait for it.
*/
if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
(is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
int error;
error = abort_status_to_errno(so, req->status,
&rst_status);
so_error_set(so, error);
if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
so_sorwakeup(so);
/*
* SYN_RECV needs special processing. If abort_syn_rcv()
* returns 0 it has taken care of the abort.
*/
if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
goto skip;
t3_release_offload_resources(toep);
needclose = 1;
}
inp_wunlock(tp->t_inpcb);
if (needclose)
tcp_offload_close(tp);
send_abort_rpl(m, tdev, rst_status);
return;
skip:
inp_wunlock(tp->t_inpcb);
}
/*
* Handle an ABORT_REQ_RSS CPL message.
*/
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
const struct cpl_abort_req_rss *req = cplhdr(m);
struct toepcb *toep = (struct toepcb *)ctx;
if (is_neg_adv_abort(req->status)) {
m_free(m);
return (0);
}
log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
cxgb_remove_tid(cdev, toep, toep->tp_tid);
toep->tp_flags |= TP_ABORT_REQ_RCVD;
send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
if (toep->tp_l2t)
l2t_release(L2DATA(cdev), toep->tp_l2t);
/*
* Unhook
*/
toep->tp_tp->t_toe = NULL;
toep->tp_tp->t_flags &= ~TF_TOE;
toep->tp_tp = NULL;
/*
* XXX need to call syncache_chkrst - but we don't
* have a way of doing that yet
*/
toepcb_release(toep);
log(LOG_ERR, "abort for unestablished connection :-(\n");
return (0);
}
if (toep->tp_tp == NULL) {
log(LOG_NOTICE, "disconnected toepcb\n");
/* should be freed momentarily */
return (0);
}
toepcb_hold(toep);
process_abort_req(toep, m, toep->tp_toedev);
toepcb_release(toep);
return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
struct toedev *tdev = TOE_DEV(parent);
do_abort_syn_rcv(child, parent);
if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
struct cpl_pass_accept_rpl *rpl = cplhdr(m);
rpl->opt0h = htonl(F_TCAM_BYPASS);
rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
} else
m_free(m);
}
#endif
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
CXGB_UNIMPLEMENTED();
#ifdef notyet
struct t3cdev *cdev;
struct socket *parent;
struct socket *oreq;
struct t3c_tid_entry *t3c_stid;
struct tid_info *t;
struct tcpcb *otp, *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
/*
* If the connection is being aborted due to the parent listening
* socket going away there's nothing to do, the ABORT_REQ will close
* the connection.
*/
if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
m_free(m);
return;
}
oreq = so->so_incomp;
otp = so_sototcpcb(oreq);
cdev = T3C_DEV(so);
t = &(T3C_DATA(cdev))->tid_maps;
t3c_stid = lookup_stid(t, otp->ts_recent);
parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
so_lock(parent);
pass_open_abort(so, parent, m);
so_unlock(parent);
#endif
}
/*
* Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
* to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
* connection.
*/
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
#ifdef notyet
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
handle_pass_open_arp_failure(m_get_socket(m), m);
}
/*
* Populate a reject CPL_PASS_ACCEPT_RPL WR.
*/
static void
mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
{
struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
unsigned int tid = GET_TID(req);
m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
rpl->peer_ip = req->peer_ip; /* req->peer_ip not overwritten yet */
rpl->opt0h = htonl(F_TCAM_BYPASS);
rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
rpl->opt2 = 0;
rpl->rsvd = rpl->opt2; /* workaround for HW bug */
}
/*
* Send a deferred reject to an accept request.
*/
static void
reject_pass_request(struct toedev *tdev, struct mbuf *m)
{
struct mbuf *reply_mbuf;
reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
mk_pass_accept_rpl(reply_mbuf, m);
cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
m_free(m);
}
static void
handle_syncache_event(int event, void *arg)
{
struct toepcb *toep = arg;
switch (event) {
case TOE_SC_ENTRY_PRESENT:
/*
* entry already exists - free toepcb
* and l2t
*/
printf("syncache entry present\n");
toepcb_release(toep);
break;
case TOE_SC_DROP:
/*
* The syncache has given up on this entry:
* either it timed out or it was evicted.
* We need to explicitly release the tid.
*/
printf("syncache entry dropped\n");
toepcb_release(toep);
break;
default:
log(LOG_ERR, "unknown syncache event %d\n", event);
break;
}
}
static void
syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
{
struct in_conninfo inc;
struct tcpopt to;
struct tcphdr th;
struct inpcb *inp;
int mss, wsf, sack, ts;
uint32_t rcv_isn = ntohl(req->rcv_isn);
bzero(&to, sizeof(struct tcpopt));
inp = so_sotoinpcb(lso);
/*
* Fill out information for entering us into the syncache
*/
bzero(&inc, sizeof(inc));
inc.inc_fport = th.th_sport = req->peer_port;
inc.inc_lport = th.th_dport = req->local_port;
th.th_seq = req->rcv_isn;
th.th_flags = TH_SYN;
toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
inc.inc_isipv6 = 0;
inc.inc_len = 0;
inc.inc_faddr.s_addr = req->peer_ip;
inc.inc_laddr.s_addr = req->local_ip;
DPRINTF("syncache add of %d:%d %d:%d\n",
ntohl(req->local_ip), ntohs(req->local_port),
ntohl(req->peer_ip), ntohs(req->peer_port));
mss = req->tcp_options.mss;
wsf = req->tcp_options.wsf;
ts = req->tcp_options.tstamp;
sack = req->tcp_options.sack;
to.to_mss = mss;
to.to_wscale = wsf;
to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
}
/*
* Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
* lock held. Note that the sock here is a listening socket that is not owned
* by the TOE.
*/
static void
process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
struct listen_ctx *lctx)
{
int rt_flags;
struct l2t_entry *e;
struct iff_mac tim;
struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
struct cpl_pass_accept_rpl *rpl;
struct cpl_pass_accept_req *req = cplhdr(m);
unsigned int tid = GET_TID(req);
struct tom_data *d = TOM_DATA(tdev);
struct t3cdev *cdev = d->cdev;
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *newtoep;
struct rtentry *dst;
struct sockaddr_in nam;
struct t3c_data *td = T3C_DATA(cdev);
reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
if (__predict_false(reply_mbuf == NULL)) {
if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
t3_defer_reply(m, tdev, reject_pass_request);
else {
cxgb_queue_tid_release(cdev, tid);
m_free(m);
}
DPRINTF("failed to get reply_mbuf\n");
goto out;
}
if (tp->t_state != TCPS_LISTEN) {
DPRINTF("socket not in listen state\n");
goto reject;
}
tim.mac_addr = req->dst_mac;
tim.vlan_tag = ntohs(req->vlan_tag);
if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
goto reject;
}
#ifdef notyet
/*
* XXX do route lookup to confirm that we're still listening on this
* address
*/
if (ip_route_input(skb, req->local_ip, req->peer_ip,
G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
goto reject;
rt_flags = ((struct rtable *)skb->dst)->rt_flags &
(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
dst_release(skb->dst); // done with the input route, release it
skb->dst = NULL;
if ((rt_flags & RTF_LOCAL) == 0)
goto reject;
#endif
/*
* XXX route lookup not implemented yet; assume the address is local.
*/
rt_flags = RTF_LOCAL;
if ((rt_flags & RTF_LOCAL) == 0)
goto reject;
/*
* Calculate values and add to syncache
*/
newtoep = toepcb_alloc();
if (newtoep == NULL)
goto reject;
bzero(&nam, sizeof(struct sockaddr_in));
nam.sin_len = sizeof(struct sockaddr_in);
nam.sin_family = AF_INET;
nam.sin_addr.s_addr = req->peer_ip;
dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
if (dst == NULL) {
printf("failed to find route\n");
goto reject;
}
e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
(struct sockaddr *)&nam);
if (e == NULL) {
DPRINTF("failed to get l2t\n");
goto reject; /* e is dereferenced unconditionally below */
}
/*
* Point to our listen socket until accept
*/
newtoep->tp_tp = tp;
newtoep->tp_flags = TP_SYN_RCVD;
newtoep->tp_tid = tid;
newtoep->tp_toedev = tdev;
tp->rcv_wnd = select_rcv_wnd(tdev, so);
cxgb_insert_tid(cdev, d->client, newtoep, tid);
so_lock(so);
LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
so_unlock(so);
newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
if (newtoep->tp_ulp_mode) {
ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
if (ddp_mbuf == NULL)
newtoep->tp_ulp_mode = 0;
}
CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
/*
* XXX workaround for lack of syncache drop
*/
toepcb_hold(newtoep);
syncache_add_accept_req(req, so, newtoep);
rpl = cplhdr(reply_mbuf);
reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
rpl->wr.wr_lo = 0;
OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
rpl->opt2 = htonl(calc_opt2(so, tdev));
rpl->rsvd = rpl->opt2; /* workaround for HW bug */
rpl->peer_ip = req->peer_ip; /* req->peer_ip is not overwritten */
rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
CPL_PASS_OPEN_ACCEPT);
DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
l2t_send(cdev, reply_mbuf, e);
m_free(m);
if (newtoep->tp_ulp_mode) {
__set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_MASK,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_VAL, 1);
} else
printf("not offloading\n");
return;
reject:
if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
mk_pass_accept_rpl(reply_mbuf, m);
else
mk_tid_release(reply_mbuf, newtoep, tid);
cxgb_ofld_send(cdev, reply_mbuf);
m_free(m);
out:
#if 0
TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#else
return;
#endif
}
/*
* Handle a CPL_PASS_ACCEPT_REQ message.
*/
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
struct tom_data *d = listen_ctx->tom_data;
#if VALIDATE_TID
struct cpl_pass_accept_req *req = cplhdr(m);
unsigned int tid = GET_TID(req);
struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
if (unlikely(!lsk)) {
printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
cdev->name,
(unsigned long)((union listen_entry *)ctx -
t->stid_tab));
return CPL_RET_BUF_DONE;
}
if (unlikely(tid >= t->ntids)) {
printk(KERN_ERR "%s: passive open TID %u too large\n",
cdev->name, tid);
return CPL_RET_BUF_DONE;
}
/*
* For T3A the current user of the TID may have closed but its last
* message(s) may have been backlogged so the TID appears to be still
* in use. Just take the TID away, the connection can close at its
* own leisure. For T3B this situation is a bug.
*/
if (!valid_new_tid(t, tid) &&
cdev->type != T3A) {
printk(KERN_ERR "%s: passive open uses existing TID %u\n",
cdev->name, tid);
return CPL_RET_BUF_DONE;
}
#endif
process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
return (0);
}
/*
* Called when a connection is established to translate the TCP options
* reported by HW to FreeBSD's native format.
*/
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
inp_lock_assert(tp->t_inpcb);
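/*
 * The 40 subtracted below accounts for the 20-byte IP and 20-byte TCP
 * headers: MTU minus headers yields the MSS clamp.
 */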
toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE))
tp->rcv_scale = tp->request_r_scale;
}
/*
* Completes some final bits of initialization for just established connections
* and changes their state to TCP_ESTABLISHED.
*
* snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
*/
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
assign_rxopt(so, opt);
/*
 * XXX
 */
#ifdef notyet
so->so_proto->pr_ctloutput = t3_ctloutput;
#endif
#if 0
inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif
/*
* XXX not clear what rcv_wup maps to
*/
/*
* Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
* pass through opt0.
*/
if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
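/*
 * Worked example (illustrative numbers; given the << 10 above, opt0
 * apparently expresses the window in 1KB units): if rcv_wnd exceeds
 * M_RCV_BUFSIZ KB, the excess is subtracted from tp_rcv_wup so the
 * first RX_DATA_ACK returns the difference to the HW as Rx credits.
 */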
dump_toepcb(toep);
#ifdef notyet
/*
* no clean interface for marking ARP up to date
*/
dst_confirm(sk->sk_dst_cache);
#endif
tp->t_starttime = ticks;
tp->t_state = TCPS_ESTABLISHED;
soisconnected(so);
}
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{
struct in_conninfo inc;
struct tcpopt to;
struct tcphdr th;
int mss, wsf, sack, ts;
struct mbuf *m = NULL;
const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
unsigned int opt;
#ifdef MAC
#error "no MAC support"
#endif
opt = ntohs(req->tcp_opt);
bzero(&to, sizeof(struct tcpopt));
/*
* Fill out information for entering us into the syncache
*/
bzero(&inc, sizeof(inc));
inc.inc_fport = th.th_sport = req->peer_port;
inc.inc_lport = th.th_dport = req->local_port;
th.th_seq = req->rcv_isn;
th.th_flags = TH_ACK;
inc.inc_isipv6 = 0;
inc.inc_len = 0;
inc.inc_faddr.s_addr = req->peer_ip;
inc.inc_laddr.s_addr = req->local_ip;
mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
wsf = G_TCPOPT_WSCALE_OK(opt);
ts = G_TCPOPT_TSTAMP(opt);
sack = G_TCPOPT_SACK(opt);
to.to_mss = mss;
to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
ntohl(req->local_ip), ntohs(req->local_port),
ntohl(req->peer_ip), ntohs(req->peer_port),
mss, wsf, ts, sack);
return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
}
/*
* Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
* if we are in TCP_SYN_RECV due to crossed SYNs
*/
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct cpl_pass_establish *req = cplhdr(m);
struct toepcb *toep = (struct toepcb *)ctx;
struct tcpcb *tp = toep->tp_tp;
struct socket *so, *lso;
struct t3c_data *td = T3C_DATA(cdev);
struct sockbuf *snd, *rcv;
/* Complete socket initialization now that we have the SND_ISN. */
struct toedev *tdev;
tdev = toep->tp_toedev;
inp_wlock(tp->t_inpcb);
/*
*
* XXX need to add reference while we're manipulating
*/
so = lso = inp_inpcbtosocket(tp->t_inpcb);
inp_wunlock(tp->t_inpcb);
so_lock(so);
LIST_REMOVE(toep, synq_entry);
so_unlock(so);
if (!syncache_expand_establish_req(req, &so, toep)) {
/*
* No entry
*/
CXGB_UNIMPLEMENTED();
}
if (so == NULL) {
/*
* Couldn't create the socket
*/
CXGB_UNIMPLEMENTED();
}
tp = so_sototcpcb(so);
inp_wlock(tp->t_inpcb);
snd = so_sockbuf_snd(so);
rcv = so_sockbuf_rcv(so);
snd->sb_flags |= SB_NOCOALESCE;
rcv->sb_flags |= SB_NOCOALESCE;
toep->tp_tp = tp;
toep->tp_flags = 0;
tp->t_toe = toep;
reset_wr_list(toep);
tp->rcv_wnd = select_rcv_wnd(tdev, so);
tp->rcv_nxt = toep->tp_copied_seq;
install_offload_ops(so);
toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
toep->tp_wr_unacked = 0;
toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
toep->tp_qset_idx = 0;
toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
/*
* XXX Cancel any keep alive timer
*/
make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
/*
* XXX workaround for lack of syncache drop
*/
toepcb_release(toep);
inp_wunlock(tp->t_inpcb);
CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
/*
* XXX not sure how these checks map to us
*/
if (unlikely(sk->sk_socket)) { // simultaneous opens only
sk->sk_state_change(sk);
sk_wake_async(so, 0, POLL_OUT);
}
/*
* The state for the new connection is now up to date.
* Next check if we should add the connection to the parent's
* accept queue. When the parent closes it resets connections
* on its SYN queue, so check if we are being reset. If so we
* don't need to do anything more, the coming ABORT_RPL will
* destroy this socket. Otherwise move the connection to the
* accept queue.
*
* Note that we reset the synq before closing the server so if
* we are not being reset the stid is still open.
*/
if (unlikely(!tp->forward_skb_hint)) { // removed from synq
__kfree_skb(skb);
goto unlock;
}
#endif
m_free(m);
return (0);
}
/*
* Fill in the right TID for CPL messages waiting in the out-of-order queue
* and send them to the TOE.
*/
static void
fixup_and_send_ofo(struct toepcb *toep)
{
struct mbuf *m;
struct toedev *tdev = toep->tp_toedev;
struct tcpcb *tp = toep->tp_tp;
unsigned int tid = toep->tp_tid;
log(LOG_NOTICE, "fixup_and_send_ofo\n");
inp_lock_assert(tp->t_inpcb);
while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
/*
* A variety of messages can be waiting but the fields we'll
* be touching are common to all so any message type will do.
*/
struct cpl_close_con_req *p = cplhdr(m);
p->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
}
}
/*
* Updates socket state from an active establish CPL message. Runs with the
* socket lock held.
*/
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
+ INIT_VNET_INET(so->so_vnet);
struct cpl_act_establish *req = cplhdr(m);
u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
struct tcpcb *tp = so_sototcpcb(so);
struct toepcb *toep = tp->t_toe;
if (__predict_false(tp->t_state != TCPS_SYN_SENT))
log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
toep->tp_tid, tp->t_state);
tp->ts_recent_age = ticks;
tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
/*
* Now that we finally have a TID, send any CPL messages that we had to
* defer for lack of a TID.
*/
if (mbufq_len(&toep->out_of_order_queue))
fixup_and_send_ofo(toep);
if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
/*
* XXX does this even make sense?
*/
so_sorwakeup(so);
}
m_free(m);
#ifdef notyet
/*
* XXX assume no write requests permitted while socket connection is
* incomplete
*/
/*
* Currently the send queue must be empty at this point because the
* socket layer does not send anything before a connection is
* established. To be future proof though we handle the possibility
* that there are pending buffers to send (either TX_DATA or
* CLOSE_CON_REQ). First we need to adjust the sequence number of the
* buffers according to the just learned write_seq, and then we send
* them on their way.
*/
fixup_pending_writeq_buffers(sk);
if (t3_push_frames(so, 1))
sk->sk_write_space(sk);
#endif
toep->tp_state = tp->t_state;
V_tcpstat.tcps_connects++;
}
/*
* Process a CPL_ACT_ESTABLISH message.
*/
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct cpl_act_establish *req = cplhdr(m);
unsigned int tid = GET_TID(req);
unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
struct toepcb *toep = (struct toepcb *)ctx;
struct tcpcb *tp = toep->tp_tp;
struct socket *so;
struct toedev *tdev;
struct tom_data *d;
if (tp == NULL) {
free_atid(cdev, atid);
return (0);
}
inp_wlock(tp->t_inpcb);
/*
* XXX
*/
so = inp_inpcbtosocket(tp->t_inpcb);
tdev = toep->tp_toedev; /* blow up here if link was down */
d = TOM_DATA(tdev);
/*
* It's OK if the TID is currently in use, the owning socket may have
* backlogged its last CPL message(s). Just take it away.
*/
toep->tp_tid = tid;
toep->tp_tp = tp;
so_insert_tid(d, toep, tid);
free_atid(cdev, atid);
toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
socket_act_establish(so, m);
inp_wunlock(tp->t_inpcb);
CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
cxgb_log_tcb(cdev->adapter, toep->tp_tid);
return (0);
}
/*
* Process an acknowledgment of WR completion. Advance snd_una and send the
* next batch of work requests from the write queue.
*/
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
struct tcpcb *tp = toep->tp_tp;
struct cpl_wr_ack *hdr = cplhdr(m);
struct socket *so;
unsigned int credits = ntohs(hdr->credits);
u32 snd_una = ntohl(hdr->snd_una);
int bytes = 0;
struct sockbuf *snd;
CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
inp_wlock(tp->t_inpcb);
so = inp_inpcbtosocket(tp->t_inpcb);
toep->tp_wr_avail += credits;
if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
while (credits) {
struct mbuf *p = peek_wr(toep);
if (__predict_false(!p)) {
log(LOG_ERR, "%u WR_ACK credits for TID %u with "
"nothing pending, state %u wr_avail=%u\n",
credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
break;
}
CTR2(KTR_TOM,
"wr_ack: p->credits=%d p->bytes=%d",
p->m_pkthdr.csum_data, p->m_pkthdr.len);
KASSERT(p->m_pkthdr.csum_data != 0,
("empty request still on list"));
if (__predict_false(credits < p->m_pkthdr.csum_data)) {
#if DEBUG_WR > 1
struct tx_data_wr *w = cplhdr(p);
log(LOG_ERR,
"TID %u got %u WR credits, need %u, len %u, "
"main body %u, frags %u, seq # %u, ACK una %u,"
" ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
toep->tp_tid, credits, p->csum, p->len,
p->len - p->data_len, skb_shinfo(p)->nr_frags,
ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
toep->tp_wr_avail, count_pending_wrs(tp) - credits);
#endif
p->m_pkthdr.csum_data -= credits;
break;
} else {
dequeue_wr(toep);
credits -= p->m_pkthdr.csum_data;
bytes += p->m_pkthdr.len;
CTR3(KTR_TOM,
"wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
m_free(p);
}
}
#if DEBUG_WR
check_wr_invariants(tp);
#endif
if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
#if VALIDATE_SEQ
struct tom_data *d = TOM_DATA(TOE_DEV(so));
log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
"for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
toep->tp_tid, tp->snd_una);
#endif
goto out_free;
}
if (tp->snd_una != snd_una) {
tp->snd_una = snd_una;
tp->ts_recent_age = ticks;
#ifdef notyet
/*
* Keep ARP entry "minty fresh"
*/
dst_confirm(sk->sk_dst_cache);
#endif
if (tp->snd_una == tp->snd_nxt)
toep->tp_flags &= ~TP_TX_WAIT_IDLE;
}
snd = so_sockbuf_snd(so);
if (bytes) {
CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
snd = so_sockbuf_snd(so);
sockbuf_lock(snd);
sbdrop_locked(snd, bytes);
so_sowwakeup_locked(so);
}
if (snd->sb_sndptroff < snd->sb_cc)
t3_push_frames(so, 0);
out_free:
inp_wunlock(tp->t_inpcb);
m_free(m);
}
/*
* Handler for TX_DATA_ACK CPL messages.
*/
static int
do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = (struct toepcb *)ctx;
VALIDATE_SOCK(so);
wr_ack(toep, m);
return 0;
}
/*
* Handler for TRACE_PKT CPL messages. Just sink these packets.
*/
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
m_freem(m);
return 0;
}
/*
* Reset a connection that is on a listener's SYN queue or accept queue,
* i.e., one that has not had a struct socket associated with it.
* Must be called from process context.
*
* Modeled after code in inet_csk_listen_stop().
*/
static void
t3_reset_listen_child(struct socket *child)
{
struct tcpcb *tp = so_sototcpcb(child);
t3_send_reset(tp->t_toe);
}
static void
t3_child_disconnect(struct socket *so, void *arg)
{
struct tcpcb *tp = so_sototcpcb(so);
if (tp->t_flags & TF_TOE) {
inp_wlock(tp->t_inpcb);
t3_reset_listen_child(so);
inp_wunlock(tp->t_inpcb);
}
}
/*
* Disconnect offloaded established but not yet accepted connections sitting
* on a server's accept_queue. We just send an ABORT_REQ at this point and
* finish off the disconnect later as we may need to wait for the ABORT_RPL.
*/
void
t3_disconnect_acceptq(struct socket *listen_so)
{
so_lock(listen_so);
so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
so_unlock(listen_so);
}
/*
* Reset offloaded connections sitting on a server's syn queue. As above
* we send ABORT_REQ and finish off when we get ABORT_RPL.
*/
void
t3_reset_synq(struct listen_ctx *lctx)
{
struct toepcb *toep;
so_lock(lctx->lso);
while (!LIST_EMPTY(&lctx->synq_head)) {
toep = LIST_FIRST(&lctx->synq_head);
LIST_REMOVE(toep, synq_entry);
toep->tp_tp = NULL;
t3_send_reset(toep);
cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
toepcb_release(toep);
}
so_unlock(lctx->lso);
}
int
t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
unsigned int nppods, unsigned int tag, unsigned int maxoff,
unsigned int pg_off, unsigned int color)
{
unsigned int i, j, pidx;
struct pagepod *p;
struct mbuf *m;
struct ulp_mem_io *req;
unsigned int tid = toep->tp_tid;
const struct tom_data *td = TOM_DATA(toep->tp_toedev);
unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
gl, nppods, tag, maxoff, pg_off, color);
for (i = 0; i < nppods; ++i) {
m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
req = mtod(m, struct ulp_mem_io *);
m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
req->wr.wr_lo = 0;
req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
V_ULPTX_CMD(ULP_MEM_WRITE));
req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
p = (struct pagepod *)(req + 1);
if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
V_PPOD_COLOR(color));
p->pp_max_offset = htonl(maxoff);
p->pp_page_offset = htonl(pg_off);
p->pp_rsvd = 0;
for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
p->pp_addr[j] = pidx < gl->dgl_nelem ?
htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
} else
p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
send_or_defer(toep, m, 0);
ppod_addr += PPOD_SIZE;
}
return (0);
}
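/*
 * Layout note (a reading of the loop above, not a spec statement): each
 * page pod carries 5 page addresses but consecutive pods advance pidx by
 * only 4, so adjacent pods overlap by one page; the final
 * NUM_SENTINEL_PPODS pods are written with pp_vld_tid == 0 so the HW
 * treats them as invalid sentinels.
 */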
/*
* Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
*/
static inline void
mk_cpl_barrier_ulp(struct cpl_barrier *b)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
b->opcode = CPL_BARRIER;
}
/*
* Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
*/
static inline void
mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
req->cpuno = htons(cpuno);
}
/*
* Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
*/
static inline void
mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
unsigned int word, uint64_t mask, uint64_t val)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
tid, word, mask, val);
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
req->reply = V_NO_REPLY(1);
req->cpu_idx = 0;
req->word = htons(word);
req->mask = htobe64(mask);
req->val = htobe64(val);
}
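/*
 * Illustrative sketch (disabled; not part of the driver): how the
 * mk_*_ulp builders above combine into a single compound BYPASS work
 * request, in the same way t3_cancel_ddpbuf() and t3_overlay_ddpbuf()
 * below do. All identifiers are ones already used elsewhere in this
 * file; the particular field written is arbitrary.
 */
#if 0
static void
example_compound_wr(struct toepcb *toep)
{
unsigned int wrlen;
struct mbuf *m;
struct work_request_hdr *wr;
struct cpl_set_tcb_field *req;
/* One WR header followed by a single SET_TCB_FIELD ULP packet. */
wrlen = sizeof(*wr) + sizeof(*req);
m = m_gethdr_nofail(wrlen);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
wr = mtod(m, struct work_request_hdr *);
bzero(wr, wrlen);
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
m->m_pkthdr.len = m->m_len = wrlen;
/* ULP packets are laid out back to back after the header. */
req = (struct cpl_set_tcb_field *)(wr + 1);
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1), V_TF_DDP_OFF(1));
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
#endif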
/*
* Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
*/
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
unsigned int tid, unsigned int credits)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
V_RX_CREDITS(credits));
}
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
unsigned int wrlen;
struct mbuf *m;
struct work_request_hdr *wr;
struct cpl_barrier *lock;
struct cpl_set_tcb_field *req;
struct cpl_get_tcb *getreq;
struct ddp_state *p = &toep->tp_ddp_state;
#if 0
SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
sizeof(*getreq);
m = m_gethdr_nofail(wrlen);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
wr = mtod(m, struct work_request_hdr *);
bzero(wr, wrlen);
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
m->m_pkthdr.len = m->m_len = wrlen;
lock = (struct cpl_barrier *)(wr + 1);
mk_cpl_barrier_ulp(lock);
req = (struct cpl_set_tcb_field *)(lock + 1);
CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
/* Hmmm, not sure if this is actually a good thing: reactivating
* the other buffer might be an issue if it has been completed
* already. However, that is unlikely, since the fact that the UBUF
* is not completed indicates that there is no outstanding data.
*/
if (bufidx == 0)
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_ACTIVE_BUF(1) |
V_TF_DDP_BUF0_VALID(1),
V_TF_DDP_ACTIVE_BUF(1));
else
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_ACTIVE_BUF(1) |
V_TF_DDP_BUF1_VALID(1), 0);
getreq = (struct cpl_get_tcb *)(req + 1);
mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
/* Keep track of the number of outstanding CPL_GET_TCB requests
*/
p->get_tcb_count++;
#ifdef T3_TRACE
T3_TRACE1(TIDTB(so),
"t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
/**
* t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
* @toep: the toepcb associated with the buffers
* @bufidx: index of HW DDP buffer (0 or 1)
* @tag0: new tag for HW buffer 0
* @tag1: new tag for HW buffer 1
* @len: new length for HW buf @bufidx
*
* Sends a compound WR to overlay a new DDP buffer on top of an existing
* buffer by changing the buffer tag and length and setting the valid and
* active flag accordingly. The caller must ensure the new buffer is at
* least as big as the existing one. Since we typically reprogram both HW
* buffers this function sets both tags for convenience. Read the TCB to
* determine how much data was written into the buffer before the overlay
* took place.
*/
void
t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
unsigned int tag1, unsigned int len)
{
unsigned int wrlen;
struct mbuf *m;
struct work_request_hdr *wr;
struct cpl_get_tcb *getreq;
struct cpl_set_tcb_field *req;
struct ddp_state *p = &toep->tp_ddp_state;
CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
bufidx, tag0, tag1, len);
#if 0
SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
m = m_gethdr_nofail(wrlen);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
wr = mtod(m, struct work_request_hdr *);
m->m_pkthdr.len = m->m_len = wrlen;
bzero(wr, wrlen);
/* Set the ATOMIC flag to make sure that TP processes the following
* CPLs in an atomic manner and no wire segments can be interleaved.
*/
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
req = (struct cpl_set_tcb_field *)(wr + 1);
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
V_TCB_RX_DDP_BUF0_TAG(tag0) |
V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
req++;
if (bufidx == 0) {
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
req++;
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_PUSH_DISABLE_0(1) |
V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
V_TF_DDP_PUSH_DISABLE_0(0) |
V_TF_DDP_BUF0_VALID(1));
} else {
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
req++;
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_PUSH_DISABLE_1(1) |
V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
V_TF_DDP_PUSH_DISABLE_1(0) |
V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
}
getreq = (struct cpl_get_tcb *)(req + 1);
mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
/* Keep track of the number of outstanding CPL_GET_TCB requests
*/
p->get_tcb_count++;
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
"len %d",
bufidx, tag0, tag1, len);
#endif
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
/*
* Sends a compound WR containing all the CPL messages needed to program the
* two HW DDP buffers, namely optionally setting up the length and offset of
* each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
*/
void
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
unsigned int len1, unsigned int offset1,
uint64_t ddp_flags, uint64_t flag_mask, int modulate)
{
unsigned int wrlen;
struct mbuf *m;
struct work_request_hdr *wr;
struct cpl_set_tcb_field *req;
CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
#if 0
SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
(len1 ? sizeof(*req) : 0) +
(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
m = m_gethdr_nofail(wrlen);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
wr = mtod(m, struct work_request_hdr *);
bzero(wr, wrlen);
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
m->m_pkthdr.len = m->m_len = wrlen;
req = (struct cpl_set_tcb_field *)(wr + 1);
if (len0) { /* program buffer 0 offset and length */
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
req++;
}
if (len1) { /* program buffer 1 offset and length */
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
req++;
}
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
ddp_flags);
if (modulate) {
mk_rx_data_ack_ulp(toep,
(struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
toep->tp_copied_seq - toep->tp_rcv_wup);
toep->tp_rcv_wup = toep->tp_copied_seq;
}
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
"modulate %d",
len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
modulate);
#endif
cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}
void
t3_init_wr_tab(unsigned int wr_len)
{
int i;
if (mbuf_wrs[1]) /* already initialized */
return;
for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
int sgl_len = (3 * i) / 2 + (i & 1);
sgl_len += 3;
mbuf_wrs[i] = sgl_len <= wr_len ?
1 : 1 + (sgl_len - 2) / (wr_len - 1);
}
wrlen = wr_len * 8;
}
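/*
 * Worked example for the table above (assuming wr_len == 8 flits): a
 * chain of i buffers needs (3 * i) / 2 + (i & 1) SGL flits plus 3
 * header flits, so i == 1 gives 5 flits and fits in a single WR, while
 * i == 4 gives 9 flits and needs 1 + (9 - 2) / (8 - 1) == 2 WRs.
 */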
int
t3_init_cpl_io(void)
{
#ifdef notyet
tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
if (!tcphdr_skb) {
log(LOG_ERR,
"Chelsio TCP offload: can't allocate sk_buff\n");
return -1;
}
skb_put(tcphdr_skb, sizeof(struct tcphdr));
tcphdr_skb->h.raw = tcphdr_skb->data;
memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif
t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
return (0);
}
Index: head/sys/dev/firewire/firewire.c
===================================================================
--- head/sys/dev/firewire/firewire.c (revision 183549)
+++ head/sys/dev/firewire/firewire.c (revision 183550)
@@ -1,2285 +1,2285 @@
/*-
* Copyright (c) 2003 Hidetoshi Shimokawa
* Copyright (c) 1998-2002 Katsushi Kobayashi and Hidetoshi Shimokawa
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the acknowledgement as bellow:
*
* This product includes software developed by K. Kobayashi and H. Shimokawa
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <sys/vimage.h>
#include <sys/kdb.h>
#if defined(__DragonFly__) || __FreeBSD_version < 500000
#include <machine/clock.h> /* for DELAY() */
#endif
#include <sys/bus.h> /* used by smbus and newbus */
#include <machine/bus.h>
#ifdef __DragonFly__
#include "firewire.h"
#include "firewirereg.h"
#include "fwmem.h"
#include "iec13213.h"
#include "iec68113.h"
#else
#include <dev/firewire/firewire.h>
#include <dev/firewire/firewirereg.h>
#include <dev/firewire/fwmem.h>
#include <dev/firewire/iec13213.h>
#include <dev/firewire/iec68113.h>
#endif
struct crom_src_buf {
struct crom_src src;
struct crom_chunk root;
struct crom_chunk vendor;
struct crom_chunk hw;
};
int firewire_debug=0, try_bmr=1, hold_count=3;
SYSCTL_INT(_debug, OID_AUTO, firewire_debug, CTLFLAG_RW, &firewire_debug, 0,
"FireWire driver debug flag");
SYSCTL_NODE(_hw, OID_AUTO, firewire, CTLFLAG_RD, 0, "FireWire Subsystem");
SYSCTL_INT(_hw_firewire, OID_AUTO, try_bmr, CTLFLAG_RW, &try_bmr, 0,
"Try to be a bus manager");
SYSCTL_INT(_hw_firewire, OID_AUTO, hold_count, CTLFLAG_RW, &hold_count, 0,
"Number of count of bus resets for removing lost device information");
MALLOC_DEFINE(M_FW, "firewire", "FireWire");
MALLOC_DEFINE(M_FWXFER, "fw_xfer", "XFER/FireWire");
#define FW_MAXASYRTY 4
devclass_t firewire_devclass;
static void firewire_identify (driver_t *, device_t);
static int firewire_probe (device_t);
static int firewire_attach (device_t);
static int firewire_detach (device_t);
static int firewire_resume (device_t);
static void firewire_xfer_timeout(void *, int);
#if 0
static int firewire_shutdown (device_t);
#endif
static device_t firewire_add_child (device_t, int, const char *, int);
static void fw_try_bmr (void *);
static void fw_try_bmr_callback (struct fw_xfer *);
static void fw_asystart (struct fw_xfer *);
static int fw_get_tlabel (struct firewire_comm *, struct fw_xfer *);
static void fw_bus_probe (struct firewire_comm *);
static void fw_attach_dev (struct firewire_comm *);
static void fw_bus_probe_thread(void *);
#ifdef FW_VMACCESS
static void fw_vmaccess (struct fw_xfer *);
#endif
static int fw_bmr (struct firewire_comm *);
static void fw_dump_hdr(struct fw_pkt *, char *);
static device_method_t firewire_methods[] = {
/* Device interface */
DEVMETHOD(device_identify, firewire_identify),
DEVMETHOD(device_probe, firewire_probe),
DEVMETHOD(device_attach, firewire_attach),
DEVMETHOD(device_detach, firewire_detach),
DEVMETHOD(device_suspend, bus_generic_suspend),
DEVMETHOD(device_resume, firewire_resume),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
/* Bus interface */
DEVMETHOD(bus_add_child, firewire_add_child),
DEVMETHOD(bus_print_child, bus_generic_print_child),
{ 0, 0 }
};
char *linkspeed[] = {
"S100", "S200", "S400", "S800",
"S1600", "S3200", "undef", "undef"
};
static char *tcode_str[] = {
"WREQQ", "WREQB", "WRES", "undef",
"RREQQ", "RREQB", "RRESQ", "RRESB",
"CYCS", "LREQ", "STREAM", "LRES",
"undef", "undef", "PHY", "undef"
};
/* IEEE-1394a Table C-2 Gap count as a function of hops */
#define MAX_GAPHOP 15
u_int gap_cnt[] = { 5, 5, 7, 8, 10, 13, 16, 18,
21, 24, 26, 29, 32, 35, 37, 40};
static driver_t firewire_driver = {
"firewire",
firewire_methods,
sizeof(struct firewire_softc),
};
/*
* Lookup fwdev by node id.
*/
struct fw_device *
fw_noderesolve_nodeid(struct firewire_comm *fc, int dst)
{
struct fw_device *fwdev;
int s;
s = splfw();
STAILQ_FOREACH(fwdev, &fc->devices, link)
if (fwdev->dst == dst && fwdev->status != FWDEVINVAL)
break;
splx(s);
return fwdev;
}
/*
* Lookup fwdev by EUI64.
*/
struct fw_device *
fw_noderesolve_eui64(struct firewire_comm *fc, struct fw_eui64 *eui)
{
struct fw_device *fwdev;
int s;
s = splfw();
FW_GLOCK(fc);
STAILQ_FOREACH(fwdev, &fc->devices, link)
if (FW_EUI64_EQUAL(fwdev->eui, *eui))
break;
FW_GUNLOCK(fc);
splx(s);
if(fwdev == NULL) return NULL;
if(fwdev->status == FWDEVINVAL) return NULL;
return fwdev;
}
/*
* Async. request procedure for userland application.
*/
int
fw_asyreq(struct firewire_comm *fc, int sub, struct fw_xfer *xfer)
{
int err = 0;
struct fw_xferq *xferq;
int len;
struct fw_pkt *fp;
int tcode;
struct tcode_info *info;
if(xfer == NULL) return EINVAL;
if(xfer->hand == NULL){
printf("hand == NULL\n");
return EINVAL;
}
fp = &xfer->send.hdr;
tcode = fp->mode.common.tcode & 0xf;
info = &fc->tcode[tcode];
if (info->flag == 0) {
printf("invalid tcode=%x\n", tcode);
return EINVAL;
}
/* XXX allow bus explore packets only after bus reset */
if ((fc->status < FWBUSEXPLORE) &&
((tcode != FWTCODE_RREQQ) || (fp->mode.rreqq.dest_hi != 0xffff) ||
(fp->mode.rreqq.dest_lo < 0xf0000000) ||
(fp->mode.rreqq.dest_lo >= 0xf0001000))) {
xfer->resp = EAGAIN;
xfer->flag = FWXF_BUSY;
return (EAGAIN);
}
if (info->flag & FWTI_REQ)
xferq = fc->atq;
else
xferq = fc->ats;
len = info->hdr_len;
if (xfer->send.pay_len > MAXREC(fc->maxrec)) {
printf("send.pay_len > maxrec\n");
return EINVAL;
}
if (info->flag & FWTI_BLOCK_STR)
len = fp->mode.stream.len;
else if (info->flag & FWTI_BLOCK_ASY)
len = fp->mode.rresb.len;
else
len = 0;
if (len != xfer->send.pay_len){
printf("len(%d) != send.pay_len(%d) %s(%x)\n",
len, xfer->send.pay_len, tcode_str[tcode], tcode);
return EINVAL;
}
if(xferq->start == NULL){
printf("xferq->start == NULL\n");
return EINVAL;
}
if(!(xferq->queued < xferq->maxq)){
device_printf(fc->bdev, "Discard a packet (queued=%d)\n",
xferq->queued);
return EAGAIN;
}
xfer->tl = -1;
if (info->flag & FWTI_TLABEL) {
if (fw_get_tlabel(fc, xfer) < 0)
return EAGAIN;
}
xfer->resp = 0;
xfer->fc = fc;
xfer->q = xferq;
fw_asystart(xfer);
return err;
}
/*
* Wake up a blocked process.
*/
void
fw_xferwake(struct fw_xfer *xfer)
{
struct mtx *lock = &xfer->fc->wait_lock;
mtx_lock(lock);
xfer->flag |= FWXF_WAKE;
mtx_unlock(lock);
wakeup(xfer);
return;
}
int
fw_xferwait(struct fw_xfer *xfer)
{
struct mtx *lock = &xfer->fc->wait_lock;
int err = 0;
mtx_lock(lock);
if ((xfer->flag & FWXF_WAKE) == 0)
err = msleep((void *)xfer, lock, PWAIT|PCATCH, "fw_xferwait", 0);
mtx_unlock(lock);
return (err);
}
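/*
 * Typical caller pattern for the pair above (a sketch, disabled; the
 * real users live elsewhere in the tree): submit an async request and
 * block until the completion handler calls fw_xferwake().
 */
#if 0
if (fw_asyreq(fc, -1, xfer) == 0) {
fw_xferwait(xfer); /* sleeps until FWXF_WAKE is set */
if (xfer->resp != 0)
printf("transaction failed: %d\n", xfer->resp);
}
#endif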
/*
* Async. request with given xfer structure.
*/
static void
fw_asystart(struct fw_xfer *xfer)
{
struct firewire_comm *fc = xfer->fc;
int s;
s = splfw();
/* Protect from interrupt/timeout */
FW_GLOCK(fc);
xfer->flag = FWXF_INQ;
STAILQ_INSERT_TAIL(&xfer->q->q, xfer, link);
#if 0
xfer->q->queued ++;
#endif
FW_GUNLOCK(fc);
splx(s);
/* XXX just queue for mbuf */
if (xfer->mbuf == NULL)
xfer->q->start(fc);
return;
}
static void
firewire_identify(driver_t *driver, device_t parent)
{
BUS_ADD_CHILD(parent, 0, "firewire", -1);
}
static int
firewire_probe(device_t dev)
{
device_set_desc(dev, "IEEE1394(FireWire) bus");
return (0);
}
static void
firewire_xfer_timeout(void *arg, int pending)
{
struct firewire_comm *fc = (struct firewire_comm *)arg;
struct fw_xfer *xfer, *txfer;
struct timeval tv;
struct timeval split_timeout;
STAILQ_HEAD(, fw_xfer) xfer_timeout;
int i, s;
split_timeout.tv_sec = 0;
split_timeout.tv_usec = 200 * 1000; /* 200 msec */
microtime(&tv);
timevalsub(&tv, &split_timeout);
STAILQ_INIT(&xfer_timeout);
s = splfw();
mtx_lock(&fc->tlabel_lock);
for (i = 0; i < 0x40; i ++) {
while ((xfer = STAILQ_FIRST(&fc->tlabels[i])) != NULL) {
if ((xfer->flag & FWXF_SENT) == 0)
/* not sent yet */
break;
if (timevalcmp(&xfer->tv, &tv, >))
/* the rest are newer than this */
break;
device_printf(fc->bdev,
"split transaction timeout: "
"tl=0x%x flag=0x%02x\n", i, xfer->flag);
fw_dump_hdr(&xfer->send.hdr, "send");
xfer->resp = ETIMEDOUT;
STAILQ_REMOVE_HEAD(&fc->tlabels[i], tlabel);
STAILQ_INSERT_TAIL(&xfer_timeout, xfer, tlabel);
}
}
mtx_unlock(&fc->tlabel_lock);
splx(s);
fc->timeout(fc);
STAILQ_FOREACH_SAFE(xfer, &xfer_timeout, tlabel, txfer)
xfer->hand(xfer);
}
#define WATCHDOG_HZ 10
static void
firewire_watchdog(void *arg)
{
struct firewire_comm *fc;
static int watchdog_clock = 0;
fc = (struct firewire_comm *)arg;
/*
* At boot the device interrupt is disabled and we would
* encounter timeouts easily. To avoid this, ignore the
* clock interrupt for a while.
*/
if (watchdog_clock > WATCHDOG_HZ * 15)
taskqueue_enqueue(fc->taskqueue, &fc->task_timeout);
else
watchdog_clock ++;
callout_reset(&fc->timeout_callout, hz / WATCHDOG_HZ,
(void *)firewire_watchdog, (void *)fc);
}
/*
* The attach routine.
*/
static int
firewire_attach(device_t dev)
{
int unit;
struct firewire_softc *sc = device_get_softc(dev);
device_t pa = device_get_parent(dev);
struct firewire_comm *fc;
fc = (struct firewire_comm *)device_get_softc(pa);
sc->fc = fc;
fc->status = FWBUSNOTREADY;
unit = device_get_unit(dev);
if( fc->nisodma > FWMAXNDMA) fc->nisodma = FWMAXNDMA;
fwdev_makedev(sc);
mtx_init(&fc->wait_lock, "fwwait", NULL, MTX_DEF);
mtx_init(&fc->tlabel_lock, "fwtlabel", NULL, MTX_DEF);
CALLOUT_INIT(&fc->timeout_callout);
CALLOUT_INIT(&fc->bmr_callout);
CALLOUT_INIT(&fc->busprobe_callout);
TASK_INIT(&fc->task_timeout, 0, firewire_xfer_timeout, (void *)fc);
callout_reset(&sc->fc->timeout_callout, hz,
(void *)firewire_watchdog, (void *)sc->fc);
/* create thread */
kproc_create(fw_bus_probe_thread, (void *)fc, &fc->probe_thread,
0, 0, "fw%d_probe", unit);
/* Locate our children */
bus_generic_probe(dev);
/* launch attachment of the added children */
bus_generic_attach(dev);
/* bus_reset */
fw_busreset(fc, FWBUSNOTREADY);
fc->ibr(fc);
return 0;
}
/*
* Attach it as a child.
*/
static device_t
firewire_add_child(device_t dev, int order, const char *name, int unit)
{
device_t child;
struct firewire_softc *sc;
sc = (struct firewire_softc *)device_get_softc(dev);
child = device_add_child(dev, name, unit);
if (child) {
device_set_ivars(child, sc->fc);
device_probe_and_attach(child);
}
return child;
}
static int
firewire_resume(device_t dev)
{
struct firewire_softc *sc;
sc = (struct firewire_softc *)device_get_softc(dev);
sc->fc->status = FWBUSNOTREADY;
bus_generic_resume(dev);
return(0);
}
/*
* Detach it.
*/
static int
firewire_detach(device_t dev)
{
struct firewire_softc *sc;
struct firewire_comm *fc;
struct fw_device *fwdev, *fwdev_next;
int err;
sc = (struct firewire_softc *)device_get_softc(dev);
fc = sc->fc;
mtx_lock(&fc->wait_lock);
fc->status = FWBUSDETACH;
wakeup(fc);
if (msleep(fc->probe_thread, &fc->wait_lock, PWAIT, "fwthr", hz * 60))
printf("firewire probe thread didn't die\n");
mtx_unlock(&fc->wait_lock);
if (fc->arq !=0 && fc->arq->maxq > 0)
fw_drain_txq(fc);
if ((err = fwdev_destroydev(sc)) != 0)
return err;
if ((err = bus_generic_detach(dev)) != 0)
return err;
callout_stop(&fc->timeout_callout);
callout_stop(&fc->bmr_callout);
callout_stop(&fc->busprobe_callout);
/* XXX xfer_free and untimeout on all xfers */
for (fwdev = STAILQ_FIRST(&fc->devices); fwdev != NULL;
fwdev = fwdev_next) {
fwdev_next = STAILQ_NEXT(fwdev, link);
free(fwdev, M_FW);
}
free(fc->topology_map, M_FW);
free(fc->speed_map, M_FW);
free(fc->crom_src_buf, M_FW);
mtx_destroy(&fc->tlabel_lock);
mtx_destroy(&fc->wait_lock);
return(0);
}
#if 0
static int
firewire_shutdown( device_t dev )
{
return 0;
}
#endif
static void
fw_xferq_drain(struct fw_xferq *xferq)
{
struct fw_xfer *xfer;
while ((xfer = STAILQ_FIRST(&xferq->q)) != NULL) {
STAILQ_REMOVE_HEAD(&xferq->q, link);
#if 0
xferq->queued --;
#endif
xfer->resp = EAGAIN;
xfer->flag = FWXF_SENTERR;
fw_xfer_done(xfer);
}
}
void
fw_drain_txq(struct firewire_comm *fc)
{
struct fw_xfer *xfer, *txfer;
STAILQ_HEAD(, fw_xfer) xfer_drain;
int i;
STAILQ_INIT(&xfer_drain);
FW_GLOCK(fc);
fw_xferq_drain(fc->atq);
fw_xferq_drain(fc->ats);
for(i = 0; i < fc->nisodma; i++)
fw_xferq_drain(fc->it[i]);
FW_GUNLOCK(fc);
mtx_lock(&fc->tlabel_lock);
for (i = 0; i < 0x40; i ++)
while ((xfer = STAILQ_FIRST(&fc->tlabels[i])) != NULL) {
if (firewire_debug)
printf("tl=%d flag=%d\n", i, xfer->flag);
xfer->resp = EAGAIN;
STAILQ_REMOVE_HEAD(&fc->tlabels[i], tlabel);
STAILQ_INSERT_TAIL(&xfer_drain, xfer, tlabel);
}
mtx_unlock(&fc->tlabel_lock);
STAILQ_FOREACH_SAFE(xfer, &xfer_drain, tlabel, txfer)
xfer->hand(xfer);
}
static void
fw_reset_csr(struct firewire_comm *fc)
{
int i;
CSRARC(fc, STATE_CLEAR)
= 1 << 23 | 0 << 17 | 1 << 16 | 1 << 15 | 1 << 14 ;
CSRARC(fc, STATE_SET) = CSRARC(fc, STATE_CLEAR);
CSRARC(fc, NODE_IDS) = 0x3f;
CSRARC(fc, TOPO_MAP + 8) = 0;
fc->irm = -1;
fc->max_node = -1;
for(i = 2; i < 0x100/4 - 2 ; i++){
CSRARC(fc, SPED_MAP + i * 4) = 0;
}
CSRARC(fc, STATE_CLEAR) = 1 << 23 | 0 << 17 | 1 << 16 | 1 << 15 | 1 << 14 ;
CSRARC(fc, STATE_SET) = CSRARC(fc, STATE_CLEAR);
CSRARC(fc, RESET_START) = 0;
CSRARC(fc, SPLIT_TIMEOUT_HI) = 0;
CSRARC(fc, SPLIT_TIMEOUT_LO) = 800 << 19;
CSRARC(fc, CYCLE_TIME) = 0x0;
CSRARC(fc, BUS_TIME) = 0x0;
CSRARC(fc, BUS_MGR_ID) = 0x3f;
CSRARC(fc, BANDWIDTH_AV) = 4915;
CSRARC(fc, CHANNELS_AV_HI) = 0xffffffff;
CSRARC(fc, CHANNELS_AV_LO) = 0xffffffff;
CSRARC(fc, IP_CHANNELS) = (1 << 31);
CSRARC(fc, CONF_ROM) = 0x04 << 24;
CSRARC(fc, CONF_ROM + 4) = 0x31333934; /* the ASCII string "1394" */
CSRARC(fc, CONF_ROM + 8) = 1 << 31 | 1 << 30 | 1 << 29 |
1 << 28 | 0xff << 16 | 0x09 << 8;
CSRARC(fc, CONF_ROM + 0xc) = 0;
/* DV-dependent CSRs; see blue book */
CSRARC(fc, oPCR) &= ~DV_BROADCAST_ON;
CSRARC(fc, iPCR) &= ~DV_BROADCAST_ON;
CSRARC(fc, STATE_CLEAR) &= ~(1 << 23 | 1 << 15 | 1 << 14 );
CSRARC(fc, STATE_SET) = CSRARC(fc, STATE_CLEAR);
}
static void
fw_init_crom(struct firewire_comm *fc)
{
struct crom_src *src;
fc->crom_src_buf = (struct crom_src_buf *)
malloc(sizeof(struct crom_src_buf), M_FW, M_WAITOK | M_ZERO);
if (fc->crom_src_buf == NULL)
return;
src = &fc->crom_src_buf->src;
bzero(src, sizeof(struct crom_src));
/* BUS info sample */
src->hdr.info_len = 4;
src->businfo.bus_name = CSR_BUS_NAME_IEEE1394;
src->businfo.irmc = 1;
src->businfo.cmc = 1;
src->businfo.isc = 1;
src->businfo.bmc = 1;
src->businfo.pmc = 0;
src->businfo.cyc_clk_acc = 100;
src->businfo.max_rec = fc->maxrec;
src->businfo.max_rom = MAXROM_4;
src->businfo.generation = 1;
src->businfo.link_spd = fc->speed;
src->businfo.eui64.hi = fc->eui.hi;
src->businfo.eui64.lo = fc->eui.lo;
STAILQ_INIT(&src->chunk_list);
fc->crom_src = src;
fc->crom_root = &fc->crom_src_buf->root;
}
static void
fw_reset_crom(struct firewire_comm *fc)
{
struct crom_src_buf *buf;
struct crom_src *src;
struct crom_chunk *root;
if (fc->crom_src_buf == NULL)
fw_init_crom(fc);
buf = fc->crom_src_buf;
src = fc->crom_src;
root = fc->crom_root;
STAILQ_INIT(&src->chunk_list);
bzero(root, sizeof(struct crom_chunk));
crom_add_chunk(src, NULL, root, 0);
crom_add_entry(root, CSRKEY_NCAP, 0x0083c0); /* XXX */
/* private company_id */
crom_add_entry(root, CSRKEY_VENDOR, CSRVAL_VENDOR_PRIVATE);
#ifdef __DragonFly__
crom_add_simple_text(src, root, &buf->vendor, "DragonFly Project");
crom_add_entry(root, CSRKEY_HW, __DragonFly_cc_version);
#else
crom_add_simple_text(src, root, &buf->vendor, "FreeBSD Project");
crom_add_entry(root, CSRKEY_HW, __FreeBSD_version);
#endif
- crom_add_simple_text(src, root, &buf->hw, V_hostname);
+ crom_add_simple_text(src, root, &buf->hw, G_hostname);
}
/*
* Called after bus reset.
*/
void
fw_busreset(struct firewire_comm *fc, uint32_t new_status)
{
struct firewire_dev_comm *fdc;
struct crom_src *src;
device_t *devlistp;
void *newrom;
int i, devcnt;
switch(fc->status){
case FWBUSMGRELECT:
callout_stop(&fc->bmr_callout);
break;
default:
break;
}
fc->status = new_status;
fw_reset_csr(fc);
fw_reset_crom(fc);
if (device_get_children(fc->bdev, &devlistp, &devcnt) == 0) {
for( i = 0 ; i < devcnt ; i++)
if (device_get_state(devlistp[i]) >= DS_ATTACHED) {
fdc = device_get_softc(devlistp[i]);
if (fdc->post_busreset != NULL)
fdc->post_busreset(fdc);
}
free(devlistp, M_TEMP);
}
newrom = malloc(CROMSIZE, M_FW, M_NOWAIT | M_ZERO);
src = &fc->crom_src_buf->src;
crom_load(src, (uint32_t *)newrom, CROMSIZE);
if (bcmp(newrom, fc->config_rom, CROMSIZE) != 0) {
/* bump generation and reload */
src->businfo.generation ++;
/* generation must be between 0x2 and 0xF */
if (src->businfo.generation < 2)
src->businfo.generation ++;
crom_load(src, (uint32_t *)newrom, CROMSIZE);
bcopy(newrom, (void *)fc->config_rom, CROMSIZE);
}
free(newrom, M_FW);
}
/* Call once after reboot */
void fw_init(struct firewire_comm *fc)
{
int i;
#ifdef FW_VMACCESS
struct fw_xfer *xfer;
struct fw_bind *fwb;
#endif
fc->arq->queued = 0;
fc->ars->queued = 0;
fc->atq->queued = 0;
fc->ats->queued = 0;
fc->arq->buf = NULL;
fc->ars->buf = NULL;
fc->atq->buf = NULL;
fc->ats->buf = NULL;
fc->arq->flag = 0;
fc->ars->flag = 0;
fc->atq->flag = 0;
fc->ats->flag = 0;
STAILQ_INIT(&fc->atq->q);
STAILQ_INIT(&fc->ats->q);
for( i = 0 ; i < fc->nisodma ; i ++ ){
fc->it[i]->queued = 0;
fc->ir[i]->queued = 0;
fc->it[i]->start = NULL;
fc->ir[i]->start = NULL;
fc->it[i]->buf = NULL;
fc->ir[i]->buf = NULL;
fc->it[i]->flag = FWXFERQ_STREAM;
fc->ir[i]->flag = FWXFERQ_STREAM;
STAILQ_INIT(&fc->it[i]->q);
STAILQ_INIT(&fc->ir[i]->q);
}
fc->arq->maxq = FWMAXQUEUE;
fc->ars->maxq = FWMAXQUEUE;
fc->atq->maxq = FWMAXQUEUE;
fc->ats->maxq = FWMAXQUEUE;
for( i = 0 ; i < fc->nisodma ; i++){
fc->ir[i]->maxq = FWMAXQUEUE;
fc->it[i]->maxq = FWMAXQUEUE;
}
/* Initialize csr registers */
fc->topology_map = (struct fw_topology_map *)malloc(
sizeof(struct fw_topology_map),
M_FW, M_NOWAIT | M_ZERO);
fc->speed_map = (struct fw_speed_map *)malloc(
sizeof(struct fw_speed_map),
M_FW, M_NOWAIT | M_ZERO);
CSRARC(fc, TOPO_MAP) = 0x3f1 << 16;
CSRARC(fc, TOPO_MAP + 4) = 1;
CSRARC(fc, SPED_MAP) = 0x3f1 << 16;
CSRARC(fc, SPED_MAP + 4) = 1;
STAILQ_INIT(&fc->devices);
/* Initialize Async handlers */
STAILQ_INIT(&fc->binds);
for( i = 0 ; i < 0x40 ; i++){
STAILQ_INIT(&fc->tlabels[i]);
}
/* DV depend CSRs see blue book */
#if 0
CSRARC(fc, oMPR) = 0x3fff0001; /* # output channel = 1 */
CSRARC(fc, oPCR) = 0x8000007a;
for(i = 4 ; i < 0x7c/4 ; i+=4){
CSRARC(fc, i + oPCR) = 0x8000007a;
}
CSRARC(fc, iMPR) = 0x00ff0001; /* # input channel = 1 */
CSRARC(fc, iPCR) = 0x803f0000;
for(i = 4 ; i < 0x7c/4 ; i+=4){
CSRARC(fc, i + iPCR) = 0x0;
}
#endif
fc->crom_src_buf = NULL;
#ifdef FW_VMACCESS
xfer = fw_xfer_alloc();
if(xfer == NULL) return;
fwb = (struct fw_bind *)malloc(sizeof (struct fw_bind), M_FW, M_NOWAIT);
if(fwb == NULL){
fw_xfer_free(xfer);
return;
}
xfer->hand = fw_vmaccess;
xfer->fc = fc;
xfer->sc = NULL;
fwb->start_hi = 0x2;
fwb->start_lo = 0;
fwb->addrlen = 0xffffffff;
fwb->xfer = xfer;
fw_bindadd(fc, fwb);
#endif
}
#define BIND_CMP(addr, fwb) (((addr) < (fwb)->start)?-1:\
((fwb)->end < (addr))?1:0)
/*
* To look up the handler bound to an IEEE1394 address.
*/
struct fw_bind *
fw_bindlookup(struct firewire_comm *fc, uint16_t dest_hi, uint32_t dest_lo)
{
u_int64_t addr;
struct fw_bind *tfw, *r = NULL;
addr = ((u_int64_t)dest_hi << 32) | dest_lo;
FW_GLOCK(fc);
STAILQ_FOREACH(tfw, &fc->binds, fclist)
if (BIND_CMP(addr, tfw) == 0) {
r = tfw;
break;
}
FW_GUNLOCK(fc);
return(r);
}
/*
* To bind an IEEE1394 address block to a handler.
*/
int
fw_bindadd(struct firewire_comm *fc, struct fw_bind *fwb)
{
struct fw_bind *tfw, *prev = NULL;
int r = 0;
if (fwb->start > fwb->end) {
printf("%s: invalid range\n", __func__);
return EINVAL;
}
FW_GLOCK(fc);
STAILQ_FOREACH(tfw, &fc->binds, fclist) {
if (fwb->end < tfw->start)
break;
prev = tfw;
}
if (prev == NULL)
STAILQ_INSERT_HEAD(&fc->binds, fwb, fclist);
else if (prev->end < fwb->start)
STAILQ_INSERT_AFTER(&fc->binds, prev, fwb, fclist);
else {
printf("%s: bind failed\n", __func__);
r = EBUSY;
}
FW_GUNLOCK(fc);
return (r);
}
/*
* To free IEEE1394 address block.
*/
int
fw_bindremove(struct firewire_comm *fc, struct fw_bind *fwb)
{
#if 0
struct fw_xfer *xfer, *next;
#endif
struct fw_bind *tfw;
int s;
s = splfw();
FW_GLOCK(fc);
STAILQ_FOREACH(tfw, &fc->binds, fclist)
if (tfw == fwb) {
STAILQ_REMOVE(&fc->binds, fwb, fw_bind, fclist);
goto found;
}
printf("%s: no such binding\n", __func__);
FW_GUNLOCK(fc);
splx(s);
return (1);
found:
#if 0
/* shall we do this? */
for (xfer = STAILQ_FIRST(&fwb->xferlist); xfer != NULL; xfer = next) {
next = STAILQ_NEXT(xfer, link);
fw_xfer_free(xfer);
}
STAILQ_INIT(&fwb->xferlist);
#endif
FW_GUNLOCK(fc);
splx(s);
return 0;
}
int
fw_xferlist_add(struct fw_xferlist *q, struct malloc_type *type,
int slen, int rlen, int n,
struct firewire_comm *fc, void *sc, void (*hand)(struct fw_xfer *))
{
int i, s;
struct fw_xfer *xfer;
for (i = 0; i < n; i++) {
xfer = fw_xfer_alloc_buf(type, slen, rlen);
if (xfer == NULL)
return (n);
xfer->fc = fc;
xfer->sc = sc;
xfer->hand = hand;
s = splfw();
STAILQ_INSERT_TAIL(q, xfer, link);
splx(s);
}
return (n);
}
void
fw_xferlist_remove(struct fw_xferlist *q)
{
struct fw_xfer *xfer, *next;
for (xfer = STAILQ_FIRST(q); xfer != NULL; xfer = next) {
next = STAILQ_NEXT(xfer, link);
fw_xfer_free_buf(xfer);
}
STAILQ_INIT(q);
}
/*
* dump packet header
*/
static void
fw_dump_hdr(struct fw_pkt *fp, char *prefix)
{
printf("%s: dst=0x%02x tl=0x%02x rt=%d tcode=0x%x pri=0x%x "
"src=0x%03x\n", prefix,
fp->mode.hdr.dst & 0x3f,
fp->mode.hdr.tlrt >> 2, fp->mode.hdr.tlrt & 3,
fp->mode.hdr.tcode, fp->mode.hdr.pri,
fp->mode.hdr.src);
}
/*
* To free transaction label.
*/
static void
fw_tl_free(struct firewire_comm *fc, struct fw_xfer *xfer)
{
struct fw_xfer *txfer;
int s;
if (xfer->tl < 0)
return;
s = splfw();
mtx_lock(&fc->tlabel_lock);
#if 1 /* make sure the label is allocated */
STAILQ_FOREACH(txfer, &fc->tlabels[xfer->tl], tlabel)
if(txfer == xfer)
break;
if (txfer == NULL) {
printf("%s: the xfer is not in the queue "
"(tlabel=%d, flag=0x%x)\n",
__FUNCTION__, xfer->tl, xfer->flag);
fw_dump_hdr(&xfer->send.hdr, "send");
fw_dump_hdr(&xfer->recv.hdr, "recv");
kdb_backtrace();
mtx_unlock(&fc->tlabel_lock);
splx(s);
return;
}
#endif
STAILQ_REMOVE(&fc->tlabels[xfer->tl], xfer, fw_xfer, tlabel);
mtx_unlock(&fc->tlabel_lock);
splx(s);
return;
}
/*
* To obtain XFER structure by transaction label.
*/
static struct fw_xfer *
fw_tl2xfer(struct firewire_comm *fc, int node, int tlabel, int tcode)
{
struct fw_xfer *xfer;
int s = splfw();
int req;
mtx_lock(&fc->tlabel_lock);
STAILQ_FOREACH(xfer, &fc->tlabels[tlabel], tlabel)
if(xfer->send.hdr.mode.hdr.dst == node) {
mtx_unlock(&fc->tlabel_lock);
splx(s);
KASSERT(xfer->tl == tlabel,
("xfer->tl 0x%x != 0x%x", xfer->tl, tlabel));
/* extra sanity check */
req = xfer->send.hdr.mode.hdr.tcode;
if (xfer->fc->tcode[req].valid_res != tcode) {
printf("%s: invalid response tcode "
"(0x%x for 0x%x)\n", __FUNCTION__,
tcode, req);
return(NULL);
}
if (firewire_debug > 2)
printf("fw_tl2xfer: found tl=%d\n", tlabel);
return(xfer);
}
mtx_unlock(&fc->tlabel_lock);
if (firewire_debug > 1)
printf("fw_tl2xfer: not found tl=%d\n", tlabel);
splx(s);
return(NULL);
}
/*
* To allocate IEEE1394 XFER structure.
*/
struct fw_xfer *
fw_xfer_alloc(struct malloc_type *type)
{
struct fw_xfer *xfer;
xfer = malloc(sizeof(struct fw_xfer), type, M_NOWAIT | M_ZERO);
if (xfer == NULL)
return xfer;
xfer->malloc = type;
return xfer;
}
struct fw_xfer *
fw_xfer_alloc_buf(struct malloc_type *type, int send_len, int recv_len)
{
struct fw_xfer *xfer;
xfer = fw_xfer_alloc(type);
if (xfer == NULL)
return(NULL);
xfer->send.pay_len = send_len;
xfer->recv.pay_len = recv_len;
if (send_len > 0) {
xfer->send.payload = malloc(send_len, type, M_NOWAIT | M_ZERO);
if (xfer->send.payload == NULL) {
fw_xfer_free(xfer);
return(NULL);
}
}
if (recv_len > 0) {
xfer->recv.payload = malloc(recv_len, type, M_NOWAIT);
if (xfer->recv.payload == NULL) {
if (xfer->send.payload != NULL)
free(xfer->send.payload, type);
fw_xfer_free(xfer);
return(NULL);
}
}
return(xfer);
}
/*
* IEEE1394 XFER post process.
*/
void
fw_xfer_done(struct fw_xfer *xfer)
{
if (xfer->hand == NULL) {
printf("hand == NULL\n");
return;
}
if (xfer->fc == NULL)
panic("fw_xfer_done: why xfer->fc is NULL?");
fw_tl_free(xfer->fc, xfer);
xfer->hand(xfer);
}
void
fw_xfer_unload(struct fw_xfer* xfer)
{
int s;
if(xfer == NULL ) return;
if(xfer->flag & FWXF_INQ){
printf("fw_xfer_free FWXF_INQ\n");
s = splfw();
FW_GLOCK(xfer->fc);
STAILQ_REMOVE(&xfer->q->q, xfer, fw_xfer, link);
#if 0
xfer->q->queued --;
#endif
FW_GUNLOCK(xfer->fc);
splx(s);
}
if (xfer->fc != NULL) {
#if 1
if(xfer->flag & FWXF_START)
/*
* This could happen if:
* 1. We call fwohci_arcv() before fwohci_txd().
* 2. firewire_watch() is called.
*/
printf("fw_xfer_free FWXF_START\n");
#endif
}
xfer->flag = FWXF_INIT;
xfer->resp = 0;
}
/*
* To free IEEE1394 XFER structure.
*/
void
fw_xfer_free_buf( struct fw_xfer* xfer)
{
if (xfer == NULL) {
printf("%s: xfer == NULL\n", __func__);
return;
}
fw_xfer_unload(xfer);
if(xfer->send.payload != NULL){
free(xfer->send.payload, xfer->malloc);
}
if(xfer->recv.payload != NULL){
free(xfer->recv.payload, xfer->malloc);
}
free(xfer, xfer->malloc);
}
void
fw_xfer_free( struct fw_xfer* xfer)
{
if (xfer == NULL) {
printf("%s: xfer == NULL\n", __func__);
return;
}
fw_xfer_unload(xfer);
free(xfer, xfer->malloc);
}
void
fw_asy_callback_free(struct fw_xfer *xfer)
{
#if 0
printf("asyreq done flag=0x%02x resp=%d\n",
xfer->flag, xfer->resp);
#endif
fw_xfer_free(xfer);
}
/*
* To configure PHY.
*/
static void
fw_phy_config(struct firewire_comm *fc, int root_node, int gap_count)
{
struct fw_xfer *xfer;
struct fw_pkt *fp;
fc->status = FWBUSPHYCONF;
xfer = fw_xfer_alloc(M_FWXFER);
if (xfer == NULL)
return;
xfer->fc = fc;
xfer->hand = fw_asy_callback_free;
fp = &xfer->send.hdr;
fp->mode.ld[1] = 0;
if (root_node >= 0)
fp->mode.ld[1] |= (root_node & 0x3f) << 24 | 1 << 23;
if (gap_count >= 0)
fp->mode.ld[1] |= 1 << 22 | (gap_count & 0x3f) << 16;
fp->mode.ld[2] = ~fp->mode.ld[1];
/* XXX Dangerous, how to pass PHY packet to device driver */
fp->mode.common.tcode |= FWTCODE_PHY;
if (firewire_debug)
printf("send phy_config root_node=%d gap_count=%d\n",
root_node, gap_count);
fw_asyreq(fc, -1, xfer);
}
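/*
 * Illustrative sketch, not part of this change: bit layout of the PHY
 * configuration quadlet assembled above. The macro names are
 * hypothetical; only the shifts and masks are taken from the code.
 */
#if 0
#define PHY_CONFIG_ROOT_ID(q) (((q) >> 24) & 0x3f) /* bits 29..24: root_ID */
#define PHY_CONFIG_R(q) (((q) >> 23) & 1) /* bit 23: force-root valid */
#define PHY_CONFIG_T(q) (((q) >> 22) & 1) /* bit 22: gap_count valid */
#define PHY_CONFIG_GAP_CNT(q) (((q) >> 16) & 0x3f) /* bits 21..16: gap_count */
/* mode.ld[2] carries the bit-wise inverse of ld[1] for error checking. */
#endif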
#if 0
/*
* Dump self ID.
*/
static void
fw_print_sid(uint32_t sid)
{
union fw_self_id *s;
s = (union fw_self_id *) &sid;
printf("node:%d link:%d gap:%d spd:%d del:%d con:%d pwr:%d"
" p0:%d p1:%d p2:%d i:%d m:%d\n",
s->p0.phy_id, s->p0.link_active, s->p0.gap_count,
s->p0.phy_speed, s->p0.phy_delay, s->p0.contender,
s->p0.power_class, s->p0.port0, s->p0.port1,
s->p0.port2, s->p0.initiated_reset, s->p0.more_packets);
}
#endif
/*
* To receive self ID.
*/
void fw_sidrcv(struct firewire_comm* fc, uint32_t *sid, u_int len)
{
uint32_t *p;
union fw_self_id *self_id;
u_int i, j, node, c_port = 0, i_branch = 0;
fc->sid_cnt = len /(sizeof(uint32_t) * 2);
fc->max_node = fc->nodeid & 0x3f;
CSRARC(fc, NODE_IDS) = ((uint32_t)fc->nodeid) << 16;
fc->status = FWBUSCYMELECT;
fc->topology_map->crc_len = 2;
fc->topology_map->generation ++;
fc->topology_map->self_id_count = 0;
fc->topology_map->node_count = 0;
fc->speed_map->generation ++;
fc->speed_map->crc_len = 1 + (64*64 + 3) / 4;
self_id = &fc->topology_map->self_id[0];
for(i = 0; i < fc->sid_cnt; i ++){
if (sid[1] != ~sid[0]) {
printf("fw_sidrcv: invalid self-id packet\n");
sid += 2;
continue;
}
*self_id = *((union fw_self_id *)sid);
fc->topology_map->crc_len++;
if(self_id->p0.sequel == 0){
fc->topology_map->node_count ++;
c_port = 0;
#if 0
fw_print_sid(sid[0]);
#endif
node = self_id->p0.phy_id;
if(fc->max_node < node){
fc->max_node = self_id->p0.phy_id;
}
/* XXX I'm not sure this is the right speed_map */
fc->speed_map->speed[node][node]
= self_id->p0.phy_speed;
for (j = 0; j < node; j ++) {
fc->speed_map->speed[j][node]
= fc->speed_map->speed[node][j]
= min(fc->speed_map->speed[j][j],
self_id->p0.phy_speed);
}
if ((fc->irm == -1 || self_id->p0.phy_id > fc->irm) &&
(self_id->p0.link_active && self_id->p0.contender)) {
fc->irm = self_id->p0.phy_id;
}
if(self_id->p0.port0 >= 0x2){
c_port++;
}
if(self_id->p0.port1 >= 0x2){
c_port++;
}
if(self_id->p0.port2 >= 0x2){
c_port++;
}
}
if(c_port > 2){
i_branch += (c_port - 2);
}
sid += 2;
self_id++;
fc->topology_map->self_id_count ++;
}
device_printf(fc->bdev, "%d nodes", fc->max_node + 1);
/* CRC */
fc->topology_map->crc = fw_crc16(
(uint32_t *)&fc->topology_map->generation,
fc->topology_map->crc_len * 4);
fc->speed_map->crc = fw_crc16(
(uint32_t *)&fc->speed_map->generation,
fc->speed_map->crc_len * 4);
/* byteswap and copy to CSR */
p = (uint32_t *)fc->topology_map;
for (i = 0; i <= fc->topology_map->crc_len; i++)
CSRARC(fc, TOPO_MAP + i * 4) = htonl(*p++);
p = (uint32_t *)fc->speed_map;
CSRARC(fc, SPED_MAP) = htonl(*p++);
CSRARC(fc, SPED_MAP + 4) = htonl(*p++);
/* don't byte-swap uint8_t array */
bcopy(p, &CSRARC(fc, SPED_MAP + 8), (fc->speed_map->crc_len - 1)*4);
fc->max_hop = fc->max_node - i_branch;
printf(", maxhop <= %d", fc->max_hop);
if(fc->irm == -1 ){
printf(", Not found IRM capable node");
}else{
printf(", cable IRM = %d", fc->irm);
if (fc->irm == fc->nodeid)
printf(" (me)");
}
printf("\n");
if (try_bmr && (fc->irm != -1) && (CSRARC(fc, BUS_MGR_ID) == 0x3f)) {
if (fc->irm == fc->nodeid) {
fc->status = FWBUSMGRDONE;
CSRARC(fc, BUS_MGR_ID) = fc->set_bmr(fc, fc->irm);
fw_bmr(fc);
} else {
fc->status = FWBUSMGRELECT;
callout_reset(&fc->bmr_callout, hz/8,
(void *)fw_try_bmr, (void *)fc);
}
} else
fc->status = FWBUSMGRDONE;
callout_reset(&fc->busprobe_callout, hz/4,
(void *)fw_bus_probe, (void *)fc);
}
/*
* To probe devices on the IEEE1394 bus.
*/
static void
fw_bus_probe(struct firewire_comm *fc)
{
int s;
struct fw_device *fwdev;
s = splfw();
fc->status = FWBUSEXPLORE;
/* Invalidate all devices, just after bus reset. */
STAILQ_FOREACH(fwdev, &fc->devices, link)
if (fwdev->status != FWDEVINVAL) {
fwdev->status = FWDEVINVAL;
fwdev->rcnt = 0;
}
splx(s);
wakeup((void *)fc);
}
static int
fw_explore_read_quads(struct fw_device *fwdev, int offset,
uint32_t *quad, int n)
{
struct fw_xfer *xfer;
uint32_t tmp;
int i, error;
for (i = 0; i < n; i ++, offset += sizeof(uint32_t)) {
xfer = fwmem_read_quad(fwdev, NULL, -1,
0xffff, 0xf0000000 | offset, (void *)&tmp,
fw_xferwake);
if (xfer == NULL)
return (-1);
fw_xferwait(xfer);
if (xfer->resp == 0)
quad[i] = ntohl(tmp);
error = xfer->resp;
fw_xfer_free(xfer);
if (error)
return (error);
}
return (0);
}
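/*
 * Illustrative sketch, not part of this change: the reads above target
 * the configuration ROM, which starts at CSRROMOFF within the CSR
 * register space; the 48-bit address is built as below. The macro
 * names are hypothetical; the code above uses the literals directly.
 */
#if 0
#define CSR_REG_BASE_HI 0xffff /* upper 16 bits of the 48-bit address */
#define CSR_REG_BASE_LO 0xf0000000 /* register space base, low 32 bits */
/* quadlet address = base_hi:(base_lo + CSRROMOFF + offset) */
#endif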
static int
fw_explore_csrblock(struct fw_device *fwdev, int offset, int recur)
{
int err, i, off;
struct csrdirectory *dir;
struct csrreg *reg;
dir = (struct csrdirectory *)&fwdev->csrrom[offset/sizeof(uint32_t)];
err = fw_explore_read_quads(fwdev, CSRROMOFF + offset,
(uint32_t *)dir, 1);
if (err)
return (-1);
offset += sizeof(uint32_t);
reg = (struct csrreg *)&fwdev->csrrom[offset/sizeof(uint32_t)];
err = fw_explore_read_quads(fwdev, CSRROMOFF + offset,
(uint32_t *)reg, dir->crc_len);
if (err)
return (-1);
/* XXX check CRC */
off = CSRROMOFF + offset + sizeof(uint32_t) * (dir->crc_len - 1);
if (fwdev->rommax < off)
fwdev->rommax = off;
if (recur == 0)
return (0);
for (i = 0; i < dir->crc_len; i ++, offset += sizeof(uint32_t)) {
if ((reg[i].key & CSRTYPE_MASK) == CSRTYPE_D)
recur = 1;
else if ((reg[i].key & CSRTYPE_MASK) == CSRTYPE_L)
recur = 0;
else
continue;
off = offset + reg[i].val * sizeof(uint32_t);
if (off > CROMSIZE) {
printf("%s: invalid offset %d\n", __FUNCTION__, off);
return(-1);
}
err = fw_explore_csrblock(fwdev, off, recur);
if (err)
return (-1);
}
return (0);
}
static int
fw_explore_node(struct fw_device *dfwdev)
{
struct firewire_comm *fc;
struct fw_device *fwdev, *pfwdev, *tfwdev;
uint32_t *csr;
struct csrhdr *hdr;
struct bus_info *binfo;
int err, node, spd;
fc = dfwdev->fc;
csr = dfwdev->csrrom;
node = dfwdev->dst;
/* First quad */
err = fw_explore_read_quads(dfwdev, CSRROMOFF, &csr[0], 1);
if (err)
return (-1);
hdr = (struct csrhdr *)&csr[0];
if (hdr->info_len != 4) {
if (firewire_debug)
printf("node%d: wrong bus info len(%d)\n",
node, hdr->info_len);
return (-1);
}
/* bus info */
err = fw_explore_read_quads(dfwdev, CSRROMOFF + 0x04, &csr[1], 4);
if (err)
return (-1);
binfo = (struct bus_info *)&csr[1];
if (binfo->bus_name != CSR_BUS_NAME_IEEE1394) {
if (firewire_debug)
printf("node%d: invalid bus name 0x%08x\n",
node, binfo->bus_name);
return (-1);
}
spd = fc->speed_map->speed[fc->nodeid][node];
STAILQ_FOREACH(fwdev, &fc->devices, link)
if (FW_EUI64_EQUAL(fwdev->eui, binfo->eui64))
break;
if (fwdev == NULL) {
/* new device */
fwdev = malloc(sizeof(struct fw_device), M_FW,
M_NOWAIT | M_ZERO);
if (fwdev == NULL) {
if (firewire_debug)
printf("node%d: no memory\n", node);
return (-1);
}
fwdev->fc = fc;
fwdev->eui = binfo->eui64;
/* insert into sorted fwdev list */
pfwdev = NULL;
STAILQ_FOREACH(tfwdev, &fc->devices, link) {
if (tfwdev->eui.hi > fwdev->eui.hi ||
(tfwdev->eui.hi == fwdev->eui.hi &&
tfwdev->eui.lo > fwdev->eui.lo))
break;
pfwdev = tfwdev;
}
if (pfwdev == NULL)
STAILQ_INSERT_HEAD(&fc->devices, fwdev, link);
else
STAILQ_INSERT_AFTER(&fc->devices, pfwdev, fwdev, link);
device_printf(fc->bdev, "New %s device ID:%08x%08x\n",
linkspeed[spd],
fwdev->eui.hi, fwdev->eui.lo);
}
fwdev->dst = node;
fwdev->status = FWDEVINIT;
fwdev->speed = spd;
/* unchanged ? */
if (bcmp(&csr[0], &fwdev->csrrom[0], sizeof(uint32_t) * 5) == 0) {
if (firewire_debug)
printf("node%d: crom unchanged\n", node);
return (0);
}
bzero(&fwdev->csrrom[0], CROMSIZE);
/* copy first quad and bus info block */
bcopy(&csr[0], &fwdev->csrrom[0], sizeof(uint32_t) * 5);
fwdev->rommax = CSRROMOFF + sizeof(uint32_t) * 4;
err = fw_explore_csrblock(fwdev, 0x14, 1); /* root directory */
if (err) {
fwdev->status = FWDEVINVAL;
fwdev->csrrom[0] = 0;
}
return (err);
}
/*
* Find the self_id packet for a node, ignoring sequels.
*/
static union fw_self_id *
fw_find_self_id(struct firewire_comm *fc, int node)
{
uint32_t i;
union fw_self_id *s;
for (i = 0; i < fc->topology_map->self_id_count; i++) {
s = &fc->topology_map->self_id[i];
if (s->p0.sequel)
continue;
if (s->p0.phy_id == node)
return s;
}
return 0;
}
static void
fw_explore(struct firewire_comm *fc)
{
int node, err, s, i, todo, todo2, trys;
char nodes[63];
struct fw_device dfwdev;
union fw_self_id *fwsid;
todo = 0;
/* setup dummy fwdev */
dfwdev.fc = fc;
dfwdev.speed = 0;
dfwdev.maxrec = 8; /* 512 */
dfwdev.status = FWDEVINIT;
for (node = 0; node <= fc->max_node; node ++) {
/* Don't probe ourselves or link-down nodes */
if (node == fc->nodeid)
continue;
fwsid = fw_find_self_id(fc, node);
if (!fwsid || !fwsid->p0.link_active) {
if (firewire_debug)
printf("node%d: link down\n", node);
continue;
}
nodes[todo++] = node;
}
s = splfw();
for (trys = 0; todo > 0 && trys < 3; trys ++) {
todo2 = 0;
for (i = 0; i < todo; i ++) {
dfwdev.dst = nodes[i];
err = fw_explore_node(&dfwdev);
if (err)
nodes[todo2++] = nodes[i];
if (firewire_debug)
printf("%s: node %d, err = %d\n",
__FUNCTION__, nodes[i], err);
}
todo = todo2;
}
splx(s);
}
static void
fw_bus_probe_thread(void *arg)
{
struct firewire_comm *fc;
fc = (struct firewire_comm *)arg;
mtx_lock(&fc->wait_lock);
while (fc->status != FWBUSDETACH) {
if (fc->status == FWBUSEXPLORE) {
mtx_unlock(&fc->wait_lock);
fw_explore(fc);
fc->status = FWBUSEXPDONE;
if (firewire_debug)
printf("bus_explore done\n");
fw_attach_dev(fc);
mtx_lock(&fc->wait_lock);
}
msleep((void *)fc, &fc->wait_lock, PWAIT|PCATCH, "-", 0);
}
mtx_unlock(&fc->wait_lock);
kproc_exit(0);
}
/*
* To attach the sub-device layer to the IEEE1394 bus.
*/
static void
fw_attach_dev(struct firewire_comm *fc)
{
struct fw_device *fwdev, *next;
int i, err;
device_t *devlistp;
int devcnt;
struct firewire_dev_comm *fdc;
for (fwdev = STAILQ_FIRST(&fc->devices); fwdev != NULL; fwdev = next) {
next = STAILQ_NEXT(fwdev, link);
if (fwdev->status == FWDEVINIT) {
fwdev->status = FWDEVATTACHED;
} else if (fwdev->status == FWDEVINVAL) {
fwdev->rcnt ++;
if (fwdev->rcnt > hold_count) {
/*
* Remove devices which have not been seen
* for a while.
*/
STAILQ_REMOVE(&fc->devices, fwdev, fw_device,
link);
free(fwdev, M_FW);
}
}
}
err = device_get_children(fc->bdev, &devlistp, &devcnt);
if( err != 0 )
return;
for( i = 0 ; i < devcnt ; i++){
if (device_get_state(devlistp[i]) >= DS_ATTACHED) {
fdc = device_get_softc(devlistp[i]);
if (fdc->post_explore != NULL)
fdc->post_explore(fdc);
}
}
free(devlistp, M_TEMP);
return;
}
/*
* To allocate unique transaction label.
*/
static int
fw_get_tlabel(struct firewire_comm *fc, struct fw_xfer *xfer)
{
u_int dst, new_tlabel;
struct fw_xfer *txfer;
int s;
dst = xfer->send.hdr.mode.hdr.dst & 0x3f;
s = splfw();
mtx_lock(&fc->tlabel_lock);
new_tlabel = (fc->last_tlabel[dst] + 1) & 0x3f;
STAILQ_FOREACH(txfer, &fc->tlabels[new_tlabel], tlabel)
if ((txfer->send.hdr.mode.hdr.dst & 0x3f) == dst)
break;
if(txfer == NULL) {
fc->last_tlabel[dst] = new_tlabel;
STAILQ_INSERT_TAIL(&fc->tlabels[new_tlabel], xfer, tlabel);
mtx_unlock(&fc->tlabel_lock);
splx(s);
xfer->tl = new_tlabel;
xfer->send.hdr.mode.hdr.tlrt = new_tlabel << 2;
if (firewire_debug > 1)
printf("fw_get_tlabel: dst=%d tl=%d\n", dst, new_tlabel);
return (new_tlabel);
}
mtx_unlock(&fc->tlabel_lock);
splx(s);
if (firewire_debug > 1)
printf("fw_get_tlabel: no free tlabel\n");
return (-1);
}
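/*
 * Illustrative sketch, not part of this change: the transaction label
 * and retry code share the header's tlrt field; the label occupies the
 * upper six bits, as the shifts above and in fw_dump_hdr() show. The
 * macro names are hypothetical.
 */
#if 0
#define TLRT_PACK(tl, rt) ((((tl) & 0x3f) << 2) | ((rt) & 3))
#define TLRT_TL(tlrt) (((tlrt) >> 2) & 0x3f)
#define TLRT_RT(tlrt) ((tlrt) & 3)
#endif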
static void
fw_rcv_copy(struct fw_rcv_buf *rb)
{
struct fw_pkt *pkt;
u_char *p;
struct tcode_info *tinfo;
u_int res, i, len, plen;
rb->xfer->recv.spd = rb->spd;
pkt = (struct fw_pkt *)rb->vec->iov_base;
tinfo = &rb->fc->tcode[pkt->mode.hdr.tcode];
/* Copy header */
p = (u_char *)&rb->xfer->recv.hdr;
bcopy(rb->vec->iov_base, p, tinfo->hdr_len);
rb->vec->iov_base = (u_char *)rb->vec->iov_base + tinfo->hdr_len;
rb->vec->iov_len -= tinfo->hdr_len;
/* Copy payload */
p = (u_char *)rb->xfer->recv.payload;
res = rb->xfer->recv.pay_len;
/* special handling for RRESQ */
if (pkt->mode.hdr.tcode == FWTCODE_RRESQ &&
p != NULL && res >= sizeof(uint32_t)) {
*(uint32_t *)p = pkt->mode.rresq.data;
rb->xfer->recv.pay_len = sizeof(uint32_t);
return;
}
if ((tinfo->flag & FWTI_BLOCK_ASY) == 0)
return;
plen = pkt->mode.rresb.len;
for (i = 0; i < rb->nvec; i++, rb->vec++) {
len = MIN(rb->vec->iov_len, plen);
if (res < len) {
printf("rcv buffer(%d) is %d bytes short.\n",
rb->xfer->recv.pay_len, len - res);
len = res;
}
bcopy(rb->vec->iov_base, p, len);
p += len;
res -= len;
plen -= len;
if (res == 0 || plen == 0)
break;
}
rb->xfer->recv.pay_len -= res;
}
/*
* Generic packet receiving process.
*/
void
fw_rcv(struct fw_rcv_buf *rb)
{
struct fw_pkt *fp, *resfp;
struct fw_bind *bind;
int tcode;
int i, len, oldstate;
#if 0
{
uint32_t *qld;
int i;
qld = (uint32_t *)buf;
printf("spd %d len:%d\n", spd, len);
for( i = 0 ; i <= len && i < 32; i+= 4){
printf("0x%08x ", ntohl(qld[i/4]));
if((i % 16) == 15) printf("\n");
}
if((i % 16) != 15) printf("\n");
}
#endif
fp = (struct fw_pkt *)rb->vec[0].iov_base;
tcode = fp->mode.common.tcode;
switch (tcode) {
case FWTCODE_WRES:
case FWTCODE_RRESQ:
case FWTCODE_RRESB:
case FWTCODE_LRES:
rb->xfer = fw_tl2xfer(rb->fc, fp->mode.hdr.src,
fp->mode.hdr.tlrt >> 2, fp->mode.hdr.tcode);
if(rb->xfer == NULL) {
printf("fw_rcv: unknown response "
"%s(%x) src=0x%x tl=0x%x rt=%d data=0x%x\n",
tcode_str[tcode], tcode,
fp->mode.hdr.src,
fp->mode.hdr.tlrt >> 2,
fp->mode.hdr.tlrt & 3,
fp->mode.rresq.data);
#if 0
printf("try ad-hoc work around!!\n");
rb->xfer = fw_tl2xfer(rb->fc, fp->mode.hdr.src,
(fp->mode.hdr.tlrt >> 2)^3);
if (rb->xfer == NULL) {
printf("no use...\n");
return;
}
#else
return;
#endif
}
fw_rcv_copy(rb);
if (rb->xfer->recv.hdr.mode.wres.rtcode != RESP_CMP)
rb->xfer->resp = EIO;
else
rb->xfer->resp = 0;
/* make sure the packet is drained in AT queue */
oldstate = rb->xfer->flag;
rb->xfer->flag = FWXF_RCVD;
switch (oldstate) {
case FWXF_SENT:
fw_xfer_done(rb->xfer);
break;
case FWXF_START:
#if 0
if (firewire_debug)
printf("not sent yet tl=%x\n", rb->xfer->tl);
#endif
break;
default:
printf("unexpected flag 0x%02x\n", rb->xfer->flag);
}
return;
case FWTCODE_WREQQ:
case FWTCODE_WREQB:
case FWTCODE_RREQQ:
case FWTCODE_RREQB:
case FWTCODE_LREQ:
bind = fw_bindlookup(rb->fc, fp->mode.rreqq.dest_hi,
fp->mode.rreqq.dest_lo);
if(bind == NULL){
printf("Unknown service addr 0x%04x:0x%08x %s(%x)"
#if defined(__DragonFly__) || __FreeBSD_version < 500000
" src=0x%x data=%lx\n",
#else
" src=0x%x data=%x\n",
#endif
fp->mode.wreqq.dest_hi, fp->mode.wreqq.dest_lo,
tcode_str[tcode], tcode,
fp->mode.hdr.src, ntohl(fp->mode.wreqq.data));
if (rb->fc->status == FWBUSINIT) {
printf("fw_rcv: cannot respond(bus reset)!\n");
return;
}
rb->xfer = fw_xfer_alloc(M_FWXFER);
if(rb->xfer == NULL){
return;
}
rb->xfer->send.spd = rb->spd;
rb->xfer->send.pay_len = 0;
resfp = &rb->xfer->send.hdr;
switch (tcode) {
case FWTCODE_WREQQ:
case FWTCODE_WREQB:
resfp->mode.hdr.tcode = FWTCODE_WRES;
break;
case FWTCODE_RREQQ:
resfp->mode.hdr.tcode = FWTCODE_RRESQ;
break;
case FWTCODE_RREQB:
resfp->mode.hdr.tcode = FWTCODE_RRESB;
break;
case FWTCODE_LREQ:
resfp->mode.hdr.tcode = FWTCODE_LRES;
break;
}
resfp->mode.hdr.dst = fp->mode.hdr.src;
resfp->mode.hdr.tlrt = fp->mode.hdr.tlrt;
resfp->mode.hdr.pri = fp->mode.hdr.pri;
resfp->mode.rresb.rtcode = RESP_ADDRESS_ERROR;
resfp->mode.rresb.extcode = 0;
resfp->mode.rresb.len = 0;
/*
rb->xfer->hand = fw_xferwake;
*/
rb->xfer->hand = fw_xfer_free;
if(fw_asyreq(rb->fc, -1, rb->xfer)){
fw_xfer_free(rb->xfer);
return;
}
return;
}
len = 0;
for (i = 0; i < rb->nvec; i ++)
len += rb->vec[i].iov_len;
rb->xfer = STAILQ_FIRST(&bind->xferlist);
if (rb->xfer == NULL) {
#if 1
printf("Discard a packet for this bind.\n");
#endif
return;
}
STAILQ_REMOVE_HEAD(&bind->xferlist, link);
fw_rcv_copy(rb);
rb->xfer->hand(rb->xfer);
return;
#if 0 /* shouldn't happen ?? or for GASP */
case FWTCODE_STREAM:
{
struct fw_xferq *xferq;
xferq = rb->fc->ir[sub];
#if 0
printf("stream rcv dma %d len %d off %d spd %d\n",
sub, len, off, spd);
#endif
if(xferq->queued >= xferq->maxq) {
printf("receive queue is full\n");
return;
}
/* XXX get xfer from xfer queue, we don't need copy for
per packet mode */
rb->xfer = fw_xfer_alloc_buf(M_FWXFER, 0, /* XXX */
vec[0].iov_len);
if (rb->xfer == NULL)
return;
fw_rcv_copy(rb);
s = splfw();
xferq->queued++;
STAILQ_INSERT_TAIL(&xferq->q, rb->xfer, link);
splx(s);
sc = device_get_softc(rb->fc->bdev);
#if defined(__DragonFly__) || __FreeBSD_version < 500000
if (&xferq->rsel.si_pid != 0)
#else
if (SEL_WAITING(&xferq->rsel))
#endif
selwakeuppri(&xferq->rsel, FWPRI);
if (xferq->flag & FWXFERQ_WAKEUP) {
xferq->flag &= ~FWXFERQ_WAKEUP;
wakeup((caddr_t)xferq);
}
if (xferq->flag & FWXFERQ_HANDLER) {
xferq->hand(xferq);
}
return;
break;
}
#endif
default:
printf("fw_rcv: unknow tcode %d\n", tcode);
break;
}
}
/*
* Post process for Bus Manager election process.
*/
static void
fw_try_bmr_callback(struct fw_xfer *xfer)
{
struct firewire_comm *fc;
int bmr;
if (xfer == NULL)
return;
fc = xfer->fc;
if (xfer->resp != 0)
goto error;
if (xfer->recv.payload == NULL)
goto error;
if (xfer->recv.hdr.mode.lres.rtcode != FWRCODE_COMPLETE)
goto error;
bmr = ntohl(xfer->recv.payload[0]);
if (bmr == 0x3f)
bmr = fc->nodeid;
CSRARC(fc, BUS_MGR_ID) = fc->set_bmr(fc, bmr & 0x3f);
fw_xfer_free_buf(xfer);
fw_bmr(fc);
return;
error:
device_printf(fc->bdev, "bus manager election failed\n");
fw_xfer_free_buf(xfer);
}
/*
* To become a candidate in the bus manager election.
*/
static void
fw_try_bmr(void *arg)
{
struct fw_xfer *xfer;
struct firewire_comm *fc = (struct firewire_comm *)arg;
struct fw_pkt *fp;
int err = 0;
xfer = fw_xfer_alloc_buf(M_FWXFER, 8, 4);
if(xfer == NULL){
return;
}
xfer->send.spd = 0;
fc->status = FWBUSMGRELECT;
fp = &xfer->send.hdr;
fp->mode.lreq.dest_hi = 0xffff;
fp->mode.lreq.tlrt = 0;
fp->mode.lreq.tcode = FWTCODE_LREQ;
fp->mode.lreq.pri = 0;
fp->mode.lreq.src = 0;
fp->mode.lreq.len = 8;
fp->mode.lreq.extcode = EXTCODE_CMP_SWAP;
fp->mode.lreq.dst = FWLOCALBUS | fc->irm;
fp->mode.lreq.dest_lo = 0xf0000000 | BUS_MGR_ID;
xfer->send.payload[0] = htonl(0x3f);
xfer->send.payload[1] = htonl(fc->nodeid);
xfer->hand = fw_try_bmr_callback;
err = fw_asyreq(fc, -1, xfer);
if(err){
fw_xfer_free_buf(xfer);
return;
}
return;
}
#ifdef FW_VMACCESS
/*
* Software implementation of physical memory block access.
* XXX: Too slow; use for debugging purposes only.
*/
static void
fw_vmaccess(struct fw_xfer *xfer){
struct fw_pkt *rfp, *sfp = NULL;
uint32_t *ld = (uint32_t *)xfer->recv.buf;
printf("vmaccess spd:%2x len:%03x data:%08x %08x %08x %08x\n",
xfer->spd, xfer->recv.len, ntohl(ld[0]), ntohl(ld[1]), ntohl(ld[2]), ntohl(ld[3]));
printf("vmaccess data:%08x %08x %08x %08x\n", ntohl(ld[4]), ntohl(ld[5]), ntohl(ld[6]), ntohl(ld[7]));
if(xfer->resp != 0){
fw_xfer_free( xfer);
return;
}
if(xfer->recv.buf == NULL){
fw_xfer_free( xfer);
return;
}
rfp = (struct fw_pkt *)xfer->recv.buf;
switch(rfp->mode.hdr.tcode){
/* XXX need fix for 64bit arch */
case FWTCODE_WREQB:
xfer->send.buf = malloc(12, M_FW, M_NOWAIT);
xfer->send.len = 12;
sfp = (struct fw_pkt *)xfer->send.buf;
bcopy(rfp->mode.wreqb.payload,
(caddr_t)ntohl(rfp->mode.wreqb.dest_lo), ntohs(rfp->mode.wreqb.len));
sfp->mode.wres.tcode = FWTCODE_WRES;
sfp->mode.wres.rtcode = 0;
break;
case FWTCODE_WREQQ:
xfer->send.buf = malloc(12, M_FW, M_NOWAIT);
xfer->send.len = 12;
sfp = (struct fw_pkt *)xfer->send.buf;
sfp->mode.wres.tcode = FWTCODE_WRES;
*((uint32_t *)(ntohl(rfp->mode.wreqb.dest_lo))) = rfp->mode.wreqq.data;
sfp->mode.wres.rtcode = 0;
break;
case FWTCODE_RREQB:
xfer->send.buf = malloc(16 + rfp->mode.rreqb.len, M_FW, M_NOWAIT);
xfer->send.len = 16 + ntohs(rfp->mode.rreqb.len);
sfp = (struct fw_pkt *)xfer->send.buf;
bcopy((caddr_t)ntohl(rfp->mode.rreqb.dest_lo),
sfp->mode.rresb.payload, (uint16_t)ntohs(rfp->mode.rreqb.len));
sfp->mode.rresb.tcode = FWTCODE_RRESB;
sfp->mode.rresb.len = rfp->mode.rreqb.len;
sfp->mode.rresb.rtcode = 0;
sfp->mode.rresb.extcode = 0;
break;
case FWTCODE_RREQQ:
xfer->send.buf = malloc(16, M_FW, M_NOWAIT);
xfer->send.len = 16;
sfp = (struct fw_pkt *)xfer->send.buf;
sfp->mode.rresq.data = *(uint32_t *)(ntohl(rfp->mode.rreqq.dest_lo));
sfp->mode.wres.tcode = FWTCODE_RRESQ;
sfp->mode.rresb.rtcode = 0;
break;
default:
fw_xfer_free( xfer);
return;
}
sfp->mode.hdr.dst = rfp->mode.hdr.src;
xfer->dst = ntohs(rfp->mode.hdr.src);
xfer->hand = fw_xfer_free;
sfp->mode.hdr.tlrt = rfp->mode.hdr.tlrt;
sfp->mode.hdr.pri = 0;
fw_asyreq(xfer->fc, -1, xfer);
/**/
return;
}
#endif
/*
* CRC16 check-sum for IEEE1394 register blocks.
*/
uint16_t
fw_crc16(uint32_t *ptr, uint32_t len){
uint32_t i, sum, crc = 0;
int shift;
len = (len + 3) & ~3;
for(i = 0 ; i < len ; i+= 4){
for( shift = 28 ; shift >= 0 ; shift -= 4){
sum = ((crc >> 12) ^ (ptr[i/4] >> shift)) & 0xf;
crc = (crc << 4) ^ ( sum << 12 ) ^ ( sum << 5) ^ sum;
}
crc &= 0xffff;
}
return((uint16_t) crc);
}
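/*
 * Illustrative sketch, not part of this change: as in fw_sidrcv() above,
 * the CRC covers crc_len quadlets starting at the word after the
 * header, so a caller passes crc_len * 4 bytes. The function name is
 * hypothetical.
 */
#if 0
static void
example_topology_crc(struct fw_topology_map *tmap)
{
tmap->crc = fw_crc16((uint32_t *)&tmap->generation,
tmap->crc_len * 4);
}
#endif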
static int
fw_bmr(struct firewire_comm *fc)
{
struct fw_device fwdev;
union fw_self_id *self_id;
int cmstr;
uint32_t quad;
/* Check to see if the current root node is cycle master capable */
self_id = fw_find_self_id(fc, fc->max_node);
if (fc->max_node > 0) {
/* XXX check cmc bit of businfo block rather than contender */
if (self_id->p0.link_active && self_id->p0.contender)
cmstr = fc->max_node;
else {
device_printf(fc->bdev,
"root node is not cycle master capable\n");
/* XXX shall we be the cycle master? */
cmstr = fc->nodeid;
/* XXX need bus reset */
}
} else
cmstr = -1;
device_printf(fc->bdev, "bus manager %d ", CSRARC(fc, BUS_MGR_ID));
if(CSRARC(fc, BUS_MGR_ID) != fc->nodeid) {
/* We are not the bus manager */
printf("\n");
return(0);
}
printf("(me)\n");
/* Optimize gapcount */
if(fc->max_hop <= MAX_GAPHOP )
fw_phy_config(fc, cmstr, gap_cnt[fc->max_hop]);
/* If we are the cycle master, nothing to do */
if (cmstr == fc->nodeid || cmstr == -1)
return 0;
/* Bus probe has not finished, make dummy fwdev for cmstr */
bzero(&fwdev, sizeof(fwdev));
fwdev.fc = fc;
fwdev.dst = cmstr;
fwdev.speed = 0;
fwdev.maxrec = 8; /* 512 */
fwdev.status = FWDEVINIT;
/* Set cmstr bit on the cycle master */
quad = htonl(1 << 8);
fwmem_write_quad(&fwdev, NULL, 0/*spd*/,
0xffff, 0xf0000000 | STATE_SET, &quad, fw_asy_callback_free);
return 0;
}
int
fw_open_isodma(struct firewire_comm *fc, int tx)
{
struct fw_xferq **xferqa;
struct fw_xferq *xferq;
int i;
if (tx)
xferqa = &fc->it[0];
else
xferqa = &fc->ir[0];
FW_GLOCK(fc);
for (i = 0; i < fc->nisodma; i ++) {
xferq = xferqa[i];
if ((xferq->flag & FWXFERQ_OPEN) == 0) {
xferq->flag |= FWXFERQ_OPEN;
break;
}
}
if (i == fc->nisodma) {
printf("no free dma channel (tx=%d)\n", tx);
i = -1;
}
FW_GUNLOCK(fc);
return (i);
}
static int
fw_modevent(module_t mode, int type, void *data)
{
int err = 0;
#if defined(__FreeBSD__) && __FreeBSD_version >= 500000
static eventhandler_tag fwdev_ehtag = NULL;
#endif
switch (type) {
case MOD_LOAD:
#if defined(__FreeBSD__) && __FreeBSD_version >= 500000
fwdev_ehtag = EVENTHANDLER_REGISTER(dev_clone,
fwdev_clone, 0, 1000);
#endif
break;
case MOD_UNLOAD:
#if defined(__FreeBSD__) && __FreeBSD_version >= 500000
if (fwdev_ehtag != NULL)
EVENTHANDLER_DEREGISTER(dev_clone, fwdev_ehtag);
#endif
break;
case MOD_SHUTDOWN:
break;
default:
return (EOPNOTSUPP);
}
return (err);
}
#ifdef __DragonFly__
DECLARE_DUMMY_MODULE(firewire);
#endif
DRIVER_MODULE(firewire,fwohci,firewire_driver,firewire_devclass,fw_modevent,0);
MODULE_VERSION(firewire, 1);
Index: head/sys/fs/cd9660/cd9660_rrip.c
===================================================================
--- head/sys/fs/cd9660/cd9660_rrip.c (revision 183549)
+++ head/sys/fs/cd9660/cd9660_rrip.c (revision 183550)
@@ -1,718 +1,720 @@
/*-
* Copyright (c) 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley
* by Pace Willisson (pace@blitz.com). The Rock Ridge Extension
* Support code is derived from software contributed to Berkeley
* by Atsushi Murai (amurai@spec.co.jp).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cd9660_rrip.c 8.6 (Berkeley) 12/5/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/vimage.h>
#include <fs/cd9660/iso.h>
#include <fs/cd9660/cd9660_node.h>
#include <fs/cd9660/cd9660_rrip.h>
#include <fs/cd9660/iso_rrip.h>
typedef int rrt_func_t(void *, ISO_RRIP_ANALYZE *ana);
typedef struct {
char type[2];
rrt_func_t *func;
void (*func2)(struct iso_directory_record *isodir, ISO_RRIP_ANALYZE *ana);
int result;
} RRIP_TABLE;
static int cd9660_rrip_altname(ISO_RRIP_ALTNAME *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_attr(ISO_RRIP_ATTR *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_cont(ISO_RRIP_CONT *p, ISO_RRIP_ANALYZE *ana);
static void cd9660_rrip_defattr(struct iso_directory_record *isodir,
ISO_RRIP_ANALYZE *ana);
static void cd9660_rrip_defname(struct iso_directory_record *isodir,
ISO_RRIP_ANALYZE *ana);
static void cd9660_rrip_deftstamp(struct iso_directory_record *isodir,
ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_device(ISO_RRIP_DEVICE *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_extref(ISO_RRIP_EXTREF *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_idflag(ISO_RRIP_IDFLAG *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_loop(struct iso_directory_record *isodir,
ISO_RRIP_ANALYZE *ana, RRIP_TABLE *table);
static int cd9660_rrip_pclink(ISO_RRIP_CLINK *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_reldir(ISO_RRIP_RELDIR *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_slink(ISO_RRIP_SLINK *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_stop(ISO_SUSP_HEADER *p, ISO_RRIP_ANALYZE *ana);
static int cd9660_rrip_tstamp(ISO_RRIP_TSTAMP *p, ISO_RRIP_ANALYZE *ana);
/*
* POSIX file attribute
*/
static int
cd9660_rrip_attr(p,ana)
ISO_RRIP_ATTR *p;
ISO_RRIP_ANALYZE *ana;
{
ana->inop->inode.iso_mode = isonum_733(p->mode);
ana->inop->inode.iso_uid = isonum_733(p->uid);
ana->inop->inode.iso_gid = isonum_733(p->gid);
ana->inop->inode.iso_links = isonum_733(p->links);
ana->fields &= ~ISO_SUSP_ATTR;
return ISO_SUSP_ATTR;
}
static void
cd9660_rrip_defattr(isodir,ana)
struct iso_directory_record *isodir;
ISO_RRIP_ANALYZE *ana;
{
/* But this is a required field! */
printf("RRIP without PX field?\n");
cd9660_defattr(isodir,ana->inop,NULL,ISO_FTYPE_RRIP);
}
/*
* Symbolic Links
*/
static int
cd9660_rrip_slink(p,ana)
ISO_RRIP_SLINK *p;
ISO_RRIP_ANALYZE *ana;
{
+ INIT_VPROCG(TD_TO_VPROCG(curthread));
ISO_RRIP_SLINK_COMPONENT *pcomp;
ISO_RRIP_SLINK_COMPONENT *pcompe;
int len, wlen, cont;
char *outbuf, *inbuf;
pcomp = (ISO_RRIP_SLINK_COMPONENT *)p->component;
pcompe = (ISO_RRIP_SLINK_COMPONENT *)((char *)p + isonum_711(p->h.length));
len = *ana->outlen;
outbuf = ana->outbuf;
cont = ana->cont;
/*
* Gather the symbolic link name from each path component
*/
for (;
pcomp < pcompe;
pcomp = (ISO_RRIP_SLINK_COMPONENT *)((char *)pcomp + ISO_RRIP_SLSIZ
+ isonum_711(pcomp->clen))) {
if (!cont) {
if (len < ana->maxlen) {
len++;
*outbuf++ = '/';
}
}
cont = 0;
inbuf = "..";
wlen = 0;
switch (*pcomp->cflag) {
case ISO_SUSP_CFLAG_CURRENT:
/* Inserting Current */
wlen = 1;
break;
case ISO_SUSP_CFLAG_PARENT:
/* Inserting Parent */
wlen = 2;
break;
case ISO_SUSP_CFLAG_ROOT:
/* Inserting slash for ROOT */
/* Double slash, nothing really to do here. */
break;
case ISO_SUSP_CFLAG_VOLROOT:
/* Inserting a mount point i.e. "/cdrom" */
/* same as above */
outbuf -= len;
len = 0;
inbuf = ana->imp->im_mountp->mnt_stat.f_mntonname;
wlen = strlen(inbuf);
break;
case ISO_SUSP_CFLAG_HOST:
/* XXXRW: locking. */
/* Inserting hostname i.e. "kurt.tools.de" */
inbuf = V_hostname;
wlen = strlen(V_hostname);
break;
case ISO_SUSP_CFLAG_CONTINUE:
cont = 1;
/* FALLTHROUGH */
case 0:
/* Inserting component */
wlen = isonum_711(pcomp->clen);
inbuf = pcomp->name;
break;
default:
printf("RRIP with incorrect flags?");
wlen = ana->maxlen + 1;
break;
}
if (len + wlen > ana->maxlen) {
/* indicate error to caller */
ana->cont = 1;
ana->fields = 0;
ana->outbuf -= *ana->outlen;
*ana->outlen = 0;
return 0;
}
bcopy(inbuf,outbuf,wlen);
outbuf += wlen;
len += wlen;
}
ana->outbuf = outbuf;
*ana->outlen = len;
ana->cont = cont;
if (!isonum_711(p->flags)) {
ana->fields &= ~ISO_SUSP_SLINK;
return ISO_SUSP_SLINK;
}
return 0;
}
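/*
 * Illustrative sketch, not part of this change: each SL component
 * record is { cflag, clen, name }, so a hypothetical target "../X11"
 * would be encoded as the two records below, re-joined with '/' by the
 * loop above.
 */
#if 0
static const unsigned char sl_example[] = {
ISO_SUSP_CFLAG_PARENT, 0, /* component 1: ".." */
0, 3, 'X', '1', '1', /* component 2: "X11" */
};
#endif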
/*
* Alternate name
*/
static int
cd9660_rrip_altname(p,ana)
ISO_RRIP_ALTNAME *p;
ISO_RRIP_ANALYZE *ana;
{
+ INIT_VPROCG(TD_TO_VPROCG(curthread));
char *inbuf;
int wlen;
int cont;
inbuf = "..";
wlen = 0;
cont = 0;
switch (*p->flags) {
case ISO_SUSP_CFLAG_CURRENT:
/* Inserting Current */
wlen = 1;
break;
case ISO_SUSP_CFLAG_PARENT:
/* Inserting Parent */
wlen = 2;
break;
case ISO_SUSP_CFLAG_HOST:
/* XXXRW: locking. */
/* Inserting hostname i.e. "kurt.tools.de" */
inbuf = V_hostname;
wlen = strlen(V_hostname);
break;
case ISO_SUSP_CFLAG_CONTINUE:
cont = 1;
/* FALLTHROUGH */
case 0:
/* Inserting component */
wlen = isonum_711(p->h.length) - 5;
inbuf = (char *)p + 5;
break;
default:
printf("RRIP with incorrect NM flags?\n");
wlen = ana->maxlen + 1;
break;
}
if ((*ana->outlen += wlen) > ana->maxlen) {
/* treat as no name field */
ana->fields &= ~ISO_SUSP_ALTNAME;
ana->outbuf -= *ana->outlen - wlen;
*ana->outlen = 0;
return 0;
}
bcopy(inbuf,ana->outbuf,wlen);
ana->outbuf += wlen;
if (!cont) {
ana->fields &= ~ISO_SUSP_ALTNAME;
return ISO_SUSP_ALTNAME;
}
return 0;
}
static void
cd9660_rrip_defname(isodir,ana)
struct iso_directory_record *isodir;
ISO_RRIP_ANALYZE *ana;
{
isofntrans(isodir->name,isonum_711(isodir->name_len),
ana->outbuf,ana->outlen,
1,isonum_711(isodir->flags)&4, ana->imp->joliet_level,
ana->imp->im_flags, ana->imp->im_d2l);
switch (*ana->outbuf) {
default:
break;
case 1:
*ana->outlen = 2;
/* FALLTHROUGH */
case 0:
/* outlen is 1 already */
strcpy(ana->outbuf,"..");
break;
}
}
/*
* Parent or Child Link
*/
static int
cd9660_rrip_pclink(p,ana)
ISO_RRIP_CLINK *p;
ISO_RRIP_ANALYZE *ana;
{
*ana->inump = isonum_733(p->dir_loc) << ana->imp->im_bshift;
ana->fields &= ~(ISO_SUSP_CLINK|ISO_SUSP_PLINK);
return *p->h.type == 'C' ? ISO_SUSP_CLINK : ISO_SUSP_PLINK;
}
/*
* Relocated directory
*/
static int
cd9660_rrip_reldir(p,ana)
ISO_RRIP_RELDIR *p;
ISO_RRIP_ANALYZE *ana;
{
/* special hack to make caller aware of RE field */
*ana->outlen = 0;
ana->fields = 0;
return ISO_SUSP_RELDIR|ISO_SUSP_ALTNAME|ISO_SUSP_CLINK|ISO_SUSP_PLINK;
}
static int
cd9660_rrip_tstamp(p,ana)
ISO_RRIP_TSTAMP *p;
ISO_RRIP_ANALYZE *ana;
{
u_char *ptime;
ptime = p->time;
/* Check the time stamp format (7 bytes or 17 bytes) */
if (!(*p->flags&ISO_SUSP_TSTAMP_FORM17)) {
if (*p->flags&ISO_SUSP_TSTAMP_CREAT)
ptime += 7;
if (*p->flags&ISO_SUSP_TSTAMP_MODIFY) {
cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_mtime,
ISO_FTYPE_RRIP);
ptime += 7;
} else
bzero(&ana->inop->inode.iso_mtime,sizeof(struct timespec));
if (*p->flags&ISO_SUSP_TSTAMP_ACCESS) {
cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_atime,
ISO_FTYPE_RRIP);
ptime += 7;
} else
ana->inop->inode.iso_atime = ana->inop->inode.iso_mtime;
if (*p->flags&ISO_SUSP_TSTAMP_ATTR)
cd9660_tstamp_conv7(ptime,&ana->inop->inode.iso_ctime,
ISO_FTYPE_RRIP);
else
ana->inop->inode.iso_ctime = ana->inop->inode.iso_mtime;
} else {
if (*p->flags&ISO_SUSP_TSTAMP_CREAT)
ptime += 17;
if (*p->flags&ISO_SUSP_TSTAMP_MODIFY) {
cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_mtime);
ptime += 17;
} else
bzero(&ana->inop->inode.iso_mtime,sizeof(struct timespec));
if (*p->flags&ISO_SUSP_TSTAMP_ACCESS) {
cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_atime);
ptime += 17;
} else
ana->inop->inode.iso_atime = ana->inop->inode.iso_mtime;
if (*p->flags&ISO_SUSP_TSTAMP_ATTR)
cd9660_tstamp_conv17(ptime,&ana->inop->inode.iso_ctime);
else
ana->inop->inode.iso_ctime = ana->inop->inode.iso_mtime;
}
ana->fields &= ~ISO_SUSP_TSTAMP;
return ISO_SUSP_TSTAMP;
}
static void
cd9660_rrip_deftstamp(isodir,ana)
struct iso_directory_record *isodir;
ISO_RRIP_ANALYZE *ana;
{
cd9660_deftstamp(isodir,ana->inop,NULL,ISO_FTYPE_RRIP);
}
/*
* POSIX device modes
*/
static int
cd9660_rrip_device(p,ana)
ISO_RRIP_DEVICE *p;
ISO_RRIP_ANALYZE *ana;
{
u_int high, low;
high = isonum_733(p->dev_t_high);
low = isonum_733(p->dev_t_low);
if (high == 0)
ana->inop->inode.iso_rdev = makedev(umajor(low), uminor(low));
else
ana->inop->inode.iso_rdev = makedev(high, uminor(low));
ana->fields &= ~ISO_SUSP_DEVICE;
return ISO_SUSP_DEVICE;
}
/*
* Flag field indicating which RRIP fields are present
*/
static int
cd9660_rrip_idflag(p,ana)
ISO_RRIP_IDFLAG *p;
ISO_RRIP_ANALYZE *ana;
{
ana->fields &= isonum_711(p->flags)|~0xff; /* don't touch high bits */
/* special handling of RE field */
if (ana->fields&ISO_SUSP_RELDIR)
return cd9660_rrip_reldir(/* XXX */ (ISO_RRIP_RELDIR *)p,ana);
return ISO_SUSP_IDFLAG;
}
/*
* Continuation pointer
*/
static int
cd9660_rrip_cont(p,ana)
ISO_RRIP_CONT *p;
ISO_RRIP_ANALYZE *ana;
{
ana->iso_ce_blk = isonum_733(p->location);
ana->iso_ce_off = isonum_733(p->offset);
ana->iso_ce_len = isonum_733(p->length);
return ISO_SUSP_CONT;
}
/*
* System Use end
*/
static int
cd9660_rrip_stop(p,ana)
ISO_SUSP_HEADER *p;
ISO_RRIP_ANALYZE *ana;
{
return ISO_SUSP_STOP;
}
/*
* Extension reference
*/
static int
cd9660_rrip_extref(p,ana)
ISO_RRIP_EXTREF *p;
ISO_RRIP_ANALYZE *ana;
{
if (isonum_711(p->len_id) != 10
|| bcmp((char *)p + 8,"RRIP_1991A",10)
|| isonum_711(p->version) != 1)
return 0;
ana->fields &= ~ISO_SUSP_EXTREF;
return ISO_SUSP_EXTREF;
}
static int
cd9660_rrip_loop(isodir,ana,table)
struct iso_directory_record *isodir;
ISO_RRIP_ANALYZE *ana;
RRIP_TABLE *table;
{
RRIP_TABLE *ptable;
ISO_SUSP_HEADER *phead;
ISO_SUSP_HEADER *pend;
struct buf *bp = NULL;
char *pwhead;
u_short c;
int result;
/*
* Note: if the name length is even, a pad byte
* follows the name.
*/
pwhead = isodir->name + isonum_711(isodir->name_len);
if (!(isonum_711(isodir->name_len)&1))
pwhead++;
isochar(isodir->name, pwhead, ana->imp->joliet_level, &c, NULL,
ana->imp->im_flags, ana->imp->im_d2l);
/* If it's not the '.' entry of the root dir, obey the SP field */
if (c != 0 || isonum_733(isodir->extent) != ana->imp->root_extent)
pwhead += ana->imp->rr_skip;
else
pwhead += ana->imp->rr_skip0;
phead = (ISO_SUSP_HEADER *)pwhead;
pend = (ISO_SUSP_HEADER *)((char *)isodir + isonum_711(isodir->length));
result = 0;
while (1) {
ana->iso_ce_len = 0;
/*
* Note: "pend" should be more than one SUSP header
*/
while (pend >= phead + 1) {
if (isonum_711(phead->version) == 1) {
for (ptable = table; ptable->func; ptable++) {
if (*phead->type == *ptable->type
&& phead->type[1] == ptable->type[1]) {
result |= ptable->func(phead,ana);
break;
}
}
if (!ana->fields)
break;
}
if (result&ISO_SUSP_STOP) {
result &= ~ISO_SUSP_STOP;
break;
}
/* plausibility check */
if (isonum_711(phead->length) < sizeof(*phead))
break;
/*
* move to next SUSP
* Hopefully this works with newer versions, too
*/
phead = (ISO_SUSP_HEADER *)((char *)phead + isonum_711(phead->length));
}
if (ana->fields && ana->iso_ce_len) {
if (ana->iso_ce_blk >= ana->imp->volume_space_size
|| ana->iso_ce_off + ana->iso_ce_len > ana->imp->logical_block_size
|| bread(ana->imp->im_devvp,
ana->iso_ce_blk <<
(ana->imp->im_bshift - DEV_BSHIFT),
ana->imp->logical_block_size, NOCRED, &bp))
/* what to do now? */
break;
phead = (ISO_SUSP_HEADER *)(bp->b_data + ana->iso_ce_off);
pend = (ISO_SUSP_HEADER *) ((char *)phead + ana->iso_ce_len);
} else
break;
}
if (bp)
brelse(bp);
/*
* If we didn't find the basic SUSP fields, just set default values
* (attributes/time stamps).
*/
for (ptable = table; ptable->func2; ptable++)
if (!(ptable->result&result))
ptable->func2(isodir,ana);
return result;
}
/*
* Get Attributes.
*/
/*
* XXX the casts are bogus but will do for now.
*/
#define BC (rrt_func_t *)
static RRIP_TABLE rrip_table_analyze[] = {
{ "PX", BC cd9660_rrip_attr, cd9660_rrip_defattr, ISO_SUSP_ATTR },
{ "TF", BC cd9660_rrip_tstamp, cd9660_rrip_deftstamp, ISO_SUSP_TSTAMP },
{ "PN", BC cd9660_rrip_device, 0, ISO_SUSP_DEVICE },
{ "RR", BC cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG },
{ "CE", BC cd9660_rrip_cont, 0, ISO_SUSP_CONT },
{ "ST", BC cd9660_rrip_stop, 0, ISO_SUSP_STOP },
{ "", 0, 0, 0 }
};
int
cd9660_rrip_analyze(isodir,inop,imp)
struct iso_directory_record *isodir;
struct iso_node *inop;
struct iso_mnt *imp;
{
ISO_RRIP_ANALYZE analyze;
analyze.inop = inop;
analyze.imp = imp;
analyze.fields = ISO_SUSP_ATTR|ISO_SUSP_TSTAMP|ISO_SUSP_DEVICE;
return cd9660_rrip_loop(isodir,&analyze,rrip_table_analyze);
}
/*
* Get Alternate Name.
*/
static RRIP_TABLE rrip_table_getname[] = {
{ "NM", BC cd9660_rrip_altname, cd9660_rrip_defname, ISO_SUSP_ALTNAME },
{ "CL", BC cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK },
{ "PL", BC cd9660_rrip_pclink, 0, ISO_SUSP_CLINK|ISO_SUSP_PLINK },
{ "RE", BC cd9660_rrip_reldir, 0, ISO_SUSP_RELDIR },
{ "RR", BC cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG },
{ "CE", BC cd9660_rrip_cont, 0, ISO_SUSP_CONT },
{ "ST", BC cd9660_rrip_stop, 0, ISO_SUSP_STOP },
{ "", 0, 0, 0 }
};
int
cd9660_rrip_getname(isodir,outbuf,outlen,inump,imp)
struct iso_directory_record *isodir;
char *outbuf;
u_short *outlen;
ino_t *inump;
struct iso_mnt *imp;
{
ISO_RRIP_ANALYZE analyze;
RRIP_TABLE *tab;
u_short c;
analyze.outbuf = outbuf;
analyze.outlen = outlen;
analyze.maxlen = NAME_MAX;
analyze.inump = inump;
analyze.imp = imp;
analyze.fields = ISO_SUSP_ALTNAME|ISO_SUSP_RELDIR|ISO_SUSP_CLINK|ISO_SUSP_PLINK;
*outlen = 0;
isochar(isodir->name, isodir->name + isonum_711(isodir->name_len),
imp->joliet_level, &c, NULL, imp->im_flags, imp->im_d2l);
tab = rrip_table_getname;
if (c == 0 || c == 1) {
cd9660_rrip_defname(isodir,&analyze);
analyze.fields &= ~ISO_SUSP_ALTNAME;
tab++;
}
return cd9660_rrip_loop(isodir,&analyze,tab);
}
/*
* Get Symbolic Link.
*/
static RRIP_TABLE rrip_table_getsymname[] = {
{ "SL", BC cd9660_rrip_slink, 0, ISO_SUSP_SLINK },
{ "RR", BC cd9660_rrip_idflag, 0, ISO_SUSP_IDFLAG },
{ "CE", BC cd9660_rrip_cont, 0, ISO_SUSP_CONT },
{ "ST", BC cd9660_rrip_stop, 0, ISO_SUSP_STOP },
{ "", 0, 0, 0 }
};
int
cd9660_rrip_getsymname(isodir,outbuf,outlen,imp)
struct iso_directory_record *isodir;
char *outbuf;
u_short *outlen;
struct iso_mnt *imp;
{
ISO_RRIP_ANALYZE analyze;
analyze.outbuf = outbuf;
analyze.outlen = outlen;
*outlen = 0;
analyze.maxlen = MAXPATHLEN;
analyze.cont = 1; /* don't start with a slash */
analyze.imp = imp;
analyze.fields = ISO_SUSP_SLINK;
return (cd9660_rrip_loop(isodir,&analyze,rrip_table_getsymname)&ISO_SUSP_SLINK);
}
static RRIP_TABLE rrip_table_extref[] = {
{ "ER", BC cd9660_rrip_extref, 0, ISO_SUSP_EXTREF },
{ "CE", BC cd9660_rrip_cont, 0, ISO_SUSP_CONT },
{ "ST", BC cd9660_rrip_stop, 0, ISO_SUSP_STOP },
{ "", 0, 0, 0 }
};
/*
* Check for Rock Ridge Extension and return offset of its fields.
* Note: We insist on the ER field.
*/
int
cd9660_rrip_offset(isodir,imp)
struct iso_directory_record *isodir;
struct iso_mnt *imp;
{
ISO_RRIP_OFFSET *p;
ISO_RRIP_ANALYZE analyze;
imp->rr_skip0 = 0;
p = (ISO_RRIP_OFFSET *)(isodir->name + 1);
if (bcmp(p,"SP\7\1\276\357",6)) {
/* Maybe, it's a CDROM XA disc? */
imp->rr_skip0 = 15;
p = (ISO_RRIP_OFFSET *)((char *)p + 15);
if (bcmp(p,"SP\7\1\276\357",6))
return -1;
}
analyze.imp = imp;
analyze.fields = ISO_SUSP_EXTREF;
if (!(cd9660_rrip_loop(isodir,&analyze,rrip_table_extref)&ISO_SUSP_EXTREF))
return -1;
return isonum_711(p->skip);
}
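/*
 * Illustrative note, not part of this change: the 6 bytes matched by
 * bcmp() above are the SUSP "SP" record header:
 *
 *   'S' 'P'  0x07    0x01     0xbe 0xef
 *   signature, length, version, check bytes ("\276\357" in octal).
 *
 * The following byte, p->skip, gives the per-record bytes to skip.
 */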
Index: head/sys/kern/kern_jail.c
===================================================================
--- head/sys/kern/kern_jail.c (revision 183549)
+++ head/sys/kern/kern_jail.c (revision 183550)
@@ -1,1000 +1,1001 @@
/*-
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/taskqueue.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <netinet/in.h>
#include <security/mac/mac_framework.h>
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
"Jail rules");
int jail_set_hostname_allowed = 1;
SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
&jail_set_hostname_allowed, 0,
"Processes in jail can set their hostnames");
int jail_socket_unixiproute_only = 1;
SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
&jail_socket_unixiproute_only, 0,
"Processes in jail are limited to creating UNIX/IPv4/route sockets only");
int jail_sysvipc_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
&jail_sysvipc_allowed, 0,
"Processes in jail can use System V IPC primitives");
static int jail_enforce_statfs = 2;
SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
&jail_enforce_statfs, 0,
"Processes in jail cannot see all mounted file systems");
int jail_allow_raw_sockets = 0;
SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
&jail_allow_raw_sockets, 0,
"Prison root can create raw sockets");
int jail_chflags_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
&jail_chflags_allowed, 0,
"Processes in jail can alter system file flags");
int jail_mount_allowed = 0;
SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
&jail_mount_allowed, 0,
"Processes in jail can mount/unmount jail-friendly file systems");
/* allprison, lastprid, and prisoncount are protected by allprison_lock. */
struct prisonlist allprison;
struct sx allprison_lock;
int lastprid = 0;
int prisoncount = 0;
/*
* List of jail services. Protected by allprison_lock.
*/
TAILQ_HEAD(prison_services_head, prison_service);
static struct prison_services_head prison_services =
TAILQ_HEAD_INITIALIZER(prison_services);
static int prison_service_slots = 0;
struct prison_service {
prison_create_t ps_create;
prison_destroy_t ps_destroy;
int ps_slotno;
TAILQ_ENTRY(prison_service) ps_next;
char ps_name[0];
};
static void init_prison(void *);
static void prison_complete(void *context, int pending);
static int sysctl_jail_list(SYSCTL_HANDLER_ARGS);
static void
init_prison(void *data __unused)
{
sx_init(&allprison_lock, "allprison");
LIST_INIT(&allprison);
}
SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
/*
* struct jail_args {
* struct jail *jail;
* };
*/
int
jail(struct thread *td, struct jail_args *uap)
{
struct nameidata nd;
struct prison *pr, *tpr;
struct prison_service *psrv;
struct jail j;
struct jail_attach_args jaa;
int vfslocked, error, tryprid;
error = copyin(uap->jail, &j, sizeof(j));
if (error)
return (error);
if (j.version != 0)
return (EINVAL);
MALLOC(pr, struct prison *, sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
pr->pr_ref = 1;
error = copyinstr(j.path, &pr->pr_path, sizeof(pr->pr_path), 0);
if (error)
goto e_killmtx;
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
pr->pr_path, td);
error = namei(&nd);
if (error)
goto e_killmtx;
vfslocked = NDHASGIANT(&nd);
pr->pr_root = nd.ni_vp;
VOP_UNLOCK(nd.ni_vp, 0);
NDFREE(&nd, NDF_ONLY_PNBUF);
VFS_UNLOCK_GIANT(vfslocked);
error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
if (error)
goto e_dropvnref;
pr->pr_ip = j.ip_number;
pr->pr_linux = NULL;
pr->pr_securelevel = securelevel;
if (prison_service_slots == 0)
pr->pr_slots = NULL;
else {
pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
M_PRISON, M_ZERO | M_WAITOK);
}
/* Determine next pr_id and add prison to allprison list. */
sx_xlock(&allprison_lock);
tryprid = lastprid + 1;
if (tryprid == JAIL_MAX)
tryprid = 1;
next:
LIST_FOREACH(tpr, &allprison, pr_list) {
if (tpr->pr_id == tryprid) {
tryprid++;
if (tryprid == JAIL_MAX) {
sx_xunlock(&allprison_lock);
error = EAGAIN;
goto e_dropvnref;
}
goto next;
}
}
pr->pr_id = jaa.jid = lastprid = tryprid;
LIST_INSERT_HEAD(&allprison, pr, pr_list);
prisoncount++;
sx_downgrade(&allprison_lock);
TAILQ_FOREACH(psrv, &prison_services, ps_next) {
psrv->ps_create(psrv, pr);
}
sx_sunlock(&allprison_lock);
error = jail_attach(td, &jaa);
if (error)
goto e_dropprref;
mtx_lock(&pr->pr_mtx);
pr->pr_ref--;
mtx_unlock(&pr->pr_mtx);
td->td_retval[0] = jaa.jid;
return (0);
e_dropprref:
sx_xlock(&allprison_lock);
LIST_REMOVE(pr, pr_list);
prisoncount--;
sx_downgrade(&allprison_lock);
TAILQ_FOREACH(psrv, &prison_services, ps_next) {
psrv->ps_destroy(psrv, pr);
}
sx_sunlock(&allprison_lock);
e_dropvnref:
if (pr->pr_slots != NULL)
FREE(pr->pr_slots, M_PRISON);
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vrele(pr->pr_root);
VFS_UNLOCK_GIANT(vfslocked);
e_killmtx:
mtx_destroy(&pr->pr_mtx);
FREE(pr, M_PRISON);
return (error);
}
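/*
 * Illustrative sketch, not part of this change: a minimal userland
 * caller of the version-0 jail(2) interface handled above. The path,
 * hostname and address are made up; note that pr_ip is kept in host
 * byte order, hence the ntohl().
 */
#if 0
#include <sys/param.h>
#include <sys/jail.h>
#include <arpa/inet.h>

int
example_jail(void)
{
struct jail j;

j.version = 0; /* only version 0 is accepted above */
j.path = "/jails/test"; /* copied into pr_path, becomes the new root */
j.hostname = "test.example.org"; /* copied into pr_host */
j.ip_number = ntohl(inet_addr("192.0.2.1")); /* pr_ip, host order */
return (jail(&j)); /* returns the new jid, or -1 on error */
}
#endif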
/*
* struct jail_attach_args {
* int jid;
* };
*/
int
jail_attach(struct thread *td, struct jail_attach_args *uap)
{
struct proc *p;
struct ucred *newcred, *oldcred;
struct prison *pr;
int vfslocked, error;
/*
* XXX: Note that there is a slight race here if two threads
* in the same privileged process attempt to attach to two
* different jails at the same time. It is important for
* user processes not to do this, or they might end up with
* a process root from one prison, but attached to the jail
* of another.
*/
error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
p = td->td_proc;
sx_slock(&allprison_lock);
pr = prison_find(uap->jid);
if (pr == NULL) {
sx_sunlock(&allprison_lock);
return (EINVAL);
}
pr->pr_ref++;
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
if ((error = change_dir(pr->pr_root, td)) != 0)
goto e_unlock;
#ifdef MAC
if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
goto e_unlock;
#endif
VOP_UNLOCK(pr->pr_root, 0);
change_root(pr->pr_root, td);
VFS_UNLOCK_GIANT(vfslocked);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
setsugid(p);
crcopy(newcred, oldcred);
newcred->cr_prison = pr;
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
e_unlock:
VOP_UNLOCK(pr->pr_root, 0);
VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&pr->pr_mtx);
pr->pr_ref--;
mtx_unlock(&pr->pr_mtx);
return (error);
}
/*
* Returns a locked prison instance, or NULL on failure.
*/
struct prison *
prison_find(int prid)
{
struct prison *pr;
sx_assert(&allprison_lock, SX_LOCKED);
LIST_FOREACH(pr, &allprison, pr_list) {
if (pr->pr_id == prid) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref == 0) {
mtx_unlock(&pr->pr_mtx);
break;
}
return (pr);
}
}
return (NULL);
}
void
prison_free(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
pr->pr_ref--;
if (pr->pr_ref == 0) {
mtx_unlock(&pr->pr_mtx);
TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
return;
}
mtx_unlock(&pr->pr_mtx);
}
static void
prison_complete(void *context, int pending)
{
struct prison_service *psrv;
struct prison *pr;
int vfslocked;
pr = (struct prison *)context;
sx_xlock(&allprison_lock);
LIST_REMOVE(pr, pr_list);
prisoncount--;
sx_downgrade(&allprison_lock);
TAILQ_FOREACH(psrv, &prison_services, ps_next) {
psrv->ps_destroy(psrv, pr);
}
sx_sunlock(&allprison_lock);
if (pr->pr_slots != NULL)
FREE(pr->pr_slots, M_PRISON);
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vrele(pr->pr_root);
VFS_UNLOCK_GIANT(vfslocked);
mtx_destroy(&pr->pr_mtx);
if (pr->pr_linux != NULL)
FREE(pr->pr_linux, M_PRISON);
FREE(pr, M_PRISON);
}
void
prison_hold(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
KASSERT(pr->pr_ref > 0,
("Trying to hold dead prison (id=%d).", pr->pr_id));
pr->pr_ref++;
mtx_unlock(&pr->pr_mtx);
}
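/*
 * Usage note (a minimal sketch): every prison_hold() must be paired
 * with a later prison_free(). Dropping the last reference does not
 * free the prison inline; it queues prison_complete() on
 * taskqueue_thread so that the final teardown (vrele(), FREE()) runs
 * in a context that is allowed to sleep.
 */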
u_int32_t
prison_getip(struct ucred *cred)
{
return (cred->cr_prison->pr_ip);
}
int
prison_ip(struct ucred *cred, int flag, u_int32_t *ip)
{
u_int32_t tmp;
if (!jailed(cred))
return (0);
if (flag)
tmp = *ip;
else
tmp = ntohl(*ip);
if (tmp == INADDR_ANY || tmp == INADDR_LOOPBACK) {
if (flag)
*ip = cred->cr_prison->pr_ip;
else
*ip = htonl(cred->cr_prison->pr_ip);
return (0);
}
if (cred->cr_prison->pr_ip != tmp)
return (1);
return (0);
}
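/*
 * Example (a minimal sketch; 'sin' is a hypothetical sockaddr_in):
 *
 *	if (prison_ip(cred, 0, &sin->sin_addr.s_addr))
 *		return (EADDRNOTAVAIL);
 *
 * With flag == 0 the address is taken in network byte order;
 * INADDR_ANY and INADDR_LOOPBACK are rewritten in place to the
 * jail's pr_ip, and any other address that differs from pr_ip makes
 * the function return 1.
 */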
void
prison_remote_ip(struct ucred *cred, int flag, u_int32_t *ip)
{
u_int32_t tmp;
if (!jailed(cred))
return;
if (flag)
tmp = *ip;
else
tmp = ntohl(*ip);
if (tmp == INADDR_LOOPBACK) {
if (flag)
*ip = cred->cr_prison->pr_ip;
else
*ip = htonl(cred->cr_prison->pr_ip);
return;
}
}
int
prison_if(struct ucred *cred, struct sockaddr *sa)
{
struct sockaddr_in *sai;
int ok;
sai = (struct sockaddr_in *)sa;
if ((sai->sin_family != AF_INET) && jail_socket_unixiproute_only)
ok = 1;
else if (sai->sin_family != AF_INET)
ok = 0;
else if (cred->cr_prison->pr_ip != ntohl(sai->sin_addr.s_addr))
ok = 1;
else
ok = 0;
return (ok);
}
/*
* Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
*/
int
prison_check(struct ucred *cred1, struct ucred *cred2)
{
if (jailed(cred1)) {
if (!jailed(cred2))
return (ESRCH);
if (cred2->cr_prison != cred1->cr_prison)
return (ESRCH);
}
return (0);
}
/*
* Return 1 if the passed credential is in a jail, otherwise 0.
*/
int
jailed(struct ucred *cred)
{
return (cred->cr_prison != NULL);
}
/*
* Return the correct hostname for the passed credential.
*/
void
getcredhostname(struct ucred *cred, char *buf, size_t size)
{
+ INIT_VPROCG(cred->cr_vimage->v_procg);
if (jailed(cred)) {
mtx_lock(&cred->cr_prison->pr_mtx);
strlcpy(buf, cred->cr_prison->pr_host, size);
mtx_unlock(&cred->cr_prison->pr_mtx);
} else {
mtx_lock(&hostname_mtx);
strlcpy(buf, V_hostname, size);
mtx_unlock(&hostname_mtx);
}
}
/*
* Determine whether the subject represented by cred can "see"
* status of a mount point.
* Returns: 0 for permitted, ENOENT otherwise.
* XXX: This function should be called cr_canseemount() and should be
* placed in kern_prot.c.
*/
int
prison_canseemount(struct ucred *cred, struct mount *mp)
{
struct prison *pr;
struct statfs *sp;
size_t len;
if (!jailed(cred) || jail_enforce_statfs == 0)
return (0);
pr = cred->cr_prison;
if (pr->pr_root->v_mount == mp)
return (0);
if (jail_enforce_statfs == 2)
return (ENOENT);
/*
* If the jail's chroot directory is set to "/", we should be able to
* see all mount points from inside the jail.
* This is an ugly check, but it is the only situation in which the
* jail's directory ends with '/'.
*/
if (strcmp(pr->pr_path, "/") == 0)
return (0);
len = strlen(pr->pr_path);
sp = &mp->mnt_stat;
if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
return (ENOENT);
/*
* Be sure that we don't have a situation where the jail's root
* directory is "/some/path" and the mount point is "/some/pathpath".
*/
if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
return (ENOENT);
return (0);
}
void
prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
{
char jpath[MAXPATHLEN];
struct prison *pr;
size_t len;
if (!jailed(cred) || jail_enforce_statfs == 0)
return;
pr = cred->cr_prison;
if (prison_canseemount(cred, mp) != 0) {
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
strlcpy(sp->f_mntonname, "[restricted]",
sizeof(sp->f_mntonname));
return;
}
if (pr->pr_root->v_mount == mp) {
/*
* Clear the current buffer data, so we are sure nothing from
* the valid path is left there.
*/
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
*sp->f_mntonname = '/';
return;
}
/*
* If the jail's chroot directory is set to "/", we should be able to
* see all mount points from inside the jail.
*/
if (strcmp(pr->pr_path, "/") == 0)
return;
len = strlen(pr->pr_path);
strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
/*
* Clear the current buffer data, so we are sure nothing from
* the valid path is left there.
*/
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
if (*jpath == '\0') {
/* Should never happen. */
*sp->f_mntonname = '/';
} else {
strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
}
}
/*
* Check whether a specific privilege is granted within the jail. We
* have a specific list of accepted privileges; the rest are denied.
*/
int
prison_priv_check(struct ucred *cred, int priv)
{
if (!jailed(cred))
return (0);
switch (priv) {
/*
* Allow ktrace privileges for root in jail.
*/
case PRIV_KTRACE:
#if 0
/*
* Allow jailed processes to configure audit identity and
* submit audit records (login, etc). In the future we may
* want to further refine the relationship between audit and
* jail.
*/
case PRIV_AUDIT_GETAUDIT:
case PRIV_AUDIT_SETAUDIT:
case PRIV_AUDIT_SUBMIT:
#endif
/*
* Allow jailed processes to manipulate process UNIX
* credentials in any way they see fit.
*/
case PRIV_CRED_SETUID:
case PRIV_CRED_SETEUID:
case PRIV_CRED_SETGID:
case PRIV_CRED_SETEGID:
case PRIV_CRED_SETGROUPS:
case PRIV_CRED_SETREUID:
case PRIV_CRED_SETREGID:
case PRIV_CRED_SETRESUID:
case PRIV_CRED_SETRESGID:
/*
* Jail implements visibility constraints already, so allow
* jailed root to override uid/gid-based constraints.
*/
case PRIV_SEEOTHERGIDS:
case PRIV_SEEOTHERUIDS:
/*
* Jail implements inter-process debugging limits already, so
* allow jailed root various debugging privileges.
*/
case PRIV_DEBUG_DIFFCRED:
case PRIV_DEBUG_SUGID:
case PRIV_DEBUG_UNPRIV:
/*
* Allow jail to set various resource limits and login
* properties, and for now, exceed process resource limits.
*/
case PRIV_PROC_LIMIT:
case PRIV_PROC_SETLOGIN:
case PRIV_PROC_SETRLIMIT:
/*
* System V and POSIX IPC privileges are granted in jail.
*/
case PRIV_IPC_READ:
case PRIV_IPC_WRITE:
case PRIV_IPC_ADMIN:
case PRIV_IPC_MSGSIZE:
case PRIV_MQ_ADMIN:
/*
* Jail implements its own inter-process limits, so allow
* root processes in jail to change scheduling on other
* processes in the same jail. Likewise for signalling.
*/
case PRIV_SCHED_DIFFCRED:
case PRIV_SIGNAL_DIFFCRED:
case PRIV_SIGNAL_SUGID:
/*
* Allow jailed processes to write to sysctls marked as jail
* writable.
*/
case PRIV_SYSCTL_WRITEJAIL:
/*
* Allow root in jail to manage a variety of quota
* properties. These should likely be conditional on a
* configuration option.
*/
case PRIV_VFS_GETQUOTA:
case PRIV_VFS_SETQUOTA:
/*
* Since Jail relies on chroot() to implement file system
* protections, grant many VFS privileges to root in jail.
* Be careful to exclude mount-related and NFS-related
* privileges.
*/
case PRIV_VFS_READ:
case PRIV_VFS_WRITE:
case PRIV_VFS_ADMIN:
case PRIV_VFS_EXEC:
case PRIV_VFS_LOOKUP:
case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
case PRIV_VFS_CHFLAGS_DEV:
case PRIV_VFS_CHOWN:
case PRIV_VFS_CHROOT:
case PRIV_VFS_RETAINSUGID:
case PRIV_VFS_FCHROOT:
case PRIV_VFS_LINK:
case PRIV_VFS_SETGID:
case PRIV_VFS_STAT:
case PRIV_VFS_STICKYFILE:
return (0);
/*
* Depending on the global setting, allow privilege of
* setting system flags.
*/
case PRIV_VFS_SYSFLAGS:
if (jail_chflags_allowed)
return (0);
else
return (EPERM);
/*
* Depending on the global setting, allow privilege of
* mounting/unmounting file systems.
*/
case PRIV_VFS_MOUNT:
case PRIV_VFS_UNMOUNT:
case PRIV_VFS_MOUNT_NONUSER:
case PRIV_VFS_MOUNT_OWNER:
if (jail_mount_allowed)
return (0);
else
return (EPERM);
/*
* Allow jailed root to bind reserved ports and reuse in-use
* ports.
*/
case PRIV_NETINET_RESERVEDPORT:
case PRIV_NETINET_REUSEPORT:
return (0);
/*
* Allow jailed root to set certain IPv4/6 (option) headers.
*/
case PRIV_NETINET_SETHDROPTS:
return (0);
/*
* Conditionally allow creating raw sockets in jail.
*/
case PRIV_NETINET_RAW:
if (jail_allow_raw_sockets)
return (0);
else
return (EPERM);
/*
* Since jail implements its own visibility limits on netstat
* sysctls, allow getcred. This allows identd to work in
* jail.
*/
case PRIV_NETINET_GETCRED:
return (0);
default:
/*
* In all remaining cases, deny the privilege request. This
* includes almost all network privileges and many system
* configuration privileges.
*/
return (EPERM);
}
}
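/*
 * For example, assuming the usual priv_check(9) path into this
 * function: priv_check(td, PRIV_VFS_READ) always succeeds for a
 * jailed thread, priv_check(td, PRIV_VFS_MOUNT) succeeds only while
 * jail_mount_allowed (security.jail.mount_allowed) is non-zero, and
 * any privilege not listed above fails with EPERM.
 */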
/*
* Register a jail service. Provides 'create' and 'destroy' methods.
* The 'create' method will be called for every existing jail and for
* all future jails as they are created.
* The 'destroy' method will be called for every jail going away and
* for all existing jails at the time of service deregistration.
*/
struct prison_service *
prison_service_register(const char *name, prison_create_t create,
prison_destroy_t destroy)
{
struct prison_service *psrv, *psrv2;
struct prison *pr;
int reallocate = 1, slotno = 0;
void **slots, **oldslots;
psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
M_WAITOK | M_ZERO);
psrv->ps_create = create;
psrv->ps_destroy = destroy;
strcpy(psrv->ps_name, name);
/*
* Grab the allprison_lock here, so we won't miss any jail
* creation/destruction.
*/
sx_xlock(&allprison_lock);
#ifdef INVARIANTS
/*
* Verify that the service is not already registered.
*/
TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
KASSERT(strcmp(psrv2->ps_name, name) != 0,
("jail service %s already registered", name));
}
#endif
/*
* Find a free slot. When no existing free slot is available,
* allocate one at the end.
*/
TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
if (psrv2->ps_slotno != slotno) {
KASSERT(slotno < psrv2->ps_slotno,
("Invalid slotno (slotno=%d >= ps_slotno=%d",
slotno, psrv2->ps_slotno));
/* We found a free slot. */
reallocate = 0;
break;
}
slotno++;
}
psrv->ps_slotno = slotno;
/*
* Keep the list sorted by slot number.
*/
if (psrv2 != NULL) {
KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
} else {
KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
}
prison_service_slots++;
sx_downgrade(&allprison_lock);
/*
* Allocate memory for the new slot if we didn't find an empty one.
* Do not use realloc(9), because pr_slots is protected by a mutex,
* so we can't sleep.
*/
LIST_FOREACH(pr, &allprison, pr_list) {
if (reallocate) {
/* First allocate memory with M_WAITOK. */
slots = malloc(sizeof(*slots) * prison_service_slots,
M_PRISON, M_WAITOK);
/* Now grab the mutex and replace pr_slots. */
mtx_lock(&pr->pr_mtx);
oldslots = pr->pr_slots;
if (psrv->ps_slotno > 0) {
bcopy(oldslots, slots,
sizeof(*slots) * (prison_service_slots - 1));
}
slots[psrv->ps_slotno] = NULL;
pr->pr_slots = slots;
mtx_unlock(&pr->pr_mtx);
if (oldslots != NULL)
free(oldslots, M_PRISON);
}
/*
* Call 'create' method for each existing jail.
*/
psrv->ps_create(psrv, pr);
}
sx_sunlock(&allprison_lock);
return (psrv);
}
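/*
 * Example registration (a minimal sketch; the "example" service and
 * its callbacks are hypothetical, assuming int-returning
 * prison_create_t/prison_destroy_t callbacks matching the
 * two-argument ps_create/ps_destroy calls above):
 *
 *	static int
 *	example_create(struct prison_service *psrv, struct prison *pr)
 *	{
 *		void *data;
 *
 *		data = malloc(sizeof(int), M_PRISON, M_WAITOK | M_ZERO);
 *		mtx_lock(&pr->pr_mtx);
 *		prison_service_data_set(psrv, pr, data);
 *		mtx_unlock(&pr->pr_mtx);
 *		return (0);
 *	}
 *
 *	static int
 *	example_destroy(struct prison_service *psrv, struct prison *pr)
 *	{
 *		void *data;
 *
 *		mtx_lock(&pr->pr_mtx);
 *		data = prison_service_data_del(psrv, pr);
 *		mtx_unlock(&pr->pr_mtx);
 *		free(data, M_PRISON);
 *		return (0);
 *	}
 *
 *	psrv = prison_service_register("example", example_create,
 *	    example_destroy);
 */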
void
prison_service_deregister(struct prison_service *psrv)
{
struct prison *pr;
void **slots, **oldslots;
int last = 0;
sx_xlock(&allprison_lock);
if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
last = 1;
TAILQ_REMOVE(&prison_services, psrv, ps_next);
prison_service_slots--;
sx_downgrade(&allprison_lock);
LIST_FOREACH(pr, &allprison, pr_list) {
/*
* Call 'destroy' method for every currently existing jail.
*/
psrv->ps_destroy(psrv, pr);
/*
* If this is the last slot, free the memory allocated for it.
*/
if (last) {
if (prison_service_slots == 0)
slots = NULL;
else {
slots = malloc(sizeof(*slots) * prison_service_slots,
M_PRISON, M_WAITOK);
}
mtx_lock(&pr->pr_mtx);
oldslots = pr->pr_slots;
/*
* We require setting the slot to NULL after freeing it;
* this way we can check for memory leaks here.
*/
KASSERT(oldslots[psrv->ps_slotno] == NULL,
("Slot %d (service %s, jailid=%d) still contains data?",
psrv->ps_slotno, psrv->ps_name, pr->pr_id));
if (psrv->ps_slotno > 0) {
bcopy(oldslots, slots,
sizeof(*slots) * prison_service_slots);
}
pr->pr_slots = slots;
mtx_unlock(&pr->pr_mtx);
KASSERT(oldslots != NULL, ("oldslots == NULL"));
free(oldslots, M_PRISON);
}
}
sx_sunlock(&allprison_lock);
free(psrv, M_PRISON);
}
/*
* Set data for the given jail in the slot assigned to the given
* jail service.
*/
void
prison_service_data_set(struct prison_service *psrv, struct prison *pr,
void *data)
{
mtx_assert(&pr->pr_mtx, MA_OWNED);
pr->pr_slots[psrv->ps_slotno] = data;
}
/*
* Clear the slot assigned to the given jail service in the given
* prison structure and return the current slot data.
*/
void *
prison_service_data_del(struct prison_service *psrv, struct prison *pr)
{
void *data;
mtx_assert(&pr->pr_mtx, MA_OWNED);
data = pr->pr_slots[psrv->ps_slotno];
pr->pr_slots[psrv->ps_slotno] = NULL;
return (data);
}
/*
* Return the current data from the slot assigned to the given jail
* service for the given jail.
*/
void *
prison_service_data_get(struct prison_service *psrv, struct prison *pr)
{
mtx_assert(&pr->pr_mtx, MA_OWNED);
return (pr->pr_slots[psrv->ps_slotno]);
}
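/*
 * Example access pattern (a minimal sketch): all three slot helpers
 * assert pr_mtx, so a caller outside the service callbacks does:
 *
 *	mtx_lock(&pr->pr_mtx);
 *	data = prison_service_data_get(psrv, pr);
 *	mtx_unlock(&pr->pr_mtx);
 */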
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
struct xprison *xp, *sxp;
struct prison *pr;
int count, error;
if (jailed(req->td->td_ucred))
return (0);
sx_slock(&allprison_lock);
if ((count = prisoncount) == 0) {
sx_sunlock(&allprison_lock);
return (0);
}
sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
LIST_FOREACH(pr, &allprison, pr_list) {
xp->pr_version = XPRISON_VERSION;
xp->pr_id = pr->pr_id;
xp->pr_ip = pr->pr_ip;
strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
mtx_lock(&pr->pr_mtx);
strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
mtx_unlock(&pr->pr_mtx);
xp++;
}
sx_sunlock(&allprison_lock);
error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
free(sxp, M_TEMP);
return (error);
}
SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
NULL, 0, sysctl_jail_list, "S", "List of active jails");
static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
{
int error, injail;
injail = jailed(req->td->td_ucred);
error = SYSCTL_OUT(req, &injail, sizeof(injail));
return (error);
}
SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD,
NULL, 0, sysctl_jail_jailed, "I", "Process in jail?");
Index: head/sys/kern/kern_mib.c
===================================================================
--- head/sys/kern/kern_mib.c (revision 183549)
+++ head/sys/kern/kern_mib.c (revision 183550)
@@ -1,465 +1,470 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
* project, to make these variables more user-friendly.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_posix.h"
#include "opt_config.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/jail.h>
#include <sys/smp.h>
#include <sys/unistd.h>
#include <sys/vimage.h>
SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0,
"Sysctl internal magic");
SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0,
"High kernel, proc, limits &c");
SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0,
"Virtual memory");
SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0,
"File system");
SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0,
"Network, (see socket.h)");
SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0,
"Debugging");
SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0,
"Sizeof various things");
SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0,
"hardware");
SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0,
"machine dependent");
SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0,
"user-level");
SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0,
"p1003_1b, (see p1003_1b.h)");
SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0,
"Compatibility code");
SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0,
"Security");
#ifdef REGRESSION
SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0,
"Regression test MIB");
#endif
SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD,
kern_ident, 0, "Kernel identifier");
SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD,
osrelease, 0, "Operating system release");
SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD,
0, BSD, "Operating system revision");
SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD,
version, 0, "Kernel version");
SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD,
ostype, 0, "Operating system type");
/*
* NOTICE: The *userland* release date is available in
* /usr/include/osreldate.h
*/
SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD,
&osreldate, 0, "Kernel release date");
SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN,
&maxproc, 0, "Maximum number of processes");
SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
&maxprocperuid, 0, "Maximum processes allowed per userid");
SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN,
&maxusers, 0, "Hint for kernel tuning");
SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD,
0, ARG_MAX, "Maximum bytes of argument to execve(2)");
SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD,
0, _POSIX_VERSION, "Version of POSIX attempting to comply to");
SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD,
0, NGROUPS_MAX, "Maximum number of groups a user can belong to");
SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD,
0, 1, "Whether job control is available");
#ifdef _POSIX_SAVED_IDS
SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD,
0, 1, "Whether saved set-group/user ID is available");
#else
SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD,
0, 0, "Whether saved set-group/user ID is available");
#endif
char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */
SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW,
kernelname, sizeof kernelname, "Name of kernel file booted");
SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD,
&mp_ncpus, 0, "Number of active CPUs");
SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD,
0, BYTE_ORDER, "System byte order");
SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD,
0, PAGE_SIZE, "System memory page size");
static int
sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
{
char buf[256];
size_t len;
len = req->oldlen;
if (len > sizeof(buf))
len = sizeof(buf);
arc4rand(buf, len, 0);
return (SYSCTL_OUT(req, buf, len));
}
SYSCTL_PROC(_kern, KERN_ARND, arandom, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_kern_arnd, "", "arc4rand");
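/*
 * Userland example (a minimal sketch): fetch up to 256 random bytes
 * per request through kern.arandom:
 *
 *	char buf[256];
 *	size_t len = sizeof(buf);
 *
 *	if (sysctlbyname("kern.arandom", buf, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 */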
static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
{
u_long val;
val = ctob(physmem);
return (sysctl_handle_long(oidp, &val, 0, req));
}
SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD,
0, 0, sysctl_hw_physmem, "LU", "");
static int
sysctl_hw_realmem(SYSCTL_HANDLER_ARGS)
{
u_long val;
val = ctob(realmem);
return (sysctl_handle_long(oidp, &val, 0, req));
}
SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD,
0, 0, sysctl_hw_realmem, "LU", "");
static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
{
u_long val;
val = ctob(physmem - cnt.v_wire_count);
return (sysctl_handle_long(oidp, &val, 0, req));
}
SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD,
0, 0, sysctl_hw_usermem, "LU", "");
SYSCTL_ULONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, "");
static char machine_arch[] = MACHINE_ARCH;
SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD,
machine_arch, 0, "System architecture");
+#ifndef VIMAGE
char hostname[MAXHOSTNAMELEN];
+#endif
/*
* This mutex is used to protect the hostname and domainname variables, and
* perhaps in the future should also protect hostid, hostuuid, and others.
*/
struct mtx hostname_mtx;
MTX_SYSINIT(hostname_mtx, &hostname_mtx, "hostname", MTX_DEF);
static int
sysctl_hostname(SYSCTL_HANDLER_ARGS)
{
+ INIT_VPROCG(TD_TO_VPROCG(req->td));
struct prison *pr;
char tmphostname[MAXHOSTNAMELEN];
int error;
pr = req->td->td_ucred->cr_prison;
if (pr != NULL) {
if (!jail_set_hostname_allowed && req->newptr)
return (EPERM);
/*
* The process is in jail, so make a local copy of the jail's
* hostname to get/set, so we don't have to hold the jail
* mutex during the sysctl copyin/copyout activities.
*/
mtx_lock(&pr->pr_mtx);
bcopy(pr->pr_host, tmphostname, MAXHOSTNAMELEN);
mtx_unlock(&pr->pr_mtx);
error = sysctl_handle_string(oidp, tmphostname,
sizeof pr->pr_host, req);
if (req->newptr != NULL && error == 0) {
/*
* Copy the locally set hostname to the jail, if
* appropriate.
*/
mtx_lock(&pr->pr_mtx);
bcopy(tmphostname, pr->pr_host, MAXHOSTNAMELEN);
mtx_unlock(&pr->pr_mtx);
}
} else {
mtx_lock(&hostname_mtx);
bcopy(V_hostname, tmphostname, MAXHOSTNAMELEN);
mtx_unlock(&hostname_mtx);
error = sysctl_handle_string(oidp, tmphostname,
sizeof tmphostname, req);
if (req->newptr != NULL && error == 0) {
mtx_lock(&hostname_mtx);
bcopy(tmphostname, V_hostname, MAXHOSTNAMELEN);
mtx_unlock(&hostname_mtx);
}
}
return (error);
}
SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON,
0, 0, sysctl_hostname, "A", "Hostname");
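/*
 * Inside a jail the handler above reads and writes pr_host, so e.g.
 *
 *	sysctl kern.hostname=inmate.example.net
 *
 * fails with EPERM unless security.jail.set_hostname_allowed
 * (jail_set_hostname_allowed) is non-zero.
 */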
static int regression_securelevel_nonmonotonic = 0;
#ifdef REGRESSION
SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
&regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
#endif
int securelevel = -1;
static struct mtx securelevel_mtx;
MTX_SYSINIT(securelevel_lock, &securelevel_mtx, "securelevel mutex lock",
MTX_DEF);
static int
sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
int error, level;
pr = req->td->td_ucred->cr_prison;
/*
* If the process is in jail, return the maximum of the global and
* local levels; otherwise, return the global level. Perform a
* lockless read since the securelevel is an integer.
*/
if (pr != NULL)
level = imax(securelevel, pr->pr_securelevel);
else
level = securelevel;
error = sysctl_handle_int(oidp, &level, 0, req);
if (error || !req->newptr)
return (error);
/*
* Permit the update only if the new securelevel is not below the
* global level, nor below the local level if any.
*/
if (pr != NULL) {
mtx_lock(&pr->pr_mtx);
if (!regression_securelevel_nonmonotonic &&
(level < imax(securelevel, pr->pr_securelevel))) {
mtx_unlock(&pr->pr_mtx);
return (EPERM);
}
pr->pr_securelevel = level;
mtx_unlock(&pr->pr_mtx);
} else {
mtx_lock(&securelevel_mtx);
if (!regression_securelevel_nonmonotonic &&
(level < securelevel)) {
mtx_unlock(&securelevel_mtx);
return (EPERM);
}
securelevel = level;
mtx_unlock(&securelevel_mtx);
}
return (error);
}
SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl,
"I", "Current secure level");
#ifdef INCLUDE_CONFIG_FILE
/* Actual kernel configuration options. */
extern char kernconfstring[];
static int
sysctl_kern_config(SYSCTL_HANDLER_ARGS)
{
return (sysctl_handle_string(oidp, kernconfstring,
strlen(kernconfstring), req));
}
SYSCTL_PROC(_kern, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RW,
0, 0, sysctl_kern_config, "", "Kernel configuration file");
#endif
+#ifndef VIMAGE
char domainname[MAXHOSTNAMELEN]; /* Protected by hostname_mtx. */
+#endif
static int
sysctl_domainname(SYSCTL_HANDLER_ARGS)
{
char tmpdomainname[MAXHOSTNAMELEN];
int error;
mtx_lock(&hostname_mtx);
- bcopy(domainname, tmpdomainname, MAXHOSTNAMELEN);
+ bcopy(V_domainname, tmpdomainname, MAXHOSTNAMELEN);
mtx_unlock(&hostname_mtx);
error = sysctl_handle_string(oidp, tmpdomainname,
sizeof tmpdomainname, req);
if (req->newptr != NULL && error == 0) {
mtx_lock(&hostname_mtx);
- bcopy(tmpdomainname, domainname, MAXHOSTNAMELEN);
+ bcopy(tmpdomainname, V_domainname, MAXHOSTNAMELEN);
mtx_unlock(&hostname_mtx);
}
return (error);
}
SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname, CTLTYPE_STRING|CTLFLAG_RW,
0, 0, sysctl_domainname, "A", "Name of the current YP/NIS domain");
u_long hostid;
SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID");
char hostuuid[64] = "00000000-0000-0000-0000-000000000000";
SYSCTL_STRING(_kern, KERN_HOSTUUID, hostuuid, CTLFLAG_RW, hostuuid,
sizeof(hostuuid), "Host UUID");
SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD, 0, "Kernel Features");
#ifdef COMPAT_FREEBSD4
FEATURE(compat_freebsd4, "Compatible with FreeBSD 4");
#endif
#ifdef COMPAT_FREEBSD5
FEATURE(compat_freebsd5, "Compatible with FreeBSD 5");
#endif
#ifdef COMPAT_FREEBSD6
FEATURE(compat_freebsd6, "Compatible with FreeBSD 6");
#endif
#ifdef COMPAT_FREEBSD7
FEATURE(compat_freebsd7, "Compatible with FreeBSD 7");
#endif
/*
* This is really cheating. These actually live in libc, something
* which I'm not quite sure is a good idea anyway, but in order for
* getnext and friends to actually work, we define dummies here.
*/
SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
"", 0, "PATH that finds all the standard utilities");
SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
0, 0, "Max ibase/obase values in bc(1)");
SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
0, 0, "Max array size in bc(1)");
SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
0, 0, "Max scale value in bc(1)");
SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
0, 0, "Max string length in bc(1)");
SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, "");
SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
0, 0, "Max length (bytes) of a text-processing utility's input line");
SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
0, 0, "Maximum number of repeats of a regexp permitted");
SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
0, 0,
"The version of POSIX 1003.2 with which the system attempts to comply");
SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
0, 0, "Whether C development supports the C bindings option");
SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
0, 0, "Whether system supports the C development utilities option");
SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
0, 0, "");
SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
0, 0, "Whether system supports FORTRAN development utilities");
SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
0, 0, "Whether system supports FORTRAN runtime utilities");
SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
0, 0, "Whether system supports creation of locales");
SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
0, 0, "Whether system supports software development utilities");
SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
0, 0, "Whether system supports the user portability utilities");
SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
0, 0, "Min Maximum number of streams a process may have open at one time");
SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
0, 0, "Min Maximum number of types supported for timezone names");
#include <sys/vnode.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
0, sizeof(struct vnode), "sizeof(struct vnode)");
SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
0, sizeof(struct proc), "sizeof(struct proc)");
#include <sys/bio.h>
#include <sys/buf.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
0, sizeof(struct bio), "sizeof(struct bio)");
SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
0, sizeof(struct buf), "sizeof(struct buf)");
#include <sys/user.h>
SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
/* XXX compatibility, remove for 6.0 */
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
&__elfN(fallback_brand), sizeof(__elfN(fallback_brand)),
"compatibility for kern.fallback_elf_brand");
Index: head/sys/kern/kern_uuid.c
===================================================================
--- head/sys/kern/kern_uuid.c (revision 183549)
+++ head/sys/kern/kern_uuid.c (revision 183550)
@@ -1,362 +1,363 @@
/*-
* Copyright (c) 2002 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/uuid.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
/*
* See also:
* http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
* http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
*
* Note that the generator state is itself a UUID, but the time and clock
* sequence fields are written in the native byte order.
*/
CTASSERT(sizeof(struct uuid) == 16);
/* We use an alternative, more convenient representation in the generator. */
struct uuid_private {
union {
uint64_t ll; /* internal. */
struct {
uint32_t low;
uint16_t mid;
uint16_t hi;
} x;
} time;
uint16_t seq; /* Big-endian. */
uint16_t node[UUID_NODE_LEN>>1];
};
CTASSERT(sizeof(struct uuid_private) == 16);
static struct uuid_private uuid_last;
static struct mtx uuid_mutex;
MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
/*
* Return the first MAC address we encounter or, if none was found,
* construct a sufficiently random multicast address. We don't try
* to return the same MAC address as previously returned. We always
* generate a new multicast address if no MAC address exists in the
* system.
* It would be nice to know if 'ifnet' or any of its sub-structures
* has been changed in any way. If not, we could simply skip the
* scan and safely return the MAC address we returned before.
*/
static void
uuid_node(uint16_t *node)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
int i;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/* Walk the address list */
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sdl = (struct sockaddr_dl*)ifa->ifa_addr;
if (sdl != NULL && sdl->sdl_family == AF_LINK &&
sdl->sdl_type == IFT_ETHER) {
/* Got a MAC address. */
bcopy(LLADDR(sdl), node, UUID_NODE_LEN);
IFNET_RUNLOCK();
return;
}
}
}
IFNET_RUNLOCK();
for (i = 0; i < (UUID_NODE_LEN>>1); i++)
node[i] = (uint16_t)arc4random();
*((uint8_t*)node) |= 0x01;
}
/*
* Get the current time as a 60-bit count of 100-nanosecond intervals
* since 00:00:00.00, October 15, 1582. We apply a magic offset to convert
* the Unix time since 00:00:00.00, January 1, 1970 to the date of the
* Gregorian reform to the Christian calendar. The offset,
* 0x01B21DD213814000, is the 141427 days between the two dates
* expressed in 100-ns intervals (141427 * 86400 * 10^7).
*/
static uint64_t
uuid_time(void)
{
struct bintime bt;
uint64_t time = 0x01B21DD213814000LL;
bintime(&bt);
time += (uint64_t)bt.sec * 10000000LL;
time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
return (time & ((1LL << 60) - 1LL));
}
struct uuid *
kern_uuidgen(struct uuid *store, size_t count)
{
struct uuid_private uuid;
uint64_t time;
size_t n;
mtx_lock(&uuid_mutex);
uuid_node(uuid.node);
time = uuid_time();
if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
uuid_last.node[1] != uuid.node[1] ||
uuid_last.node[2] != uuid.node[2])
uuid.seq = (uint16_t)arc4random() & 0x3fff;
else if (uuid_last.time.ll >= time)
uuid.seq = (uuid_last.seq + 1) & 0x3fff;
else
uuid.seq = uuid_last.seq;
uuid_last = uuid;
uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
mtx_unlock(&uuid_mutex);
/* Set sequence and variant and deal with byte order. */
uuid.seq = htobe16(uuid.seq | 0x8000);
for (n = 0; n < count; n++) {
/* Set time and version (=1). */
uuid.time.x.low = (uint32_t)time;
uuid.time.x.mid = (uint16_t)(time >> 32);
uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
store[n] = *(struct uuid *)&uuid;
time++;
}
return (store);
}
#ifndef _SYS_SYSPROTO_H_
struct uuidgen_args {
struct uuid *store;
int count;
};
#endif
int
uuidgen(struct thread *td, struct uuidgen_args *uap)
{
struct uuid *store;
size_t count;
int error;
/*
* Limit the number of UUIDs that can be created at the same time
* to some arbitrary number. This isn't really necessary, but I
* like to have some sort of upper-bound that's less than 2G :-)
* XXX probably needs to be tunable.
*/
if (uap->count < 1 || uap->count > 2048)
return (EINVAL);
count = uap->count;
store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
kern_uuidgen(store, count);
error = copyout(store, uap->store, count * sizeof(struct uuid));
free(store, M_TEMP);
return (error);
}
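/*
 * Userland example (a minimal sketch, via the libc uuidgen(2)
 * wrapper):
 *
 *	struct uuid ids[16];
 *
 *	if (uuidgen(ids, 16) != 0)
 *		err(1, "uuidgen");
 */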
int
snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
{
struct uuid_private *id;
int cnt;
id = (struct uuid_private *)uuid;
cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
return (cnt);
}
int
printf_uuid(struct uuid *uuid)
{
char buf[38];
snprintf_uuid(buf, sizeof(buf), uuid);
return (printf("%s", buf));
}
int
sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
{
char buf[38];
snprintf_uuid(buf, sizeof(buf), uuid);
return (sbuf_printf(sb, "%s", buf));
}
/*
* Encode/Decode UUID into byte-stream.
* http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
*
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | time_low |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | time_mid | time_hi_and_version |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |clk_seq_hi_res | clk_seq_low | node (0-1) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | node (2-5) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
void
le_uuid_enc(void *buf, struct uuid const *uuid)
{
u_char *p;
int i;
p = buf;
le32enc(p, uuid->time_low);
le16enc(p + 4, uuid->time_mid);
le16enc(p + 6, uuid->time_hi_and_version);
p[8] = uuid->clock_seq_hi_and_reserved;
p[9] = uuid->clock_seq_low;
for (i = 0; i < _UUID_NODE_LEN; i++)
p[10 + i] = uuid->node[i];
}
void
le_uuid_dec(void const *buf, struct uuid *uuid)
{
u_char const *p;
int i;
p = buf;
uuid->time_low = le32dec(p);
uuid->time_mid = le16dec(p + 4);
uuid->time_hi_and_version = le16dec(p + 6);
uuid->clock_seq_hi_and_reserved = p[8];
uuid->clock_seq_low = p[9];
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
void
be_uuid_enc(void *buf, struct uuid const *uuid)
{
u_char *p;
int i;
p = buf;
be32enc(p, uuid->time_low);
be16enc(p + 4, uuid->time_mid);
be16enc(p + 6, uuid->time_hi_and_version);
p[8] = uuid->clock_seq_hi_and_reserved;
p[9] = uuid->clock_seq_low;
for (i = 0; i < _UUID_NODE_LEN; i++)
p[10 + i] = uuid->node[i];
}
void
be_uuid_dec(void const *buf, struct uuid *uuid)
{
u_char const *p;
int i;
p = buf;
uuid->time_low = be32dec(p);
uuid->time_mid = be16dec(p + 4);
uuid->time_hi_and_version = be16dec(p + 6);
uuid->clock_seq_hi_and_reserved = p[8];
uuid->clock_seq_low = p[9];
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
int
parse_uuid(const char *str, struct uuid *uuid)
{
u_int c[11];
int n;
/* An empty string represents a nil UUID. */
if (*str == '\0') {
bzero(uuid, sizeof(*uuid));
return (0);
}
/* The UUID string representation has a fixed length. */
if (strlen(str) != 36)
return (EINVAL);
/*
* We only work with "new" UUIDs. New UUIDs have the form:
* 01234567-89ab-cdef-0123-456789abcdef
* The so-called "old" UUIDs, which we don't support, have the form:
* 0123456789ab.cd.ef.01.23.45.67.89.ab
*/
if (str[8] != '-')
return (EINVAL);
n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
/* Make sure we have all conversions. */
if (n != 11)
return (EINVAL);
/* Successful scan. Build the UUID. */
uuid->time_low = c[0];
uuid->time_mid = c[1];
uuid->time_hi_and_version = c[2];
uuid->clock_seq_hi_and_reserved = c[3];
uuid->clock_seq_low = c[4];
for (n = 0; n < 6; n++)
uuid->node[n] = c[n + 5];
/* Check semantics... */
return (((c[3] & 0x80) != 0x00 && /* variant 0? */
(c[3] & 0xc0) != 0x80 && /* variant 1? */
(c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
}
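/*
 * Example: parse_uuid("6ba7b810-9dad-11d1-80b4-00c04fd430c8", &uuid)
 * returns 0: the string scans cleanly and c[3] == 0x80, which the
 * checks above classify as a variant 1 ("new") UUID.
 */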
Index: head/sys/kern/kern_xxx.c
===================================================================
--- head/sys/kern/kern_xxx.c (revision 183549)
+++ head/sys/kern/kern_xxx.c (revision 183550)
@@ -1,291 +1,293 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/utsname.h>
#include <sys/vimage.h>
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct gethostname_args {
char *hostname;
u_int len;
};
#endif
/* ARGSUSED */
int
ogethostname(td, uap)
struct thread *td;
struct gethostname_args *uap;
{
int name[2];
int error;
size_t len = uap->len;
name[0] = CTL_KERN;
name[1] = KERN_HOSTNAME;
mtx_lock(&Giant);
error = userland_sysctl(td, name, 2, uap->hostname, &len,
1, 0, 0, 0, 0);
mtx_unlock(&Giant);
return(error);
}
#ifndef _SYS_SYSPROTO_H_
struct sethostname_args {
char *hostname;
u_int len;
};
#endif
/* ARGSUSED */
int
osethostname(td, uap)
struct thread *td;
register struct sethostname_args *uap;
{
int name[2];
int error;
name[0] = CTL_KERN;
name[1] = KERN_HOSTNAME;
mtx_lock(&Giant);
error = userland_sysctl(td, name, 2, 0, 0, 0, uap->hostname,
uap->len, 0, 0);
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ogethostid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
ogethostid(td, uap)
struct thread *td;
struct ogethostid_args *uap;
{
*(long *)(td->td_retval) = hostid;
return (0);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct osethostid_args {
long hostid;
};
#endif
/* ARGSUSED */
int
osethostid(td, uap)
struct thread *td;
struct osethostid_args *uap;
{
int error;
error = priv_check(td, PRIV_SETHOSTID);
if (error)
return (error);
mtx_lock(&Giant);
hostid = uap->hostid;
mtx_unlock(&Giant);
return (0);
}
int
oquota(td, uap)
struct thread *td;
struct oquota_args *uap;
{
return (ENOSYS);
}
#endif /* COMPAT_43 */
/*
* This is the FreeBSD-1.1 compatible uname(2) interface. These days it is
* done in libc as a wrapper around a bunch of sysctl's. This must maintain
* the old 1.1 binary ABI.
*/
#if SYS_NMLN != 32
#error "FreeBSD-1.1 uname syscall has been broken"
#endif
#ifndef _SYS_SYSPROTO_H_
struct uname_args {
struct utsname *name;
};
#endif
/* ARGSUSED */
int
uname(td, uap)
struct thread *td;
struct uname_args *uap;
{
int name[2], error;
size_t len;
char *s, *us;
name[0] = CTL_KERN;
name[1] = KERN_OSTYPE;
len = sizeof (uap->name->sysname);
mtx_lock(&Giant);
error = userland_sysctl(td, name, 2, uap->name->sysname, &len,
1, 0, 0, 0, 0);
if (error)
goto done2;
subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
name[1] = KERN_HOSTNAME;
len = sizeof uap->name->nodename;
error = userland_sysctl(td, name, 2, uap->name->nodename, &len,
1, 0, 0, 0, 0);
if (error)
goto done2;
subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
name[1] = KERN_OSRELEASE;
len = sizeof uap->name->release;
error = userland_sysctl(td, name, 2, uap->name->release, &len,
1, 0, 0, 0, 0);
if (error)
goto done2;
subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
/*
name = KERN_VERSION;
len = sizeof uap->name->version;
error = userland_sysctl(td, name, 2, uap->name->version, &len,
1, 0, 0, 0, 0);
if (error)
goto done2;
subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
*/
/*
* this stupid hackery to make the version field look like FreeBSD 1.1
*/
for(s = version; *s && *s != '#'; s++);
for(us = uap->name->version; *s && *s != ':'; s++) {
error = subyte( us++, *s);
if (error)
goto done2;
}
error = subyte( us++, 0);
if (error)
goto done2;
name[0] = CTL_HW;
name[1] = HW_MACHINE;
len = sizeof uap->name->machine;
error = userland_sysctl(td, name, 2, uap->name->machine, &len,
1, 0, 0, 0, 0);
if (error)
goto done2;
subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
done2:
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getdomainname_args {
char *domainname;
int len;
};
#endif
/* ARGSUSED */
int
getdomainname(td, uap)
struct thread *td;
struct getdomainname_args *uap;
{
+ INIT_VPROCG(TD_TO_VPROCG(td));
char tmpdomainname[MAXHOSTNAMELEN];
int domainnamelen;
mtx_lock(&hostname_mtx);
bcopy(V_domainname, tmpdomainname, sizeof(tmpdomainname));
mtx_unlock(&hostname_mtx);
domainnamelen = strlen(tmpdomainname) + 1;
if ((u_int)uap->len > domainnamelen)
uap->len = domainnamelen;
return (copyout(tmpdomainname, uap->domainname, uap->len));
}
#ifndef _SYS_SYSPROTO_H_
struct setdomainname_args {
char *domainname;
int len;
};
#endif
/* ARGSUSED */
int
setdomainname(td, uap)
struct thread *td;
struct setdomainname_args *uap;
{
+ INIT_VPROCG(TD_TO_VPROCG(td));
char tmpdomainname[MAXHOSTNAMELEN];
int error, domainnamelen;
error = priv_check(td, PRIV_SETDOMAINNAME);
if (error)
return (error);
if ((u_int)uap->len > sizeof(tmpdomainname) - 1)
return (EINVAL);
domainnamelen = uap->len;
error = copyin(uap->domainname, tmpdomainname, uap->len);
if (error == 0) {
tmpdomainname[domainnamelen] = 0;
mtx_lock(&hostname_mtx);
- bcopy(tmpdomainname, V_domainname, sizeof(domainname));
+ bcopy(tmpdomainname, V_domainname, sizeof(V_domainname));
mtx_unlock(&hostname_mtx);
}
return (error);
}
Index: head/sys/net/bridgestp.c
===================================================================
--- head/sys/net/bridgestp.c (revision 183549)
+++ head/sys/net/bridgestp.c (revision 183550)
@@ -1,2260 +1,2261 @@
/* $NetBSD: bridgestp.c,v 1.5 2003/11/28 08:56:48 keihan Exp $ */
/*
* Copyright (c) 2000 Jason L. Wright (jason@thought.net)
* Copyright (c) 2006 Andrew Thompson (thompsa@FreeBSD.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp
*/
/*
* Implementation of the spanning tree protocol as defined in
* ISO/IEC 802.1D-2004, June 9, 2004.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/taskqueue.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_llc.h>
#include <net/if_media.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <net/bridgestp.h>
#ifdef BRIDGESTP_DEBUG
#define DPRINTF(fmt, arg...) printf("bstp: " fmt, ##arg)
#else
#define DPRINTF(fmt, arg...)
#endif
#define PV2ADDR(pv, eaddr) do { \
eaddr[0] = pv >> 40; \
eaddr[1] = pv >> 32; \
eaddr[2] = pv >> 24; \
eaddr[3] = pv >> 16; \
eaddr[4] = pv >> 8; \
eaddr[5] = pv >> 0; \
} while (0)
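/*
 * For example, PV2ADDR(0x8000001122334455ULL, eaddr) yields eaddr =
 * { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }: the low 48 bits are the
 * bridge MAC address, while the discarded top 16 bits carry the
 * bridge priority.
 */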
#define INFO_BETTER 1
#define INFO_SAME 0
#define INFO_WORSE -1
const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
LIST_HEAD(, bstp_state) bstp_list;
static struct mtx bstp_list_mtx;
static void bstp_transmit(struct bstp_state *, struct bstp_port *);
static void bstp_transmit_bpdu(struct bstp_state *, struct bstp_port *);
static void bstp_transmit_tcn(struct bstp_state *, struct bstp_port *);
static void bstp_decode_bpdu(struct bstp_port *, struct bstp_cbpdu *,
struct bstp_config_unit *);
static void bstp_send_bpdu(struct bstp_state *, struct bstp_port *,
struct bstp_cbpdu *);
static void bstp_enqueue(struct ifnet *, struct mbuf *);
static int bstp_pdu_flags(struct bstp_port *);
static void bstp_received_stp(struct bstp_state *, struct bstp_port *,
struct mbuf **, struct bstp_tbpdu *);
static void bstp_received_rstp(struct bstp_state *, struct bstp_port *,
struct mbuf **, struct bstp_tbpdu *);
static void bstp_received_tcn(struct bstp_state *, struct bstp_port *,
struct bstp_tcn_unit *);
static void bstp_received_bpdu(struct bstp_state *, struct bstp_port *,
struct bstp_config_unit *);
static int bstp_pdu_rcvtype(struct bstp_port *, struct bstp_config_unit *);
static int bstp_pdu_bettersame(struct bstp_port *, int);
static int bstp_info_cmp(struct bstp_pri_vector *,
struct bstp_pri_vector *);
static int bstp_info_superior(struct bstp_pri_vector *,
struct bstp_pri_vector *);
static void bstp_assign_roles(struct bstp_state *);
static void bstp_update_roles(struct bstp_state *, struct bstp_port *);
static void bstp_update_state(struct bstp_state *, struct bstp_port *);
static void bstp_update_tc(struct bstp_port *);
static void bstp_update_info(struct bstp_port *);
static void bstp_set_other_tcprop(struct bstp_port *);
static void bstp_set_all_reroot(struct bstp_state *);
static void bstp_set_all_sync(struct bstp_state *);
static void bstp_set_port_state(struct bstp_port *, int);
static void bstp_set_port_role(struct bstp_port *, int);
static void bstp_set_port_proto(struct bstp_port *, int);
static void bstp_set_port_tc(struct bstp_port *, int);
static void bstp_set_timer_tc(struct bstp_port *);
static void bstp_set_timer_msgage(struct bstp_port *);
static int bstp_rerooted(struct bstp_state *, struct bstp_port *);
static uint32_t bstp_calc_path_cost(struct bstp_port *);
static void bstp_notify_state(void *, int);
static void bstp_notify_rtage(void *, int);
static void bstp_ifupdstatus(struct bstp_state *, struct bstp_port *);
static void bstp_enable_port(struct bstp_state *, struct bstp_port *);
static void bstp_disable_port(struct bstp_state *, struct bstp_port *);
static void bstp_tick(void *);
static void bstp_timer_start(struct bstp_timer *, uint16_t);
static void bstp_timer_stop(struct bstp_timer *);
static void bstp_timer_latch(struct bstp_timer *);
static int bstp_timer_expired(struct bstp_timer *);
static void bstp_hello_timer_expiry(struct bstp_state *,
struct bstp_port *);
static void bstp_message_age_expiry(struct bstp_state *,
struct bstp_port *);
static void bstp_migrate_delay_expiry(struct bstp_state *,
struct bstp_port *);
static void bstp_edge_delay_expiry(struct bstp_state *,
struct bstp_port *);
static int bstp_addr_cmp(const uint8_t *, const uint8_t *);
static int bstp_same_bridgeid(uint64_t, uint64_t);
static void bstp_reinit(struct bstp_state *);
static void
bstp_transmit(struct bstp_state *bs, struct bstp_port *bp)
{
if (bs->bs_running == 0)
return;
/*
* a PDU can only be sent if we have tx quota left and the
* hello timer is running.
*/
if (bp->bp_hello_timer.active == 0) {
/* Test if it needs to be reset */
bstp_hello_timer_expiry(bs, bp);
return;
}
if (bp->bp_txcount > bs->bs_txholdcount)
/* Ran out of karma */
return;
if (bp->bp_protover == BSTP_PROTO_RSTP) {
bstp_transmit_bpdu(bs, bp);
bp->bp_tc_ack = 0;
} else { /* STP */
switch (bp->bp_role) {
case BSTP_ROLE_DESIGNATED:
bstp_transmit_bpdu(bs, bp);
bp->bp_tc_ack = 0;
break;
case BSTP_ROLE_ROOT:
bstp_transmit_tcn(bs, bp);
break;
}
}
bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime);
bp->bp_flags &= ~BSTP_PORT_NEWINFO;
}
static void
bstp_transmit_bpdu(struct bstp_state *bs, struct bstp_port *bp)
{
struct bstp_cbpdu bpdu;
BSTP_LOCK_ASSERT(bs);
bpdu.cbu_rootpri = htons(bp->bp_desg_pv.pv_root_id >> 48);
PV2ADDR(bp->bp_desg_pv.pv_root_id, bpdu.cbu_rootaddr);
bpdu.cbu_rootpathcost = htonl(bp->bp_desg_pv.pv_cost);
bpdu.cbu_bridgepri = htons(bp->bp_desg_pv.pv_dbridge_id >> 48);
PV2ADDR(bp->bp_desg_pv.pv_dbridge_id, bpdu.cbu_bridgeaddr);
bpdu.cbu_portid = htons(bp->bp_port_id);
bpdu.cbu_messageage = htons(bp->bp_desg_msg_age);
bpdu.cbu_maxage = htons(bp->bp_desg_max_age);
bpdu.cbu_hellotime = htons(bp->bp_desg_htime);
bpdu.cbu_forwarddelay = htons(bp->bp_desg_fdelay);
bpdu.cbu_flags = bstp_pdu_flags(bp);
switch (bp->bp_protover) {
case BSTP_PROTO_STP:
bpdu.cbu_bpdutype = BSTP_MSGTYPE_CFG;
break;
case BSTP_PROTO_RSTP:
bpdu.cbu_bpdutype = BSTP_MSGTYPE_RSTP;
break;
}
bstp_send_bpdu(bs, bp, &bpdu);
}
static void
bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp)
{
struct bstp_tbpdu bpdu;
struct ifnet *ifp = bp->bp_ifp;
struct ether_header *eh;
struct mbuf *m;
KASSERT(bp == bs->bs_root_port, ("%s: bad root port\n", __func__));
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
return;
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL)
return;
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu);
m->m_len = m->m_pkthdr.len;
eh = mtod(m, struct ether_header *);
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN);
eh->ether_type = htons(sizeof(bpdu));
bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP;
bpdu.tbu_ctl = LLC_UI;
bpdu.tbu_protoid = 0;
bpdu.tbu_protover = 0;
bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN;
memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu));
bp->bp_txcount++;
bstp_enqueue(ifp, m);
}
static void
bstp_decode_bpdu(struct bstp_port *bp, struct bstp_cbpdu *cpdu,
struct bstp_config_unit *cu)
{
int flags;
cu->cu_pv.pv_root_id =
(((uint64_t)ntohs(cpdu->cbu_rootpri)) << 48) |
(((uint64_t)cpdu->cbu_rootaddr[0]) << 40) |
(((uint64_t)cpdu->cbu_rootaddr[1]) << 32) |
(((uint64_t)cpdu->cbu_rootaddr[2]) << 24) |
(((uint64_t)cpdu->cbu_rootaddr[3]) << 16) |
(((uint64_t)cpdu->cbu_rootaddr[4]) << 8) |
(((uint64_t)cpdu->cbu_rootaddr[5]) << 0);
cu->cu_pv.pv_dbridge_id =
(((uint64_t)ntohs(cpdu->cbu_bridgepri)) << 48) |
(((uint64_t)cpdu->cbu_bridgeaddr[0]) << 40) |
(((uint64_t)cpdu->cbu_bridgeaddr[1]) << 32) |
(((uint64_t)cpdu->cbu_bridgeaddr[2]) << 24) |
(((uint64_t)cpdu->cbu_bridgeaddr[3]) << 16) |
(((uint64_t)cpdu->cbu_bridgeaddr[4]) << 8) |
(((uint64_t)cpdu->cbu_bridgeaddr[5]) << 0);
cu->cu_pv.pv_cost = ntohl(cpdu->cbu_rootpathcost);
cu->cu_message_age = ntohs(cpdu->cbu_messageage);
cu->cu_max_age = ntohs(cpdu->cbu_maxage);
cu->cu_hello_time = ntohs(cpdu->cbu_hellotime);
cu->cu_forward_delay = ntohs(cpdu->cbu_forwarddelay);
cu->cu_pv.pv_dport_id = ntohs(cpdu->cbu_portid);
cu->cu_pv.pv_port_id = bp->bp_port_id;
cu->cu_message_type = cpdu->cbu_bpdutype;
/* Strip off unused flags in STP mode */
flags = cpdu->cbu_flags;
switch (cpdu->cbu_protover) {
case BSTP_PROTO_STP:
flags &= BSTP_PDU_STPMASK;
/* A STP BPDU explicitly conveys a Designated Port */
cu->cu_role = BSTP_ROLE_DESIGNATED;
break;
case BSTP_PROTO_RSTP:
flags &= BSTP_PDU_RSTPMASK;
break;
}
cu->cu_topology_change_ack =
(flags & BSTP_PDU_F_TCA) ? 1 : 0;
cu->cu_proposal =
(flags & BSTP_PDU_F_P) ? 1 : 0;
cu->cu_agree =
(flags & BSTP_PDU_F_A) ? 1 : 0;
cu->cu_learning =
(flags & BSTP_PDU_F_L) ? 1 : 0;
cu->cu_forwarding =
(flags & BSTP_PDU_F_F) ? 1 : 0;
cu->cu_topology_change =
(flags & BSTP_PDU_F_TC) ? 1 : 0;
switch ((flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT) {
case BSTP_PDU_F_ROOT:
cu->cu_role = BSTP_ROLE_ROOT;
break;
case BSTP_PDU_F_ALT:
cu->cu_role = BSTP_ROLE_ALTERNATE;
break;
case BSTP_PDU_F_DESG:
cu->cu_role = BSTP_ROLE_DESIGNATED;
break;
}
}
static void
bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp,
struct bstp_cbpdu *bpdu)
{
struct ifnet *ifp;
struct mbuf *m;
struct ether_header *eh;
BSTP_LOCK_ASSERT(bs);
ifp = bp->bp_ifp;
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
return;
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL)
return;
eh = mtod(m, struct ether_header *);
bpdu->cbu_ssap = bpdu->cbu_dsap = LLC_8021D_LSAP;
bpdu->cbu_ctl = LLC_UI;
bpdu->cbu_protoid = htons(BSTP_PROTO_ID);
memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN);
switch (bpdu->cbu_bpdutype) {
case BSTP_MSGTYPE_CFG:
bpdu->cbu_protover = BSTP_PROTO_STP;
m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_STP_LEN;
eh->ether_type = htons(BSTP_BPDU_STP_LEN);
memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu,
BSTP_BPDU_STP_LEN);
break;
case BSTP_MSGTYPE_RSTP:
bpdu->cbu_protover = BSTP_PROTO_RSTP;
bpdu->cbu_versionlen = htons(0);
m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_RSTP_LEN;
eh->ether_type = htons(BSTP_BPDU_RSTP_LEN);
memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu,
BSTP_BPDU_RSTP_LEN);
break;
default:
panic("not implemented");
}
m->m_pkthdr.rcvif = ifp;
m->m_len = m->m_pkthdr.len;
bp->bp_txcount++;
bstp_enqueue(ifp, m);
}
static void
bstp_enqueue(struct ifnet *dst_ifp, struct mbuf *m)
{
int err = 0;
IFQ_ENQUEUE(&dst_ifp->if_snd, m, err);
if ((dst_ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
(*dst_ifp->if_start)(dst_ifp);
}
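/*
 * Build the flags octet for an outgoing BPDU from the current port
 * state. The port role is a two-bit field (BSTP_PDU_PRMASK shifted by
 * BSTP_PDU_PRSHIFT) and the rest are single-bit flags; legacy STP only
 * defines the TC and TCA bits, so the RSTP-only bits are masked off
 * when the port runs in STP mode.
 */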
static int
bstp_pdu_flags(struct bstp_port *bp)
{
int flags = 0;
if (bp->bp_proposing && bp->bp_state != BSTP_IFSTATE_FORWARDING)
flags |= BSTP_PDU_F_P;
if (bp->bp_agree)
flags |= BSTP_PDU_F_A;
if (bp->bp_tc_timer.active)
flags |= BSTP_PDU_F_TC;
if (bp->bp_tc_ack)
flags |= BSTP_PDU_F_TCA;
switch (bp->bp_state) {
case BSTP_IFSTATE_LEARNING:
flags |= BSTP_PDU_F_L;
break;
case BSTP_IFSTATE_FORWARDING:
flags |= (BSTP_PDU_F_L | BSTP_PDU_F_F);
break;
}
switch (bp->bp_role) {
case BSTP_ROLE_ROOT:
flags |=
(BSTP_PDU_F_ROOT << BSTP_PDU_PRSHIFT);
break;
case BSTP_ROLE_ALTERNATE:
case BSTP_ROLE_BACKUP: /* fall through */
flags |=
(BSTP_PDU_F_ALT << BSTP_PDU_PRSHIFT);
break;
case BSTP_ROLE_DESIGNATED:
flags |=
(BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT);
break;
}
/* Strip off unused flags in either mode */
switch (bp->bp_protover) {
case BSTP_PROTO_STP:
flags &= BSTP_PDU_STPMASK;
break;
case BSTP_PROTO_RSTP:
flags &= BSTP_PDU_RSTPMASK;
break;
}
return (flags);
}
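/*
 * Receive and validate a BPDU. The 802.3 length field (ether_type)
 * bounds the payload, the LLC SAPs and control field must match
 * 802.1D, and the protocol ID must equal BSTP_PROTO_ID. Later protocol
 * versions are accepted and treated as the highest version we
 * implement, and a version mismatch with the port only triggers
 * migration once the migration delay timer has expired. The mbuf is
 * always consumed.
 */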
struct mbuf *
bstp_input(struct bstp_port *bp, struct ifnet *ifp, struct mbuf *m)
{
struct bstp_state *bs = bp->bp_bs;
struct ether_header *eh;
struct bstp_tbpdu tpdu;
uint16_t len;
if (bp->bp_active == 0) {
m_freem(m);
return (NULL);
}
BSTP_LOCK(bs);
eh = mtod(m, struct ether_header *);
len = ntohs(eh->ether_type);
if (len < sizeof(tpdu))
goto out;
m_adj(m, ETHER_HDR_LEN);
if (m->m_pkthdr.len > len)
m_adj(m, len - m->m_pkthdr.len);
if (m->m_len < sizeof(tpdu) &&
(m = m_pullup(m, sizeof(tpdu))) == NULL)
goto out;
memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu));
/* basic packet checks */
if (tpdu.tbu_dsap != LLC_8021D_LSAP ||
tpdu.tbu_ssap != LLC_8021D_LSAP ||
tpdu.tbu_ctl != LLC_UI)
goto out;
if (tpdu.tbu_protoid != BSTP_PROTO_ID)
goto out;
/*
* We can treat later versions of the PDU as the same as the maximum
* version we implement. All additional parameters/flags are ignored.
*/
if (tpdu.tbu_protover > BSTP_PROTO_MAX)
tpdu.tbu_protover = BSTP_PROTO_MAX;
if (tpdu.tbu_protover != bp->bp_protover) {
/*
* Wait for the migration delay timer to expire before changing
* protocol version to avoid flip-flops.
*/
if (bp->bp_flags & BSTP_PORT_CANMIGRATE)
bstp_set_port_proto(bp, tpdu.tbu_protover);
else
goto out;
}
/* Clear operedge upon receiving a PDU on the port */
bp->bp_operedge = 0;
bstp_timer_start(&bp->bp_edge_delay_timer,
BSTP_DEFAULT_MIGRATE_DELAY);
switch (tpdu.tbu_protover) {
case BSTP_PROTO_STP:
bstp_received_stp(bs, bp, &m, &tpdu);
break;
case BSTP_PROTO_RSTP:
bstp_received_rstp(bs, bp, &m, &tpdu);
break;
}
out:
BSTP_UNLOCK(bs);
if (m)
m_freem(m);
return (NULL);
}
static void
bstp_received_stp(struct bstp_state *bs, struct bstp_port *bp,
struct mbuf **mp, struct bstp_tbpdu *tpdu)
{
struct bstp_cbpdu cpdu;
struct bstp_config_unit *cu = &bp->bp_msg_cu;
struct bstp_tcn_unit tu;
switch (tpdu->tbu_bpdutype) {
case BSTP_MSGTYPE_TCN:
tu.tu_message_type = tpdu->tbu_bpdutype;
bstp_received_tcn(bs, bp, &tu);
break;
case BSTP_MSGTYPE_CFG:
if ((*mp)->m_len < BSTP_BPDU_STP_LEN &&
(*mp = m_pullup(*mp, BSTP_BPDU_STP_LEN)) == NULL)
return;
memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_STP_LEN);
bstp_decode_bpdu(bp, &cpdu, cu);
bstp_received_bpdu(bs, bp, cu);
break;
}
}
static void
bstp_received_rstp(struct bstp_state *bs, struct bstp_port *bp,
struct mbuf **mp, struct bstp_tbpdu *tpdu)
{
struct bstp_cbpdu cpdu;
struct bstp_config_unit *cu = &bp->bp_msg_cu;
if (tpdu->tbu_bpdutype != BSTP_MSGTYPE_RSTP)
return;
if ((*mp)->m_len < BSTP_BPDU_RSTP_LEN &&
(*mp = m_pullup(*mp, BSTP_BPDU_RSTP_LEN)) == NULL)
return;
memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_RSTP_LEN);
bstp_decode_bpdu(bp, &cpdu, cu);
bstp_received_bpdu(bs, bp, cu);
}
static void
bstp_received_tcn(struct bstp_state *bs, struct bstp_port *bp,
struct bstp_tcn_unit *tcn)
{
bp->bp_rcvdtcn = 1;
bstp_update_tc(bp);
}
static void
bstp_received_bpdu(struct bstp_state *bs, struct bstp_port *bp,
struct bstp_config_unit *cu)
{
int type;
BSTP_LOCK_ASSERT(bs);
/* We need to have transitioned to INFO_MINE before proceeding */
switch (bp->bp_infois) {
case BSTP_INFO_DISABLED:
case BSTP_INFO_AGED:
return;
}
type = bstp_pdu_rcvtype(bp, cu);
switch (type) {
case BSTP_PDU_SUPERIOR:
bs->bs_allsynced = 0;
bp->bp_agreed = 0;
bp->bp_proposing = 0;
if (cu->cu_proposal && cu->cu_forwarding == 0)
bp->bp_proposed = 1;
if (cu->cu_topology_change)
bp->bp_rcvdtc = 1;
if (cu->cu_topology_change_ack)
bp->bp_rcvdtca = 1;
if (bp->bp_agree &&
!bstp_pdu_bettersame(bp, BSTP_INFO_RECEIVED))
bp->bp_agree = 0;
/* copy the received priority and timers to the port */
bp->bp_port_pv = cu->cu_pv;
bp->bp_port_msg_age = cu->cu_message_age;
bp->bp_port_max_age = cu->cu_max_age;
bp->bp_port_fdelay = cu->cu_forward_delay;
bp->bp_port_htime =
(cu->cu_hello_time > BSTP_MIN_HELLO_TIME ?
cu->cu_hello_time : BSTP_MIN_HELLO_TIME);
/* set expiry for the new info */
bstp_set_timer_msgage(bp);
bp->bp_infois = BSTP_INFO_RECEIVED;
bstp_assign_roles(bs);
break;
case BSTP_PDU_REPEATED:
if (cu->cu_proposal && cu->cu_forwarding == 0)
bp->bp_proposed = 1;
if (cu->cu_topology_change)
bp->bp_rcvdtc = 1;
if (cu->cu_topology_change_ack)
bp->bp_rcvdtca = 1;
/* rearm the age timer */
bstp_set_timer_msgage(bp);
break;
case BSTP_PDU_INFERIOR:
if (cu->cu_learning) {
bp->bp_agreed = 1;
bp->bp_proposing = 0;
}
break;
case BSTP_PDU_INFERIORALT:
/*
* only point-to-point links allow fast
* transitions to forwarding.
*/
if (cu->cu_agree && bp->bp_ptp_link) {
bp->bp_agreed = 1;
bp->bp_proposing = 0;
} else
bp->bp_agreed = 0;
if (cu->cu_topology_change)
bp->bp_rcvdtc = 1;
if (cu->cu_topology_change_ack)
bp->bp_rcvdtca = 1;
break;
case BSTP_PDU_OTHER:
return; /* do nothing */
}
/* update the state machines with the new data */
bstp_update_state(bs, bp);
}
static int
bstp_pdu_rcvtype(struct bstp_port *bp, struct bstp_config_unit *cu)
{
int type;
/* default return type */
type = BSTP_PDU_OTHER;
switch (cu->cu_role) {
case BSTP_ROLE_DESIGNATED:
if (bstp_info_superior(&bp->bp_port_pv, &cu->cu_pv))
/* bpdu priority is superior */
type = BSTP_PDU_SUPERIOR;
else if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) ==
INFO_SAME) {
if (bp->bp_port_msg_age != cu->cu_message_age ||
bp->bp_port_max_age != cu->cu_max_age ||
bp->bp_port_fdelay != cu->cu_forward_delay ||
bp->bp_port_htime != cu->cu_hello_time)
/* bpdu priority is equal and timers differ */
type = BSTP_PDU_SUPERIOR;
else
/* bpdu is equal */
type = BSTP_PDU_REPEATED;
} else
/* bpdu priority is worse */
type = BSTP_PDU_INFERIOR;
break;
case BSTP_ROLE_ROOT:
case BSTP_ROLE_ALTERNATE:
case BSTP_ROLE_BACKUP:
if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) <= INFO_SAME)
/*
* not a designated port and priority is the same or
* worse
*/
type = BSTP_PDU_INFERIORALT;
break;
}
return (type);
}
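/*
 * Return 1 if the candidate vector for the given info source (the
 * received message vector for RECEIVED, our designated vector for
 * MINE) is better than or the same as the vector currently held by the
 * port, and the port's info actually came from that source.
 */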
static int
bstp_pdu_bettersame(struct bstp_port *bp, int newinfo)
{
if (newinfo == BSTP_INFO_RECEIVED &&
bp->bp_infois == BSTP_INFO_RECEIVED &&
bstp_info_cmp(&bp->bp_port_pv, &bp->bp_msg_cu.cu_pv) >= INFO_SAME)
return (1);
if (newinfo == BSTP_INFO_MINE &&
bp->bp_infois == BSTP_INFO_MINE &&
bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) >= INFO_SAME)
return (1);
return (0);
}
static int
bstp_info_cmp(struct bstp_pri_vector *pv,
struct bstp_pri_vector *cpv)
{
if (cpv->pv_root_id < pv->pv_root_id)
return (INFO_BETTER);
if (cpv->pv_root_id > pv->pv_root_id)
return (INFO_WORSE);
if (cpv->pv_cost < pv->pv_cost)
return (INFO_BETTER);
if (cpv->pv_cost > pv->pv_cost)
return (INFO_WORSE);
if (cpv->pv_dbridge_id < pv->pv_dbridge_id)
return (INFO_BETTER);
if (cpv->pv_dbridge_id > pv->pv_dbridge_id)
return (INFO_WORSE);
if (cpv->pv_dport_id < pv->pv_dport_id)
return (INFO_BETTER);
if (cpv->pv_dport_id > pv->pv_dport_id)
return (INFO_WORSE);
return (INFO_SAME);
}
/*
* This message priority vector is superior to the port priority vector and
* will replace it if, and only if, the message priority vector is better than
* the port priority vector, or the message has been transmitted from the same
* designated bridge and designated port as the port priority vector.
*/
static int
bstp_info_superior(struct bstp_pri_vector *pv,
struct bstp_pri_vector *cpv)
{
if (bstp_info_cmp(pv, cpv) == INFO_BETTER ||
(bstp_same_bridgeid(pv->pv_dbridge_id, cpv->pv_dbridge_id) &&
(cpv->pv_dport_id & 0xfff) == (pv->pv_dport_id & 0xfff)))
return (1);
return (0);
}
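/*
 * Elect the root bridge and assign a role to every port. The root
 * priority vector is the best of our own bridge vector and all
 * received root path vectors (receiving port cost added); each port's
 * designated vector is then derived from it and compared against the
 * port's stored info to choose between the root, designated, alternate
 * and backup roles.
 */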
static void
bstp_assign_roles(struct bstp_state *bs)
{
struct bstp_port *bp, *rbp = NULL;
struct bstp_pri_vector pv;
/* default to our priority vector */
bs->bs_root_pv = bs->bs_bridge_pv;
bs->bs_root_msg_age = 0;
bs->bs_root_max_age = bs->bs_bridge_max_age;
bs->bs_root_fdelay = bs->bs_bridge_fdelay;
bs->bs_root_htime = bs->bs_bridge_htime;
bs->bs_root_port = NULL;
/* check if any received info supersedes us */
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
if (bp->bp_infois != BSTP_INFO_RECEIVED)
continue;
pv = bp->bp_port_pv;
pv.pv_cost += bp->bp_path_cost;
/*
* The root priority vector is the best of the set comprising
* the bridge priority vector plus all root path priority
* vectors whose bridge address is not equal to us.
*/
if (bstp_same_bridgeid(pv.pv_dbridge_id,
bs->bs_bridge_pv.pv_dbridge_id) == 0 &&
bstp_info_cmp(&bs->bs_root_pv, &pv) == INFO_BETTER) {
/* the port vector replaces the root */
bs->bs_root_pv = pv;
bs->bs_root_msg_age = bp->bp_port_msg_age +
BSTP_MESSAGE_AGE_INCR;
bs->bs_root_max_age = bp->bp_port_max_age;
bs->bs_root_fdelay = bp->bp_port_fdelay;
bs->bs_root_htime = bp->bp_port_htime;
rbp = bp;
}
}
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
/* calculate the port designated vector */
bp->bp_desg_pv.pv_root_id = bs->bs_root_pv.pv_root_id;
bp->bp_desg_pv.pv_cost = bs->bs_root_pv.pv_cost;
bp->bp_desg_pv.pv_dbridge_id = bs->bs_bridge_pv.pv_dbridge_id;
bp->bp_desg_pv.pv_dport_id = bp->bp_port_id;
bp->bp_desg_pv.pv_port_id = bp->bp_port_id;
/* calculate designated times */
bp->bp_desg_msg_age = bs->bs_root_msg_age;
bp->bp_desg_max_age = bs->bs_root_max_age;
bp->bp_desg_fdelay = bs->bs_root_fdelay;
bp->bp_desg_htime = bs->bs_bridge_htime;
switch (bp->bp_infois) {
case BSTP_INFO_DISABLED:
bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
break;
case BSTP_INFO_AGED:
bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED);
bstp_update_info(bp);
break;
case BSTP_INFO_MINE:
bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED);
/* update the port info if stale */
if (bstp_info_cmp(&bp->bp_port_pv,
&bp->bp_desg_pv) != INFO_SAME ||
(rbp != NULL &&
(bp->bp_port_msg_age != rbp->bp_port_msg_age ||
bp->bp_port_max_age != rbp->bp_port_max_age ||
bp->bp_port_fdelay != rbp->bp_port_fdelay ||
bp->bp_port_htime != rbp->bp_port_htime)))
bstp_update_info(bp);
break;
case BSTP_INFO_RECEIVED:
if (bp == rbp) {
/*
* root priority is derived from this
* port, make it the root port.
*/
bstp_set_port_role(bp, BSTP_ROLE_ROOT);
bs->bs_root_port = bp;
} else if (bstp_info_cmp(&bp->bp_port_pv,
&bp->bp_desg_pv) == INFO_BETTER) {
/*
* the port priority is lower than the root
* port.
*/
bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED);
bstp_update_info(bp);
} else {
if (bstp_same_bridgeid(
bp->bp_port_pv.pv_dbridge_id,
bs->bs_bridge_pv.pv_dbridge_id)) {
/*
* the designated bridge refers to
* another port on this bridge.
*/
bstp_set_port_role(bp,
BSTP_ROLE_BACKUP);
} else {
/*
* the port is an inferior path to the
* root bridge.
*/
bstp_set_port_role(bp,
BSTP_ROLE_ALTERNATE);
}
}
break;
}
}
}
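/*
 * Re-run the per-port state machines after new information arrives.
 * allsynced is recomputed first: the bridge is fully synced only when
 * every port is either marked synced or is the root port itself.
 */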
static void
bstp_update_state(struct bstp_state *bs, struct bstp_port *bp)
{
struct bstp_port *bp2;
int synced;
BSTP_LOCK_ASSERT(bs);
/* check if all the ports have syncronised again */
if (!bs->bs_allsynced) {
synced = 1;
LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) {
if (!(bp2->bp_synced ||
bp2->bp_role == BSTP_ROLE_ROOT)) {
synced = 0;
break;
}
}
bs->bs_allsynced = synced;
}
bstp_update_roles(bs, bp);
bstp_update_tc(bp);
}
static void
bstp_update_roles(struct bstp_state *bs, struct bstp_port *bp)
{
switch (bp->bp_role) {
case BSTP_ROLE_DISABLED:
/* Clear any flags if set */
if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) {
bp->bp_sync = 0;
bp->bp_synced = 1;
bp->bp_reroot = 0;
}
break;
case BSTP_ROLE_ALTERNATE:
case BSTP_ROLE_BACKUP:
if ((bs->bs_allsynced && !bp->bp_agree) ||
(bp->bp_proposed && bp->bp_agree)) {
bp->bp_proposed = 0;
bp->bp_agree = 1;
bp->bp_flags |= BSTP_PORT_NEWINFO;
DPRINTF("%s -> ALTERNATE_AGREED\n",
bp->bp_ifp->if_xname);
}
if (bp->bp_proposed && !bp->bp_agree) {
bstp_set_all_sync(bs);
bp->bp_proposed = 0;
DPRINTF("%s -> ALTERNATE_PROPOSED\n",
bp->bp_ifp->if_xname);
}
/* Clear any flags if set */
if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) {
bp->bp_sync = 0;
bp->bp_synced = 1;
bp->bp_reroot = 0;
DPRINTF("%s -> ALTERNATE_PORT\n", bp->bp_ifp->if_xname);
}
break;
case BSTP_ROLE_ROOT:
if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_reroot) {
bstp_set_all_reroot(bs);
DPRINTF("%s -> ROOT_REROOT\n", bp->bp_ifp->if_xname);
}
if ((bs->bs_allsynced && !bp->bp_agree) ||
(bp->bp_proposed && bp->bp_agree)) {
bp->bp_proposed = 0;
bp->bp_sync = 0;
bp->bp_agree = 1;
bp->bp_flags |= BSTP_PORT_NEWINFO;
DPRINTF("%s -> ROOT_AGREED\n", bp->bp_ifp->if_xname);
}
if (bp->bp_proposed && !bp->bp_agree) {
bstp_set_all_sync(bs);
bp->bp_proposed = 0;
DPRINTF("%s -> ROOT_PROPOSED\n", bp->bp_ifp->if_xname);
}
if (bp->bp_state != BSTP_IFSTATE_FORWARDING &&
(bp->bp_forward_delay_timer.active == 0 ||
(bstp_rerooted(bs, bp) &&
bp->bp_recent_backup_timer.active == 0 &&
bp->bp_protover == BSTP_PROTO_RSTP))) {
switch (bp->bp_state) {
case BSTP_IFSTATE_DISCARDING:
bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING);
break;
case BSTP_IFSTATE_LEARNING:
bstp_set_port_state(bp,
BSTP_IFSTATE_FORWARDING);
break;
}
}
if (bp->bp_state == BSTP_IFSTATE_FORWARDING && bp->bp_reroot) {
bp->bp_reroot = 0;
DPRINTF("%s -> ROOT_REROOTED\n", bp->bp_ifp->if_xname);
}
break;
case BSTP_ROLE_DESIGNATED:
if (bp->bp_recent_root_timer.active == 0 && bp->bp_reroot) {
bp->bp_reroot = 0;
DPRINTF("%s -> DESIGNATED_RETIRED\n",
bp->bp_ifp->if_xname);
}
if ((bp->bp_state == BSTP_IFSTATE_DISCARDING &&
!bp->bp_synced) || (bp->bp_agreed && !bp->bp_synced) ||
(bp->bp_operedge && !bp->bp_synced) ||
(bp->bp_sync && bp->bp_synced)) {
bstp_timer_stop(&bp->bp_recent_root_timer);
bp->bp_synced = 1;
bp->bp_sync = 0;
DPRINTF("%s -> DESIGNATED_SYNCED\n",
bp->bp_ifp->if_xname);
}
if (bp->bp_state != BSTP_IFSTATE_FORWARDING &&
!bp->bp_agreed && !bp->bp_proposing &&
!bp->bp_operedge) {
bp->bp_proposing = 1;
bp->bp_flags |= BSTP_PORT_NEWINFO;
bstp_timer_start(&bp->bp_edge_delay_timer,
(bp->bp_ptp_link ? BSTP_DEFAULT_MIGRATE_DELAY :
bp->bp_desg_max_age));
DPRINTF("%s -> DESIGNATED_PROPOSE\n",
bp->bp_ifp->if_xname);
}
if (bp->bp_state != BSTP_IFSTATE_FORWARDING &&
(bp->bp_forward_delay_timer.active == 0 || bp->bp_agreed ||
bp->bp_operedge) &&
(bp->bp_recent_root_timer.active == 0 || !bp->bp_reroot) &&
!bp->bp_sync) {
if (bp->bp_agreed)
DPRINTF("%s -> AGREED\n", bp->bp_ifp->if_xname);
/*
* If agreed|operedge then go straight to forwarding,
* otherwise follow discard -> learn -> forward.
*/
if (bp->bp_agreed || bp->bp_operedge ||
bp->bp_state == BSTP_IFSTATE_LEARNING) {
bstp_set_port_state(bp,
BSTP_IFSTATE_FORWARDING);
bp->bp_agreed = bp->bp_protover;
} else if (bp->bp_state == BSTP_IFSTATE_DISCARDING)
bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING);
}
if (((bp->bp_sync && !bp->bp_synced) ||
(bp->bp_reroot && bp->bp_recent_root_timer.active) ||
(bp->bp_flags & BSTP_PORT_DISPUTED)) && !bp->bp_operedge &&
bp->bp_state != BSTP_IFSTATE_DISCARDING) {
bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
bp->bp_flags &= ~BSTP_PORT_DISPUTED;
bstp_timer_start(&bp->bp_forward_delay_timer,
bp->bp_protover == BSTP_PROTO_RSTP ?
bp->bp_desg_htime : bp->bp_desg_fdelay);
DPRINTF("%s -> DESIGNATED_DISCARD\n",
bp->bp_ifp->if_xname);
}
break;
}
if (bp->bp_flags & BSTP_PORT_NEWINFO)
bstp_transmit(bs, bp);
}
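/*
 * Topology change state machine. The DETECTED, TCN, TC, PROPAG and ACK
 * states do their work in bstp_set_port_tc() and return to ACTIVE
 * unconditionally, so reaching one of them here indicates a bug.
 */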
static void
bstp_update_tc(struct bstp_port *bp)
{
switch (bp->bp_tcstate) {
case BSTP_TCSTATE_ACTIVE:
if ((bp->bp_role != BSTP_ROLE_DESIGNATED &&
bp->bp_role != BSTP_ROLE_ROOT) || bp->bp_operedge)
bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING);
if (bp->bp_rcvdtcn)
bstp_set_port_tc(bp, BSTP_TCSTATE_TCN);
if (bp->bp_rcvdtc)
bstp_set_port_tc(bp, BSTP_TCSTATE_TC);
if (bp->bp_tc_prop && !bp->bp_operedge)
bstp_set_port_tc(bp, BSTP_TCSTATE_PROPAG);
if (bp->bp_rcvdtca)
bstp_set_port_tc(bp, BSTP_TCSTATE_ACK);
break;
case BSTP_TCSTATE_INACTIVE:
if ((bp->bp_state == BSTP_IFSTATE_LEARNING ||
bp->bp_state == BSTP_IFSTATE_FORWARDING) &&
bp->bp_fdbflush == 0)
bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING);
break;
case BSTP_TCSTATE_LEARNING:
if (bp->bp_rcvdtc || bp->bp_rcvdtcn || bp->bp_rcvdtca ||
bp->bp_tc_prop)
bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING);
else if (bp->bp_role != BSTP_ROLE_DESIGNATED &&
bp->bp_role != BSTP_ROLE_ROOT &&
bp->bp_state == BSTP_IFSTATE_DISCARDING)
bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE);
if ((bp->bp_role == BSTP_ROLE_DESIGNATED ||
bp->bp_role == BSTP_ROLE_ROOT) &&
bp->bp_state == BSTP_IFSTATE_FORWARDING &&
!bp->bp_operedge)
bstp_set_port_tc(bp, BSTP_TCSTATE_DETECTED);
break;
/* these are transient states and go straight back to ACTIVE */
case BSTP_TCSTATE_DETECTED:
case BSTP_TCSTATE_TCN:
case BSTP_TCSTATE_TC:
case BSTP_TCSTATE_PROPAG:
case BSTP_TCSTATE_ACK:
DPRINTF("Invalid TC state for %s\n",
bp->bp_ifp->if_xname);
break;
}
}
static void
bstp_update_info(struct bstp_port *bp)
{
struct bstp_state *bs = bp->bp_bs;
bp->bp_proposing = 0;
bp->bp_proposed = 0;
if (bp->bp_agreed && !bstp_pdu_bettersame(bp, BSTP_INFO_MINE))
bp->bp_agreed = 0;
if (bp->bp_synced && !bp->bp_agreed) {
bp->bp_synced = 0;
bs->bs_allsynced = 0;
}
/* copy the designated pv to the port */
bp->bp_port_pv = bp->bp_desg_pv;
bp->bp_port_msg_age = bp->bp_desg_msg_age;
bp->bp_port_max_age = bp->bp_desg_max_age;
bp->bp_port_fdelay = bp->bp_desg_fdelay;
bp->bp_port_htime = bp->bp_desg_htime;
bp->bp_infois = BSTP_INFO_MINE;
/* Set transmit flag but do not immediately send */
bp->bp_flags |= BSTP_PORT_NEWINFO;
}
/* set tcprop on every port other than the caller */
static void
bstp_set_other_tcprop(struct bstp_port *bp)
{
struct bstp_state *bs = bp->bp_bs;
struct bstp_port *bp2;
BSTP_LOCK_ASSERT(bs);
LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) {
if (bp2 == bp)
continue;
bp2->bp_tc_prop = 1;
}
}
static void
bstp_set_all_reroot(struct bstp_state *bs)
{
struct bstp_port *bp;
BSTP_LOCK_ASSERT(bs);
LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
bp->bp_reroot = 1;
}
static void
bstp_set_all_sync(struct bstp_state *bs)
{
struct bstp_port *bp;
BSTP_LOCK_ASSERT(bs);
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
bp->bp_sync = 1;
bp->bp_synced = 0; /* Not explicit in spec */
}
bs->bs_allsynced = 0;
}
static void
bstp_set_port_state(struct bstp_port *bp, int state)
{
if (bp->bp_state == state)
return;
bp->bp_state = state;
switch (bp->bp_state) {
case BSTP_IFSTATE_DISCARDING:
DPRINTF("state changed to DISCARDING on %s\n",
bp->bp_ifp->if_xname);
break;
case BSTP_IFSTATE_LEARNING:
DPRINTF("state changed to LEARNING on %s\n",
bp->bp_ifp->if_xname);
bstp_timer_start(&bp->bp_forward_delay_timer,
bp->bp_protover == BSTP_PROTO_RSTP ?
bp->bp_desg_htime : bp->bp_desg_fdelay);
break;
case BSTP_IFSTATE_FORWARDING:
DPRINTF("state changed to FORWARDING on %s\n",
bp->bp_ifp->if_xname);
bstp_timer_stop(&bp->bp_forward_delay_timer);
/* Record that we enabled forwarding */
bp->bp_forward_transitions++;
break;
}
/* notify the parent bridge */
taskqueue_enqueue(taskqueue_swi, &bp->bp_statetask);
}
static void
bstp_set_port_role(struct bstp_port *bp, int role)
{
struct bstp_state *bs = bp->bp_bs;
if (bp->bp_role == role)
return;
/* perform pre-change tasks */
switch (bp->bp_role) {
case BSTP_ROLE_DISABLED:
bstp_timer_start(&bp->bp_forward_delay_timer,
bp->bp_desg_max_age);
break;
case BSTP_ROLE_BACKUP:
bstp_timer_start(&bp->bp_recent_backup_timer,
bp->bp_desg_htime * 2);
/* fall through */
case BSTP_ROLE_ALTERNATE:
bstp_timer_start(&bp->bp_forward_delay_timer,
bp->bp_desg_fdelay);
bp->bp_sync = 0;
bp->bp_synced = 1;
bp->bp_reroot = 0;
break;
case BSTP_ROLE_ROOT:
bstp_timer_start(&bp->bp_recent_root_timer,
BSTP_DEFAULT_FORWARD_DELAY);
break;
}
bp->bp_role = role;
/* clear values not carried between roles */
bp->bp_proposing = 0;
bs->bs_allsynced = 0;
/* initialise the new role */
switch (bp->bp_role) {
case BSTP_ROLE_DISABLED:
case BSTP_ROLE_ALTERNATE:
case BSTP_ROLE_BACKUP:
DPRINTF("%s role -> ALT/BACK/DISABLED\n",
bp->bp_ifp->if_xname);
bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
bstp_timer_stop(&bp->bp_recent_root_timer);
bstp_timer_latch(&bp->bp_forward_delay_timer);
bp->bp_sync = 0;
bp->bp_synced = 1;
bp->bp_reroot = 0;
break;
case BSTP_ROLE_ROOT:
DPRINTF("%s role -> ROOT\n",
bp->bp_ifp->if_xname);
bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
bstp_timer_latch(&bp->bp_recent_root_timer);
bp->bp_proposing = 0;
break;
case BSTP_ROLE_DESIGNATED:
DPRINTF("%s role -> DESIGNATED\n",
bp->bp_ifp->if_xname);
bstp_timer_start(&bp->bp_hello_timer,
bp->bp_desg_htime);
bp->bp_agree = 0;
break;
}
/* let the TC state know that the role changed */
bstp_update_tc(bp);
}
static void
bstp_set_port_proto(struct bstp_port *bp, int proto)
{
struct bstp_state *bs = bp->bp_bs;
/* supported protocol versions */
switch (proto) {
case BSTP_PROTO_STP:
/* we can downgrade protocols only */
bstp_timer_stop(&bp->bp_migrate_delay_timer);
/* clear unsupported features */
bp->bp_operedge = 0;
/* STP compat mode only uses 16 bits of the 32 */
if (bp->bp_path_cost > 65535)
bp->bp_path_cost = 65535;
break;
case BSTP_PROTO_RSTP:
bstp_timer_start(&bp->bp_migrate_delay_timer,
bs->bs_migration_delay);
break;
default:
DPRINTF("Unsupported STP version %d\n", proto);
return;
}
bp->bp_protover = proto;
bp->bp_flags &= ~BSTP_PORT_CANMIGRATE;
}
static void
bstp_set_port_tc(struct bstp_port *bp, int state)
{
struct bstp_state *bs = bp->bp_bs;
bp->bp_tcstate = state;
/* initialise the new state */
switch (bp->bp_tcstate) {
case BSTP_TCSTATE_ACTIVE:
DPRINTF("%s -> TC_ACTIVE\n", bp->bp_ifp->if_xname);
/* nothing to do */
break;
case BSTP_TCSTATE_INACTIVE:
bstp_timer_stop(&bp->bp_tc_timer);
/* flush routes on the parent bridge */
bp->bp_fdbflush = 1;
taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask);
bp->bp_tc_ack = 0;
DPRINTF("%s -> TC_INACTIVE\n", bp->bp_ifp->if_xname);
break;
case BSTP_TCSTATE_LEARNING:
bp->bp_rcvdtc = 0;
bp->bp_rcvdtcn = 0;
bp->bp_rcvdtca = 0;
bp->bp_tc_prop = 0;
DPRINTF("%s -> TC_LEARNING\n", bp->bp_ifp->if_xname);
break;
case BSTP_TCSTATE_DETECTED:
bstp_set_timer_tc(bp);
bstp_set_other_tcprop(bp);
/* send out notification */
bp->bp_flags |= BSTP_PORT_NEWINFO;
bstp_transmit(bs, bp);
getmicrotime(&bs->bs_last_tc_time);
DPRINTF("%s -> TC_DETECTED\n", bp->bp_ifp->if_xname);
bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
break;
case BSTP_TCSTATE_TCN:
bstp_set_timer_tc(bp);
DPRINTF("%s -> TC_TCN\n", bp->bp_ifp->if_xname);
/* fall through */
case BSTP_TCSTATE_TC:
bp->bp_rcvdtc = 0;
bp->bp_rcvdtcn = 0;
if (bp->bp_role == BSTP_ROLE_DESIGNATED)
bp->bp_tc_ack = 1;
bstp_set_other_tcprop(bp);
DPRINTF("%s -> TC_TC\n", bp->bp_ifp->if_xname);
bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
break;
case BSTP_TCSTATE_PROPAG:
/* flush routes on the parent bridge */
bp->bp_fdbflush = 1;
taskqueue_enqueue(taskqueue_swi, &bp->bp_rtagetask);
bp->bp_tc_prop = 0;
bstp_set_timer_tc(bp);
DPRINTF("%s -> TC_PROPAG\n", bp->bp_ifp->if_xname);
bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
break;
case BSTP_TCSTATE_ACK:
bstp_timer_stop(&bp->bp_tc_timer);
bp->bp_rcvdtca = 0;
DPRINTF("%s -> TC_ACK\n", bp->bp_ifp->if_xname);
bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */
break;
}
}
static void
bstp_set_timer_tc(struct bstp_port *bp)
{
struct bstp_state *bs = bp->bp_bs;
if (bp->bp_tc_timer.active)
return;
switch (bp->bp_protover) {
case BSTP_PROTO_RSTP:
bstp_timer_start(&bp->bp_tc_timer,
bp->bp_desg_htime + BSTP_TICK_VAL);
bp->bp_flags |= BSTP_PORT_NEWINFO;
break;
case BSTP_PROTO_STP:
bstp_timer_start(&bp->bp_tc_timer,
bs->bs_root_max_age + bs->bs_root_fdelay);
break;
}
}
static void
bstp_set_timer_msgage(struct bstp_port *bp)
{
if (bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR <=
bp->bp_port_max_age) {
bstp_timer_start(&bp->bp_message_age_timer,
bp->bp_port_htime * 3);
} else
/* expires immediately */
bstp_timer_start(&bp->bp_message_age_timer, 0);
}
static int
bstp_rerooted(struct bstp_state *bs, struct bstp_port *bp)
{
struct bstp_port *bp2;
int rr_set = 0;
LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) {
if (bp2 == bp)
continue;
if (bp2->bp_recent_root_timer.active) {
rr_set = 1;
break;
}
}
return (!rr_set);
}
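/*
 * The timer setters below (hello time, forward delay, max age) take
 * values in seconds, convert them to BSTP_TICK_VAL units and
 * re-initialise the bridge so the new parameter takes effect on every
 * port. The hello time can only be changed in legacy STP mode since
 * RSTP fixes it.
 */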
int
bstp_set_htime(struct bstp_state *bs, int t)
{
/* convert seconds to ticks */
t *= BSTP_TICK_VAL;
/* value can only be changed in legacy STP mode */
if (bs->bs_protover != BSTP_PROTO_STP)
return (EPERM);
if (t < BSTP_MIN_HELLO_TIME || t > BSTP_MAX_HELLO_TIME)
return (EINVAL);
BSTP_LOCK(bs);
bs->bs_bridge_htime = t;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_fdelay(struct bstp_state *bs, int t)
{
/* convert seconds to ticks */
t *= BSTP_TICK_VAL;
if (t < BSTP_MIN_FORWARD_DELAY || t > BSTP_MAX_FORWARD_DELAY)
return (EINVAL);
BSTP_LOCK(bs);
bs->bs_bridge_fdelay = t;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_maxage(struct bstp_state *bs, int t)
{
/* convert seconds to ticks */
t *= BSTP_TICK_VAL;
if (t < BSTP_MIN_MAX_AGE || t > BSTP_MAX_MAX_AGE)
return (EINVAL);
BSTP_LOCK(bs);
bs->bs_bridge_max_age = t;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_holdcount(struct bstp_state *bs, int count)
{
struct bstp_port *bp;
if (count < BSTP_MIN_HOLD_COUNT ||
count > BSTP_MAX_HOLD_COUNT)
return (EINVAL);
BSTP_LOCK(bs);
bs->bs_txholdcount = count;
LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
bp->bp_txcount = 0;
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_protocol(struct bstp_state *bs, int proto)
{
struct bstp_port *bp;
switch (proto) {
/* Supported protocol versions */
case BSTP_PROTO_STP:
case BSTP_PROTO_RSTP:
break;
default:
return (EINVAL);
}
BSTP_LOCK(bs);
bs->bs_protover = proto;
bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME;
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
/* reinit state */
bp->bp_infois = BSTP_INFO_DISABLED;
bp->bp_txcount = 0;
bstp_set_port_proto(bp, bs->bs_protover);
bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE);
bstp_timer_stop(&bp->bp_recent_backup_timer);
}
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_priority(struct bstp_state *bs, int pri)
{
if (pri < 0 || pri > BSTP_MAX_PRIORITY)
return (EINVAL);
/* Limit to steps of 4096 */
pri -= pri % 4096;
BSTP_LOCK(bs);
bs->bs_bridge_priority = pri;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_port_priority(struct bstp_port *bp, int pri)
{
struct bstp_state *bs = bp->bp_bs;
if (pri < 0 || pri > BSTP_MAX_PORT_PRIORITY)
return (EINVAL);
/* Limit to steps of 16 */
pri -= pri % 16;
BSTP_LOCK(bs);
bp->bp_priority = pri;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_path_cost(struct bstp_port *bp, uint32_t path_cost)
{
struct bstp_state *bs = bp->bp_bs;
if (path_cost > BSTP_MAX_PATH_COST)
return (EINVAL);
/* STP compat mode only uses 16 bits of the 32 */
if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535)
path_cost = 65535;
BSTP_LOCK(bs);
if (path_cost == 0) { /* use auto */
bp->bp_flags &= ~BSTP_PORT_ADMCOST;
bp->bp_path_cost = bstp_calc_path_cost(bp);
} else {
bp->bp_path_cost = path_cost;
bp->bp_flags |= BSTP_PORT_ADMCOST;
}
bstp_reinit(bs);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_edge(struct bstp_port *bp, int set)
{
struct bstp_state *bs = bp->bp_bs;
BSTP_LOCK(bs);
if ((bp->bp_operedge = set) == 0)
bp->bp_flags &= ~BSTP_PORT_ADMEDGE;
else
bp->bp_flags |= BSTP_PORT_ADMEDGE;
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_autoedge(struct bstp_port *bp, int set)
{
struct bstp_state *bs = bp->bp_bs;
BSTP_LOCK(bs);
if (set) {
bp->bp_flags |= BSTP_PORT_AUTOEDGE;
/* we may be able to transition straight to edge */
if (bp->bp_edge_delay_timer.active == 0)
bstp_edge_delay_expiry(bs, bp);
} else
bp->bp_flags &= ~BSTP_PORT_AUTOEDGE;
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_ptp(struct bstp_port *bp, int set)
{
struct bstp_state *bs = bp->bp_bs;
BSTP_LOCK(bs);
bp->bp_ptp_link = set;
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_set_autoptp(struct bstp_port *bp, int set)
{
struct bstp_state *bs = bp->bp_bs;
BSTP_LOCK(bs);
if (set) {
bp->bp_flags |= BSTP_PORT_AUTOPTP;
if (bp->bp_role != BSTP_ROLE_DISABLED)
bstp_ifupdstatus(bs, bp);
} else
bp->bp_flags &= ~BSTP_PORT_AUTOPTP;
BSTP_UNLOCK(bs);
return (0);
}
/*
* Calculate the path cost according to the link speed.
*/
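/*
 * For example, if_baudrate = 100000000 (100 Mb/s) yields
 * 20000000000 / (100000000 / 1000) = 200000 and 1 Gb/s yields 20000,
 * matching the recommended path cost values in IEEE Std 802.1D-2004.
 */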
static uint32_t
bstp_calc_path_cost(struct bstp_port *bp)
{
struct ifnet *ifp = bp->bp_ifp;
uint32_t path_cost;
/* If the cost has been administratively set then retain the value */
if (bp->bp_flags & BSTP_PORT_ADMCOST)
return (bp->bp_path_cost);
if (ifp->if_link_state == LINK_STATE_DOWN) {
/* Recalc when the link comes up again */
bp->bp_flags |= BSTP_PORT_PNDCOST;
return (BSTP_DEFAULT_PATH_COST);
}
if (ifp->if_baudrate < 1000)
return (BSTP_DEFAULT_PATH_COST);
/* formula from section 17.14, IEEE Std 802.1D-2004 */
path_cost = 20000000000ULL / (ifp->if_baudrate / 1000);
if (path_cost > BSTP_MAX_PATH_COST)
path_cost = BSTP_MAX_PATH_COST;
/* STP compat mode only uses 16 bits of the 32 */
if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535)
path_cost = 65535;
return (path_cost);
}
/*
* Notify the bridge that a port state has changed; we need to do this from a
* taskqueue to avoid a LOR.
*/
static void
bstp_notify_state(void *arg, int pending)
{
struct bstp_port *bp = (struct bstp_port *)arg;
struct bstp_state *bs = bp->bp_bs;
if (bp->bp_active == 1 && bs->bs_state_cb != NULL)
(*bs->bs_state_cb)(bp->bp_ifp, bp->bp_state);
}
/*
* Flush the routes on the bridge port; we need to do this from a
* taskqueue to avoid a LOR.
*/
static void
bstp_notify_rtage(void *arg, int pending)
{
struct bstp_port *bp = (struct bstp_port *)arg;
struct bstp_state *bs = bp->bp_bs;
int age = 0;
BSTP_LOCK(bs);
switch (bp->bp_protover) {
case BSTP_PROTO_STP:
/* convert to seconds */
age = bp->bp_desg_fdelay / BSTP_TICK_VAL;
break;
case BSTP_PROTO_RSTP:
age = 0;
break;
}
BSTP_UNLOCK(bs);
if (bp->bp_active == 1 && bs->bs_rtage_cb != NULL)
(*bs->bs_rtage_cb)(bp->bp_ifp, age);
/* flush is complete */
BSTP_LOCK(bs);
bp->bp_fdbflush = 0;
BSTP_UNLOCK(bs);
}
void
bstp_linkstate(struct ifnet *ifp, int state)
{
struct bstp_state *bs;
struct bstp_port *bp;
/* search for the stp port */
mtx_lock(&bstp_list_mtx);
LIST_FOREACH(bs, &bstp_list, bs_list) {
BSTP_LOCK(bs);
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
if (bp->bp_ifp == ifp) {
bstp_ifupdstatus(bs, bp);
bstp_update_state(bs, bp);
/* it only exists once so return */
BSTP_UNLOCK(bs);
mtx_unlock(&bstp_list_mtx);
return;
}
}
BSTP_UNLOCK(bs);
}
mtx_unlock(&bstp_list_mtx);
}
static void
bstp_ifupdstatus(struct bstp_state *bs, struct bstp_port *bp)
{
struct ifnet *ifp = bp->bp_ifp;
struct ifmediareq ifmr;
int error = 0;
BSTP_LOCK_ASSERT(bs);
bzero((char *)&ifmr, sizeof(ifmr));
error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr);
if ((error == 0) && (ifp->if_flags & IFF_UP)) {
if (ifmr.ifm_status & IFM_ACTIVE) {
/* A full-duplex link is assumed to be point to point */
if (bp->bp_flags & BSTP_PORT_AUTOPTP) {
bp->bp_ptp_link =
ifmr.ifm_active & IFM_FDX ? 1 : 0;
}
/* Calc the cost if the link was down previously */
if (bp->bp_flags & BSTP_PORT_PNDCOST) {
bp->bp_path_cost = bstp_calc_path_cost(bp);
bp->bp_flags &= ~BSTP_PORT_PNDCOST;
}
if (bp->bp_role == BSTP_ROLE_DISABLED)
bstp_enable_port(bs, bp);
} else {
if (bp->bp_role != BSTP_ROLE_DISABLED) {
bstp_disable_port(bs, bp);
if ((bp->bp_flags & BSTP_PORT_ADMEDGE) &&
bp->bp_protover == BSTP_PROTO_RSTP)
bp->bp_operedge = 1;
}
}
return;
}
if (bp->bp_infois != BSTP_INFO_DISABLED)
bstp_disable_port(bs, bp);
}
static void
bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp)
{
bp->bp_infois = BSTP_INFO_AGED;
bstp_assign_roles(bs);
}
static void
bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp)
{
bp->bp_infois = BSTP_INFO_DISABLED;
bstp_assign_roles(bs);
}
static void
bstp_tick(void *arg)
{
struct bstp_state *bs = arg;
struct bstp_port *bp;
BSTP_LOCK_ASSERT(bs);
if (bs->bs_running == 0)
return;
/* slow timer to catch missed link events */
if (bstp_timer_expired(&bs->bs_link_timer)) {
LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
bstp_ifupdstatus(bs, bp);
bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER);
}
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
/* no events need to happen for these */
bstp_timer_expired(&bp->bp_tc_timer);
bstp_timer_expired(&bp->bp_recent_root_timer);
bstp_timer_expired(&bp->bp_forward_delay_timer);
bstp_timer_expired(&bp->bp_recent_backup_timer);
if (bstp_timer_expired(&bp->bp_hello_timer))
bstp_hello_timer_expiry(bs, bp);
if (bstp_timer_expired(&bp->bp_message_age_timer))
bstp_message_age_expiry(bs, bp);
if (bstp_timer_expired(&bp->bp_migrate_delay_timer))
bstp_migrate_delay_expiry(bs, bp);
if (bstp_timer_expired(&bp->bp_edge_delay_timer))
bstp_edge_delay_expiry(bs, bp);
/* update the various state machines for the port */
bstp_update_state(bs, bp);
if (bp->bp_txcount > 0)
bp->bp_txcount--;
}
callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs);
}
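/*
 * Timer helpers: values are stored in BSTP_TICK_VAL units and counted
 * down by bstp_tick() once per second. A latched timer remains active
 * but never expires until it is explicitly restarted or stopped.
 */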
static void
bstp_timer_start(struct bstp_timer *t, uint16_t v)
{
t->value = v;
t->active = 1;
t->latched = 0;
}
static void
bstp_timer_stop(struct bstp_timer *t)
{
t->value = 0;
t->active = 0;
t->latched = 0;
}
static void
bstp_timer_latch(struct bstp_timer *t)
{
t->latched = 1;
t->active = 1;
}
static int
bstp_timer_expired(struct bstp_timer *t)
{
if (t->active == 0 || t->latched)
return (0);
t->value -= BSTP_TICK_VAL;
if (t->value <= 0) {
bstp_timer_stop(t);
return (1);
}
return (0);
}
static void
bstp_hello_timer_expiry(struct bstp_state *bs, struct bstp_port *bp)
{
if ((bp->bp_flags & BSTP_PORT_NEWINFO) ||
bp->bp_role == BSTP_ROLE_DESIGNATED ||
(bp->bp_role == BSTP_ROLE_ROOT &&
bp->bp_tc_timer.active == 1)) {
bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime);
bp->bp_flags |= BSTP_PORT_NEWINFO;
bstp_transmit(bs, bp);
}
}
static void
bstp_message_age_expiry(struct bstp_state *bs, struct bstp_port *bp)
{
if (bp->bp_infois == BSTP_INFO_RECEIVED) {
bp->bp_infois = BSTP_INFO_AGED;
bstp_assign_roles(bs);
DPRINTF("aged info on %s\n", bp->bp_ifp->if_xname);
}
}
static void
bstp_migrate_delay_expiry(struct bstp_state *bs, struct bstp_port *bp)
{
bp->bp_flags |= BSTP_PORT_CANMIGRATE;
}
static void
bstp_edge_delay_expiry(struct bstp_state *bs, struct bstp_port *bp)
{
if ((bp->bp_flags & BSTP_PORT_AUTOEDGE) &&
bp->bp_protover == BSTP_PROTO_RSTP && bp->bp_proposing &&
bp->bp_role == BSTP_ROLE_DESIGNATED) {
bp->bp_operedge = 1;
DPRINTF("%s -> edge port\n", bp->bp_ifp->if_xname);
}
}
static int
bstp_addr_cmp(const uint8_t *a, const uint8_t *b)
{
int i, d;
for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) {
d = ((int)a[i]) - ((int)b[i]);
}
return (d);
}
/*
* compare the bridge address component of the bridgeid
*/
static int
bstp_same_bridgeid(uint64_t id1, uint64_t id2)
{
u_char addr1[ETHER_ADDR_LEN];
u_char addr2[ETHER_ADDR_LEN];
PV2ADDR(id1, addr1);
PV2ADDR(id2, addr2);
if (bstp_addr_cmp(addr1, addr2) == 0)
return (1);
return (0);
}
void
bstp_reinit(struct bstp_state *bs)
{
+ INIT_VNET_NET(curvnet);
struct bstp_port *bp;
struct ifnet *ifp, *mif;
u_char *e_addr;
static const u_char llzero[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */
BSTP_LOCK_ASSERT(bs);
mif = NULL;
/*
* Search through the Ethernet adapters and find the one with the
* numerically lowest MAC address. The adapter we take the MAC address
* from does not need to be part of the bridge; it just needs to
* provide a unique value.
*/
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifp->if_type != IFT_ETHER)
continue;
if (bstp_addr_cmp(IF_LLADDR(ifp), llzero) == 0)
continue;
if (mif == NULL) {
mif = ifp;
continue;
}
if (bstp_addr_cmp(IF_LLADDR(ifp), IF_LLADDR(mif)) < 0) {
mif = ifp;
continue;
}
}
IFNET_RUNLOCK();
if (LIST_EMPTY(&bs->bs_bplist) || mif == NULL) {
/* Set the bridge and root id (lower bits) to zero */
bs->bs_bridge_pv.pv_dbridge_id =
((uint64_t)bs->bs_bridge_priority) << 48;
bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id;
bs->bs_root_pv = bs->bs_bridge_pv;
/* Disable any remaining ports, they will have no MAC address */
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
bp->bp_infois = BSTP_INFO_DISABLED;
bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
}
callout_stop(&bs->bs_bstpcallout);
return;
}
e_addr = IF_LLADDR(mif);
bs->bs_bridge_pv.pv_dbridge_id =
(((uint64_t)bs->bs_bridge_priority) << 48) |
(((uint64_t)e_addr[0]) << 40) |
(((uint64_t)e_addr[1]) << 32) |
(((uint64_t)e_addr[2]) << 24) |
(((uint64_t)e_addr[3]) << 16) |
(((uint64_t)e_addr[4]) << 8) |
(((uint64_t)e_addr[5]));
bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id;
bs->bs_bridge_pv.pv_cost = 0;
bs->bs_bridge_pv.pv_dport_id = 0;
bs->bs_bridge_pv.pv_port_id = 0;
if (bs->bs_running && callout_pending(&bs->bs_bstpcallout) == 0)
callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs);
LIST_FOREACH(bp, &bs->bs_bplist, bp_next) {
bp->bp_port_id = (bp->bp_priority << 8) |
(bp->bp_ifp->if_index & 0xfff);
bstp_ifupdstatus(bs, bp);
}
bstp_assign_roles(bs);
bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER);
}
static int
bstp_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
mtx_init(&bstp_list_mtx, "bridgestp list", NULL, MTX_DEF);
LIST_INIT(&bstp_list);
bstp_linkstate_p = bstp_linkstate;
break;
case MOD_UNLOAD:
bstp_linkstate_p = NULL;
mtx_destroy(&bstp_list_mtx);
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t bstp_mod = {
"bridgestp",
bstp_modevent,
0
};
DECLARE_MODULE(bridgestp, bstp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(bridgestp, 1);
void
bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb)
{
BSTP_LOCK_INIT(bs);
callout_init_mtx(&bs->bs_bstpcallout, &bs->bs_mtx, 0);
LIST_INIT(&bs->bs_bplist);
bs->bs_bridge_max_age = BSTP_DEFAULT_MAX_AGE;
bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME;
bs->bs_bridge_fdelay = BSTP_DEFAULT_FORWARD_DELAY;
bs->bs_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY;
bs->bs_hold_time = BSTP_DEFAULT_HOLD_TIME;
bs->bs_migration_delay = BSTP_DEFAULT_MIGRATE_DELAY;
bs->bs_txholdcount = BSTP_DEFAULT_HOLD_COUNT;
bs->bs_protover = BSTP_PROTO_RSTP;
bs->bs_state_cb = cb->bcb_state;
bs->bs_rtage_cb = cb->bcb_rtage;
getmicrotime(&bs->bs_last_tc_time);
mtx_lock(&bstp_list_mtx);
LIST_INSERT_HEAD(&bstp_list, bs, bs_list);
mtx_unlock(&bstp_list_mtx);
}
void
bstp_detach(struct bstp_state *bs)
{
KASSERT(LIST_EMPTY(&bs->bs_bplist), ("bstp still active"));
mtx_lock(&bstp_list_mtx);
LIST_REMOVE(bs, bs_list);
mtx_unlock(&bstp_list_mtx);
callout_drain(&bs->bs_bstpcallout);
BSTP_LOCK_DESTROY(bs);
}
void
bstp_init(struct bstp_state *bs)
{
BSTP_LOCK(bs);
callout_reset(&bs->bs_bstpcallout, hz, bstp_tick, bs);
bs->bs_running = 1;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
}
void
bstp_stop(struct bstp_state *bs)
{
struct bstp_port *bp;
BSTP_LOCK(bs);
LIST_FOREACH(bp, &bs->bs_bplist, bp_next)
bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
bs->bs_running = 0;
callout_stop(&bs->bs_bstpcallout);
BSTP_UNLOCK(bs);
}
int
bstp_create(struct bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp)
{
bzero(bp, sizeof(struct bstp_port));
BSTP_LOCK(bs);
bp->bp_ifp = ifp;
bp->bp_bs = bs;
bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY;
TASK_INIT(&bp->bp_statetask, 0, bstp_notify_state, bp);
TASK_INIT(&bp->bp_rtagetask, 0, bstp_notify_rtage, bp);
/* Init state */
bp->bp_infois = BSTP_INFO_DISABLED;
bp->bp_flags = BSTP_PORT_AUTOEDGE|BSTP_PORT_AUTOPTP;
bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING);
bstp_set_port_proto(bp, bs->bs_protover);
bstp_set_port_role(bp, BSTP_ROLE_DISABLED);
bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE);
bp->bp_path_cost = bstp_calc_path_cost(bp);
BSTP_UNLOCK(bs);
return (0);
}
int
bstp_enable(struct bstp_port *bp)
{
struct bstp_state *bs = bp->bp_bs;
struct ifnet *ifp = bp->bp_ifp;
KASSERT(bp->bp_active == 0, ("already a bstp member"));
switch (ifp->if_type) {
case IFT_ETHER: /* These can do spanning tree. */
break;
default:
/* Nothing else can. */
return (EINVAL);
}
BSTP_LOCK(bs);
LIST_INSERT_HEAD(&bs->bs_bplist, bp, bp_next);
bp->bp_active = 1;
bp->bp_flags |= BSTP_PORT_NEWINFO;
bstp_reinit(bs);
bstp_update_roles(bs, bp);
BSTP_UNLOCK(bs);
return (0);
}
void
bstp_disable(struct bstp_port *bp)
{
struct bstp_state *bs = bp->bp_bs;
KASSERT(bp->bp_active == 1, ("not a bstp member"));
BSTP_LOCK(bs);
bstp_disable_port(bs, bp);
LIST_REMOVE(bp, bp_next);
bp->bp_active = 0;
bstp_reinit(bs);
BSTP_UNLOCK(bs);
}
/*
* The bstp_port structure is about to be freed by the parent bridge.
*/
void
bstp_destroy(struct bstp_port *bp)
{
KASSERT(bp->bp_active == 0, ("port is still attached"));
taskqueue_drain(taskqueue_swi, &bp->bp_statetask);
taskqueue_drain(taskqueue_swi, &bp->bp_rtagetask);
}
Index: head/sys/net/if.c
===================================================================
--- head/sys/net/if.c (revision 183549)
+++ head/sys/net/if.c (revision 183550)
@@ -1,2813 +1,2849 @@
/*-
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
* $FreeBSD$
*/
#include "opt_compat.h"
#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_carp.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/bus.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/sockio.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/domain.h>
#include <sys/jail.h>
#include <sys/vimage.h>
#include <machine/stdarg.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/radix.h>
#include <net/route.h>
#if defined(INET) || defined(INET6)
/*XXX*/
#include <netinet/in.h>
#include <netinet/in_var.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#endif
#endif
#ifdef INET
#include <netinet/if_ether.h>
#endif
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#include <security/mac/mac_framework.h>
SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
/* Log link state change events */
static int log_link_state_change = 1;
SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
&log_link_state_change, 0,
"log interface link state change events");
void (*bstp_linkstate_p)(struct ifnet *ifp, int state);
void (*ng_ether_link_state_p)(struct ifnet *ifp, int state);
void (*lagg_linkstate_p)(struct ifnet *ifp, int state);
struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
/*
* XXX: Style; these should be sorted alphabetically, and unprototyped
* static functions should be prototyped. Currently they are sorted by
* declaration order.
*/
static void if_attachdomain(void *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, caddr_t);
static void if_freemulti(struct ifmultiaddr *);
static void if_grow(void);
static void if_init(void *);
static void if_qflush(struct ifaltq *);
static void if_route(struct ifnet *, int flag, int fam);
static int if_setflag(struct ifnet *, int, int, int *, int);
static void if_slowtimo(void *);
static void if_unroute(struct ifnet *, int flag, int fam);
static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int if_rtdel(struct radix_node *, void *);
static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *);
static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
static void if_start_deferred(void *context, int pending);
static void do_link_state_change(void *, int);
static int if_getgroup(struct ifgroupreq *, struct ifnet *);
static int if_getgroupmembers(struct ifgroupreq *);
#ifdef INET6
/*
* XXX: declared here to avoid including many inet6-related files;
* should this be more generalized?
*/
extern void nd6_setmtu(struct ifnet *);
#endif
int if_index = 0;
int ifqmaxlen = IFQ_MAXLEN;
struct ifnethead ifnet; /* depend on static init XXX */
struct ifgrouphead ifg_head;
struct mtx ifnet_lock;
static if_com_alloc_t *if_com_alloc[256];
static if_com_free_t *if_com_free[256];
static int if_indexlim = 8;
static struct knlist ifklist;
/*
* Table of ifnet/cdev by index. Locked with ifnet_lock.
*/
static struct ifindex_entry *ifindex_table = NULL;
static void filt_netdetach(struct knote *kn);
static int filt_netdev(struct knote *kn, long hint);
static struct filterops netdev_filtops =
{ 1, NULL, filt_netdetach, filt_netdev };
/*
* System initialization
*/
SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL);
SYSINIT(interface_check, SI_SUB_PROTO_IF, SI_ORDER_FIRST, if_slowtimo, NULL);
MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
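/*
 * The V_ prefixed globals used below are virtualized when the kernel
 * is built with VIMAGE; INIT_VNET_NET(curvnet) resolves them to the
 * network stack instance of the current vnet, and collapses to the
 * plain globals otherwise.
 */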
struct ifnet *
ifnet_byindex(u_short idx)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
IFNET_RLOCK();
ifp = V_ifindex_table[idx].ife_ifnet;
IFNET_RUNLOCK();
return (ifp);
}
static void
ifnet_setbyindex(u_short idx, struct ifnet *ifp)
{
+ INIT_VNET_NET(curvnet);
IFNET_WLOCK_ASSERT();
V_ifindex_table[idx].ife_ifnet = ifp;
}
struct ifaddr *
ifaddr_byindex(u_short idx)
{
+ INIT_VNET_NET(curvnet);
struct ifaddr *ifa;
IFNET_RLOCK();
ifa = ifnet_byindex(idx)->if_addr;
IFNET_RUNLOCK();
return (ifa);
}
struct cdev *
ifdev_byindex(u_short idx)
{
+ INIT_VNET_NET(curvnet);
struct cdev *cdev;
IFNET_RLOCK();
cdev = V_ifindex_table[idx].ife_dev;
IFNET_RUNLOCK();
return (cdev);
}
static void
ifdev_setbyindex(u_short idx, struct cdev *cdev)
{
+ INIT_VNET_NET(curvnet);
IFNET_WLOCK();
V_ifindex_table[idx].ife_dev = cdev;
IFNET_WUNLOCK();
}
static d_open_t netopen;
static d_close_t netclose;
static d_ioctl_t netioctl;
static d_kqfilter_t netkqfilter;
static struct cdevsw net_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_NEEDGIANT,
.d_open = netopen,
.d_close = netclose,
.d_ioctl = netioctl,
.d_name = "net",
.d_kqfilter = netkqfilter,
};
static int
netopen(struct cdev *dev, int flag, int mode, struct thread *td)
{
return (0);
}
static int
netclose(struct cdev *dev, int flags, int fmt, struct thread *td)
{
return (0);
}
static int
netioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
struct ifnet *ifp;
int error, idx;
/* only support interface specific ioctls */
if (IOCGROUP(cmd) != 'i')
return (EOPNOTSUPP);
idx = dev2unit(dev);
if (idx == 0) {
/*
* special network device, not interface.
*/
if (cmd == SIOCGIFCONF)
return (ifconf(cmd, data)); /* XXX remove cmd */
#ifdef __amd64__
if (cmd == SIOCGIFCONF32)
return (ifconf(cmd, data)); /* XXX remove cmd */
#endif
return (EOPNOTSUPP);
}
ifp = ifnet_byindex(idx);
if (ifp == NULL)
return (ENXIO);
error = ifhwioctl(cmd, ifp, data, td);
if (error == ENOIOCTL)
error = EOPNOTSUPP;
return (error);
}
static int
netkqfilter(struct cdev *dev, struct knote *kn)
{
+ INIT_VNET_NET(curvnet);
struct knlist *klist;
struct ifnet *ifp;
int idx;
switch (kn->kn_filter) {
case EVFILT_NETDEV:
kn->kn_fop = &netdev_filtops;
break;
default:
return (EINVAL);
}
idx = dev2unit(dev);
if (idx == 0) {
klist = &V_ifklist;
} else {
ifp = ifnet_byindex(idx);
if (ifp == NULL)
return (1);
klist = &ifp->if_klist;
}
kn->kn_hook = (caddr_t)klist;
knlist_add(klist, kn, 0);
return (0);
}
static void
filt_netdetach(struct knote *kn)
{
struct knlist *klist = (struct knlist *)kn->kn_hook;
knlist_remove(klist, kn, 0);
}
static int
filt_netdev(struct knote *kn, long hint)
{
struct knlist *klist = (struct knlist *)kn->kn_hook;
/*
* Currently NOTE_EXIT is abused to indicate device detach.
*/
if (hint == NOTE_EXIT) {
kn->kn_data = NOTE_LINKINV;
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
knlist_remove_inevent(klist, kn);
return (1);
}
if (hint != 0)
kn->kn_data = hint; /* current status */
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
return (kn->kn_fflags != 0);
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
/* ARGSUSED*/
static void
if_init(void *dummy __unused)
{
+ INIT_VNET_NET(curvnet);
IFNET_LOCK_INIT();
TAILQ_INIT(&V_ifnet);
TAILQ_INIT(&V_ifg_head);
knlist_init(&V_ifklist, NULL, NULL, NULL, NULL);
if_grow(); /* create initial table */
ifdev_setbyindex(0, make_dev(&net_cdevsw, 0, UID_ROOT, GID_WHEEL,
0600, "network"));
if_clone_init();
}
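/*
 * Double the ifindex table. V_if_indexlim is doubled before the copy,
 * so n/2 below is the byte size of the old table.
 */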
static void
if_grow(void)
{
+ INIT_VNET_NET(curvnet);
u_int n;
struct ifindex_entry *e;
V_if_indexlim <<= 1;
n = V_if_indexlim * sizeof(*e);
e = malloc(n, M_IFNET, M_WAITOK | M_ZERO);
if (V_ifindex_table != NULL) {
memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2);
free((caddr_t)V_ifindex_table, M_IFNET);
}
V_ifindex_table = e;
}
/*
* Allocate a struct ifnet and an index for an interface. A layer 2
* common structure will also be allocated if an allocation routine is
* registered for the passed type.
*/
struct ifnet*
if_alloc(u_char type)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO);
/*
* Try to find an empty slot below if_index. If we fail, take
* the next slot.
*
* XXX: should be locked!
*/
for (ifp->if_index = 1; ifp->if_index <= V_if_index; ifp->if_index++) {
if (ifnet_byindex(ifp->if_index) == NULL)
break;
}
/* Catch if_index overflow. */
if (ifp->if_index < 1) {
free(ifp, M_IFNET);
return (NULL);
}
if (ifp->if_index > V_if_index)
V_if_index = ifp->if_index;
if (V_if_index >= V_if_indexlim)
if_grow();
ifp->if_type = type;
if (if_com_alloc[type] != NULL) {
ifp->if_l2com = if_com_alloc[type](type, ifp);
if (ifp->if_l2com == NULL) {
free(ifp, M_IFNET);
return (NULL);
}
}
IFNET_WLOCK();
ifnet_setbyindex(ifp->if_index, ifp);
IFNET_WUNLOCK();
IF_ADDR_LOCK_INIT(ifp);
return (ifp);
}
/*
* Free the struct ifnet, the associated index, and the layer 2 common
* structure if needed. All the work is done in if_free_type().
*
* Do not add code to this function! Add it to if_free_type().
*/
void
if_free(struct ifnet *ifp)
{
if_free_type(ifp, ifp->if_type);
}
/*
* Do the actual work of freeing a struct ifnet, associated index, and
* layer 2 common structure. This version should only be called by
* interfaces that switch their type after calling if_alloc().
*/
void
if_free_type(struct ifnet *ifp, u_char type)
{
+ INIT_VNET_NET(curvnet); /* ifp->if_vnet can be NULL here ! */
if (ifp != ifnet_byindex(ifp->if_index)) {
if_printf(ifp, "%s: value was not if_alloced, skipping\n",
__func__);
return;
}
IFNET_WLOCK();
ifnet_setbyindex(ifp->if_index, NULL);
/* XXX: should be locked with if_findindex() */
while (V_if_index > 0 && ifnet_byindex(V_if_index) == NULL)
V_if_index--;
IFNET_WUNLOCK();
if (if_com_free[type] != NULL)
if_com_free[type](ifp->if_l2com, type);
IF_ADDR_LOCK_DESTROY(ifp);
free(ifp, M_IFNET);
}
/*
* Perform generic interface initialization tasks and attach the interface
* to the list of "active" interfaces.
*
* XXX:
* - The decision to return void and thus require this function to
* succeed is questionable.
* - We do more initialization here than is probably a good idea.
* Some of this should probably move to if_alloc().
* - We should probably do more sanity checking. For instance we don't
* do anything to ensure if_xname is unique or non-empty.
*/
void
if_attach(struct ifnet *ifp)
{
+ INIT_VNET_NET(curvnet);
unsigned socksize, ifasize;
int namelen, masklen;
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index))
panic ("%s: BUG: if_attach called without if_alloc'd input()\n",
ifp->if_xname);
TASK_INIT(&ifp->if_starttask, 0, if_start_deferred, ifp);
TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
IF_AFDATA_LOCK_INIT(ifp);
ifp->if_afdata_initialized = 0;
TAILQ_INIT(&ifp->if_addrhead);
TAILQ_INIT(&ifp->if_prefixhead);
TAILQ_INIT(&ifp->if_multiaddrs);
TAILQ_INIT(&ifp->if_groups);
if_addgroup(ifp, IFG_ALL);
knlist_init(&ifp->if_klist, NULL, NULL, NULL, NULL);
getmicrotime(&ifp->if_lastchange);
ifp->if_data.ifi_epoch = time_uptime;
ifp->if_data.ifi_datalen = sizeof(struct if_data);
#ifdef MAC
mac_ifnet_init(ifp);
mac_ifnet_create(ifp);
#endif
ifdev_setbyindex(ifp->if_index, make_dev(&net_cdevsw,
ifp->if_index, UID_ROOT, GID_WHEEL, 0600, "%s/%s",
net_cdevsw.d_name, ifp->if_xname));
make_dev_alias(ifdev_byindex(ifp->if_index), "%s%d",
net_cdevsw.d_name, ifp->if_index);
mtx_init(&ifp->if_snd.ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
/*
* create a Link Level name for this device
*/
namelen = strlen(ifp->if_xname);
/*
* Always save enough space for any possible name so we can do
* a rename in place later.
*/
masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
socksize = masklen + ifp->if_addrlen;
if (socksize < sizeof(*sdl))
socksize = sizeof(*sdl);
socksize = roundup2(socksize, sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
IFA_LOCK_INIT(ifa);
sdl = (struct sockaddr_dl *)(ifa + 1);
sdl->sdl_len = socksize;
sdl->sdl_family = AF_LINK;
bcopy(ifp->if_xname, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = ifp->if_type;
ifp->if_addr = ifa;
ifa->ifa_ifp = ifp;
ifa->ifa_rtrequest = link_rtrequest;
ifa->ifa_addr = (struct sockaddr *)sdl;
sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
ifa->ifa_netmask = (struct sockaddr *)sdl;
sdl->sdl_len = masklen;
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
ifa->ifa_refcnt = 1;
TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
ifp->if_broadcastaddr = NULL; /* reliably crash if used uninitialized */
/*
* XXX: why do we warn about this? We're correcting it and most
* drivers just set the value the way we do.
*/
if (ifp->if_snd.ifq_maxlen == 0) {
if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
ifp->if_snd.ifq_maxlen = ifqmaxlen;
}
ifp->if_snd.altq_type = 0;
ifp->if_snd.altq_disc = NULL;
ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
ifp->if_snd.altq_tbr = NULL;
ifp->if_snd.altq_ifp = ifp;
IFNET_WLOCK();
TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
IFNET_WUNLOCK();
if (domain_init_status >= 2)
if_attachdomain1(ifp);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
if (ifp->if_watchdog != NULL)
if_printf(ifp,
"WARNING: using obsoleted if_watchdog interface\n");
if (ifp->if_flags & IFF_NEEDSGIANT)
if_printf(ifp,
"WARNING: using obsoleted IFF_NEEDSGIANT flag\n");
}
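/*
 * Illustrative sketch (not part of this revision): the usual driver-side
 * sequence around if_attach().  The foo_* names and softc layout are
 * hypothetical; for Ethernet drivers, ether_ifattach() performs the final
 * if_attach() call.
 */
#ifdef EXAMPLE_ONLY
static int
foo_ifattach(device_t dev, struct foo_softc *sc, uint8_t *eaddr)
{
	struct ifnet *ifp;

	ifp = sc->foo_ifp = if_alloc(IFT_ETHER);	/* reserves if_index */
	if (ifp == NULL)
		return (ENOSPC);
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = foo_init;
	ifp->if_ioctl = foo_ioctl;
	ifp->if_start = foo_start;
	/* Set ifq_maxlen up front to avoid the XXX warning above. */
	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
	ether_ifattach(ifp, eaddr);		/* ends up in if_attach() */
	return (0);
}
#endif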
static void
if_attachdomain(void *dummy)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
int s;
s = splnet();
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
if_attachdomain1(ifp);
splx(s);
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
if_attachdomain, NULL);
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
int s;
s = splnet();
/*
* Since dp->dom_ifattach calls malloc() with M_WAITOK, we
* cannot lock ifp->if_afdata initialization, entirely.
*/
if (IF_AFDATA_TRYLOCK(ifp) == 0) {
splx(s);
return;
}
if (ifp->if_afdata_initialized >= domain_init_status) {
IF_AFDATA_UNLOCK(ifp);
splx(s);
printf("if_attachdomain called more than once on %s\n",
ifp->if_xname);
return;
}
ifp->if_afdata_initialized = domain_init_status;
IF_AFDATA_UNLOCK(ifp);
/* address family dependent data region */
bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_ifattach)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
splx(s);
}
/*
* Remove any unicast or broadcast network addresses from an interface.
*/
void
if_purgeaddrs(struct ifnet *ifp)
{
struct ifaddr *ifa, *next;
TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) {
if (ifa->ifa_addr->sa_family == AF_LINK)
continue;
#ifdef INET
/* XXX: Ugly!! ad hoc just for INET */
if (ifa->ifa_addr->sa_family == AF_INET) {
struct ifaliasreq ifr;
bzero(&ifr, sizeof(ifr));
ifr.ifra_addr = *ifa->ifa_addr;
if (ifa->ifa_dstaddr)
ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
NULL) == 0)
continue;
}
#endif /* INET */
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6) {
in6_purgeaddr(ifa);
/* ifp_addrhead is already updated */
continue;
}
#endif /* INET6 */
TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
IFAFREE(ifa);
}
}
/*
* Remove any multicast network addresses from an interface.
*/
void
if_purgemaddrs(struct ifnet *ifp)
{
struct ifmultiaddr *ifma;
struct ifmultiaddr *next;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
if_delmulti_locked(ifp, ifma, 1);
IF_ADDR_UNLOCK(ifp);
}
/*
* Detach an interface, removing it from the
* list of "active" interfaces.
*
* XXXRW: There are some significant questions about event ordering, and
* how to prevent things from starting to use the interface during detach.
*/
void
if_detach(struct ifnet *ifp)
{
+ INIT_VNET_NET(ifp->if_vnet);
struct ifaddr *ifa;
struct radix_node_head *rnh;
int s;
int i;
struct domain *dp;
struct ifnet *iter;
int found = 0;
IFNET_WLOCK();
TAILQ_FOREACH(iter, &V_ifnet, if_link)
if (iter == ifp) {
TAILQ_REMOVE(&V_ifnet, ifp, if_link);
found = 1;
break;
}
IFNET_WUNLOCK();
if (!found)
return;
/*
* Remove/wait for pending events.
*/
taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
/*
* Remove routes and flush queues.
*/
s = splnet();
if_down(ifp);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
if_purgeaddrs(ifp);
#ifdef INET
in_ifdetach(ifp);
#endif
#ifdef INET6
/*
* Remove all IPv6 kernel structs related to ifp. This should be done
* before removing routing entries below, since IPv6 interface direct
* routes are expected to be removed by the IPv6-specific kernel API.
* Otherwise, the kernel will detect the inconsistency and complain about it.
*/
in6_ifdetach(ifp);
#endif
if_purgemaddrs(ifp);
/*
* Remove link ifaddr pointer and maybe decrement if_index.
* Clean up all addresses.
*/
ifp->if_addr = NULL;
destroy_dev(ifdev_byindex(ifp->if_index));
ifdev_setbyindex(ifp->if_index, NULL);
/* We can now free link ifaddr. */
if (!TAILQ_EMPTY(&ifp->if_addrhead)) {
ifa = TAILQ_FIRST(&ifp->if_addrhead);
TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link);
IFAFREE(ifa);
}
/*
* Delete all remaining routes using this interface
* Unfortunately the only way to do this is to slog through
* the entire routing table looking for routes which point
* to this interface...oh well...
*/
for (i = 1; i <= AF_MAX; i++) {
int j;
for (j = 0; j < rt_numfibs; j++) {
if ((rnh = V_rt_tables[j][i]) == NULL)
continue;
RADIX_NODE_HEAD_LOCK(rnh);
(void) rnh->rnh_walktree(rnh, if_rtdel, ifp);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
}
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
IF_AFDATA_LOCK(ifp);
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
(*dp->dom_ifdetach)(ifp,
ifp->if_afdata[dp->dom_family]);
}
IF_AFDATA_UNLOCK(ifp);
#ifdef MAC
mac_ifnet_destroy(ifp);
#endif /* MAC */
KNOTE_UNLOCKED(&ifp->if_klist, NOTE_EXIT);
knlist_clear(&ifp->if_klist, 0);
knlist_destroy(&ifp->if_klist);
mtx_destroy(&ifp->if_snd.ifq_mtx);
IF_AFDATA_DESTROY(ifp);
splx(s);
}
/*
* Add a group to an interface
*/
int
if_addgroup(struct ifnet *ifp, const char *groupname)
{
+ INIT_VNET_NET(ifp->if_vnet);
struct ifg_list *ifgl;
struct ifg_group *ifg = NULL;
struct ifg_member *ifgm;
if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
groupname[strlen(groupname) - 1] <= '9')
return (EINVAL);
IFNET_WLOCK();
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
IFNET_WUNLOCK();
return (EEXIST);
}
if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP,
M_NOWAIT)) == NULL) {
IFNET_WUNLOCK();
return (ENOMEM);
}
if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member),
M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, groupname))
break;
if (ifg == NULL) {
if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group),
M_TEMP, M_NOWAIT)) == NULL) {
free(ifgl, M_TEMP);
free(ifgm, M_TEMP);
IFNET_WUNLOCK();
return (ENOMEM);
}
strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
ifg->ifg_refcnt = 0;
TAILQ_INIT(&ifg->ifg_members);
EVENTHANDLER_INVOKE(group_attach_event, ifg);
TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
}
ifg->ifg_refcnt++;
ifgl->ifgl_group = ifg;
ifgm->ifgm_ifp = ifp;
IF_ADDR_LOCK(ifp);
TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
IFNET_WUNLOCK();
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
/*
* Remove a group from an interface
*/
int
if_delgroup(struct ifnet *ifp, const char *groupname)
{
+ INIT_VNET_NET(ifp->if_vnet);
struct ifg_list *ifgl;
struct ifg_member *ifgm;
IFNET_WLOCK();
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
break;
if (ifgl == NULL) {
IFNET_WUNLOCK();
return (ENOENT);
}
IF_ADDR_LOCK(ifp);
TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
IF_ADDR_UNLOCK(ifp);
TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
if (ifgm->ifgm_ifp == ifp)
break;
if (ifgm != NULL) {
TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
free(ifgm, M_TEMP);
}
if (--ifgl->ifgl_group->ifg_refcnt == 0) {
TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next);
EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
free(ifgl->ifgl_group, M_TEMP);
}
IFNET_WUNLOCK();
free(ifgl, M_TEMP);
EVENTHANDLER_INVOKE(group_change_event, groupname);
return (0);
}
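/*
 * Illustrative sketch (not part of this revision): cloned interfaces
 * typically pair these calls, joining a group named after the driver at
 * create time and leaving it on destroy.  Note that group names ending
 * in a digit are rejected with EINVAL so they cannot collide with unit
 * names.
 */
#ifdef EXAMPLE_ONLY
static void
example_groups(struct ifnet *ifp)
{
	if (if_addgroup(ifp, "example") != 0)
		if_printf(ifp, "unable to join group\n");
	/* ... */
	(void)if_delgroup(ifp, "example");
}
#endif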
/*
* Stores all groups from an interface in memory pointed
* to by data
*/
static int
if_getgroup(struct ifgroupreq *data, struct ifnet *ifp)
{
int len, error;
struct ifg_list *ifgl;
struct ifg_req ifgrq, *ifgp;
struct ifgroupreq *ifgr = data;
if (ifgr->ifgr_len == 0) {
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
ifgr->ifgr_len += sizeof(struct ifg_req);
IF_ADDR_UNLOCK(ifp);
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr->ifgr_groups;
/* XXX: wire */
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
if (len < sizeof(ifgrq)) {
IF_ADDR_UNLOCK(ifp);
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
sizeof(ifgrq.ifgrq_group));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IF_ADDR_UNLOCK(ifp);
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IF_ADDR_UNLOCK(ifp);
return (0);
}
/*
* Stores all members of a group in memory pointed to by data
*/
static int
if_getgroupmembers(struct ifgroupreq *data)
{
+ INIT_VNET_NET(curvnet);
struct ifgroupreq *ifgr = data;
struct ifg_group *ifg;
struct ifg_member *ifgm;
struct ifg_req ifgrq, *ifgp;
int len, error;
IFNET_RLOCK();
TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
break;
if (ifg == NULL) {
IFNET_RUNLOCK();
return (ENOENT);
}
if (ifgr->ifgr_len == 0) {
TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
ifgr->ifgr_len += sizeof(ifgrq);
IFNET_RUNLOCK();
return (0);
}
len = ifgr->ifgr_len;
ifgp = ifgr->ifgr_groups;
TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
if (len < sizeof(ifgrq)) {
IFNET_RUNLOCK();
return (EINVAL);
}
bzero(&ifgrq, sizeof ifgrq);
strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
sizeof(ifgrq.ifgrq_member));
if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
IFNET_RUNLOCK();
return (error);
}
len -= sizeof(ifgrq);
ifgp++;
}
IFNET_RUNLOCK();
return (0);
}
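/*
 * Illustrative sketch (not part of this revision): both group ioctls use
 * a two-pass protocol -- the caller passes ifgr_len == 0 to learn the
 * required buffer size, then calls again with a buffer.  Userland view,
 * with only minimal error handling:
 */
#ifdef EXAMPLE_ONLY_USERLAND
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <stdlib.h>
#include <string.h>

static int
get_groups(int s, const char *ifname, struct ifgroupreq *ifgr)
{
	memset(ifgr, 0, sizeof(*ifgr));
	strlcpy(ifgr->ifgr_name, ifname, sizeof(ifgr->ifgr_name));
	if (ioctl(s, SIOCGIFGROUP, ifgr) == -1)		/* size query */
		return (-1);
	if ((ifgr->ifgr_groups = malloc(ifgr->ifgr_len)) == NULL)
		return (-1);
	return (ioctl(s, SIOCGIFGROUP, ifgr));		/* fill buffer */
}
#endif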
/*
* Delete Routes for a Network Interface
*
* Called for each routing entry via the rnh->rnh_walktree() call above
* to delete all route entries referencing a detaching network interface.
*
* Arguments:
* rn pointer to node in the routing table
* arg argument passed to rnh->rnh_walktree() - detaching interface
*
* Returns:
* 0 successful
* errno failed - reason indicated
*
*/
static int
if_rtdel(struct radix_node *rn, void *arg)
{
struct rtentry *rt = (struct rtentry *)rn;
struct ifnet *ifp = arg;
int err;
if (rt->rt_ifp == ifp) {
/*
* Protect (sorta) against walktree recursion problems
* with cloned routes
*/
if ((rt->rt_flags & RTF_UP) == 0)
return (0);
err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway,
rt_mask(rt), rt->rt_flags,
(struct rtentry **) NULL, rt->rt_fibnum);
if (err) {
log(LOG_WARNING, "if_rtdel: error %d\n", err);
}
}
return (0);
}
/*
* XXX: Because sockaddr_dl has deeper structure than the sockaddr
* structs used to represent other address families, it is necessary
* to perform a different comparison.
*/
#define sa_equal(a1, a2) \
(bcmp((a1), (a2), ((a1))->sa_len) == 0)
#define sa_dl_equal(a1, a2) \
((((struct sockaddr_dl *)(a1))->sdl_len == \
((struct sockaddr_dl *)(a2))->sdl_len) && \
(bcmp(LLADDR((struct sockaddr_dl *)(a1)), \
LLADDR((struct sockaddr_dl *)(a2)), \
((struct sockaddr_dl *)(a1))->sdl_alen) == 0))
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithaddr(struct sockaddr *addr)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (sa_equal(addr, ifa->ifa_addr))
goto done;
/* IP6 doesn't have broadcast */
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr))
goto done;
}
ifa = NULL;
done:
IFNET_RUNLOCK();
return (ifa);
}
/*
* Locate an interface based on the broadcast address.
*/
/* ARGSUSED */
struct ifaddr *
ifa_ifwithbroadaddr(struct sockaddr *addr)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if ((ifp->if_flags & IFF_BROADCAST) &&
ifa->ifa_broadaddr &&
ifa->ifa_broadaddr->sa_len != 0 &&
sa_equal(ifa->ifa_broadaddr, addr))
goto done;
}
ifa = NULL;
done:
IFNET_RUNLOCK();
return (ifa);
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(struct sockaddr *addr)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr))
goto done;
}
}
ifa = NULL;
done:
IFNET_RUNLOCK();
return (ifa);
}
/*
* Find an interface on a specific network. If more than one matches,
* the most specific match is chosen.
*/
struct ifaddr *
ifa_ifwithnet(struct sockaddr *addr)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifaddr *ifa_maybe = (struct ifaddr *) 0;
u_int af = addr->sa_family;
char *addr_data = addr->sa_data, *cplim;
/*
* AF_LINK addresses can be looked up directly by their index number,
* so do that if we can.
*/
if (af == AF_LINK) {
struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
if (sdl->sdl_index && sdl->sdl_index <= V_if_index)
return (ifaddr_byindex(sdl->sdl_index));
}
/*
* Scan through each interface, looking for ones that have
* addresses in this address family.
*/
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af)
next: continue;
if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
/*
* This is a bit broken as it doesn't
* take into account that the remote end may
* be a single node in the network we are
* looking for.
* The trouble is that we don't know the
* netmask for the remote end.
*/
if (ifa->ifa_dstaddr != NULL &&
sa_equal(addr, ifa->ifa_dstaddr))
goto done;
} else {
/*
* if we have a special address handler,
* then use it instead of the generic one.
*/
if (ifa->ifa_claim_addr) {
if ((*ifa->ifa_claim_addr)(ifa, addr))
goto done;
continue;
}
/*
* Scan all the bits in the ifa's address.
* If a bit disagrees with what we are
* looking for, mask it with the netmask
* to see if it really matters.
* (A byte at a time)
*/
if (ifa->ifa_netmask == 0)
continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len
+ (char *)ifa->ifa_netmask;
while (cp3 < cplim)
if ((*cp++ ^ *cp2++) & *cp3++)
goto next; /* next address! */
/*
* If the netmask of what we just found
* is more specific than what we had before
* (if we had one) then remember the new one
* before continuing to search
* for an even better one.
*/
if (ifa_maybe == 0 ||
rn_refines((caddr_t)ifa->ifa_netmask,
(caddr_t)ifa_maybe->ifa_netmask))
ifa_maybe = ifa;
}
}
}
ifa = ifa_maybe;
done:
IFNET_RUNLOCK();
return (ifa);
}
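/*
 * Worked example (illustrative, not part of this revision): for a lookup
 * of 192.0.2.130 against an ifa of 192.0.2.1 netmask 255.255.255.128,
 * the byte carrying the host part gives (0x82 ^ 0x01) & 0x80 != 0, so
 * the candidate is rejected and the scan continues via "goto next".
 * For 192.0.2.65 every masked byte matches, and the /25 entry would then
 * displace a previously remembered /24 in ifa_maybe via rn_refines().
 */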
/*
* Find the address on a specific interface that best matches
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
char *cp, *cp2, *cp3;
char *cplim;
struct ifaddr *ifa_maybe = 0;
u_int af = addr->sa_family;
if (af >= AF_MAX)
return (0);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != af)
continue;
if (ifa_maybe == 0)
ifa_maybe = ifa;
if (ifa->ifa_netmask == 0) {
if (sa_equal(addr, ifa->ifa_addr) ||
(ifa->ifa_dstaddr &&
sa_equal(addr, ifa->ifa_dstaddr)))
goto done;
continue;
}
if (ifp->if_flags & IFF_POINTOPOINT) {
if (sa_equal(addr, ifa->ifa_dstaddr))
goto done;
} else {
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++)
if ((*cp++ ^ *cp2++) & *cp3)
break;
if (cp3 == cplim)
goto done;
}
}
ifa = ifa_maybe;
done:
return (ifa);
}
#include <net/route.h>
/*
* Default action when installing a route with a Link Level gateway.
* Lookup an appropriate real ifa to point to.
* This should be moved to /sys/net/link.c eventually.
*/
static void
link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
{
struct ifaddr *ifa, *oifa;
struct sockaddr *dst;
struct ifnet *ifp;
RT_LOCK_ASSERT(rt);
if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) ||
((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0))
return;
ifa = ifaof_ifpforaddr(dst, ifp);
if (ifa) {
IFAREF(ifa); /* XXX */
oifa = rt->rt_ifa;
rt->rt_ifa = ifa;
IFAFREE(oifa);
if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
ifa->ifa_rtrequest(cmd, rt, info);
}
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splnet or equivalent.
*/
static void
if_unroute(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
ifp->if_flags &= ~flag;
getmicrotime(&ifp->if_lastchange);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
if_qflush(&ifp->if_snd);
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
rt_ifmsg(ifp);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splnet or equivalent.
*/
static void
if_route(struct ifnet *ifp, int flag, int fam)
{
struct ifaddr *ifa;
KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
ifp->if_flags |= flag;
getmicrotime(&ifp->if_lastchange);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
pfctlinput(PRC_IFUP, ifa->ifa_addr);
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
rt_ifmsg(ifp);
#ifdef INET6
in6_if_up(ifp);
#endif
}
void (*vlan_link_state_p)(struct ifnet *, int); /* XXX: private from if_vlan */
void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */
/*
* Handle a change in the interface link state. To avoid LORs
* between driver lock and upper layer locks, as well as possible
* recursions, we post the event to a taskqueue, where all the work
* is done by the static do_link_state_change().
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
/* Return if state hasn't changed. */
if (ifp->if_link_state == link_state)
return;
ifp->if_link_state = link_state;
taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
}
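/*
 * Illustrative sketch (not part of this revision): a driver reports link
 * transitions from its MII status-change path (often with its own lock
 * held); the notification work then runs from the taskqueue in
 * do_link_state_change() below.  The foo_* names are hypothetical.
 */
#ifdef EXAMPLE_ONLY
static void
foo_miibus_statchg(struct foo_softc *sc)
{
	if (sc->foo_flags & FOO_LINK)
		if_link_state_change(sc->foo_ifp, LINK_STATE_UP);
	else
		if_link_state_change(sc->foo_ifp, LINK_STATE_DOWN);
}
#endif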
static void
do_link_state_change(void *arg, int pending)
{
struct ifnet *ifp = (struct ifnet *)arg;
int link_state = ifp->if_link_state;
int link;
+ CURVNET_SET(ifp->if_vnet);
/* Notify that the link state has changed. */
rt_ifmsg(ifp);
if (link_state == LINK_STATE_UP)
link = NOTE_LINKUP;
else if (link_state == LINK_STATE_DOWN)
link = NOTE_LINKDOWN;
else
link = NOTE_LINKINV;
KNOTE_UNLOCKED(&ifp->if_klist, link);
if (ifp->if_vlantrunk != NULL)
(*vlan_link_state_p)(ifp, link);
if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
IFP2AC(ifp)->ac_netgraph != NULL)
(*ng_ether_link_state_p)(ifp, link_state);
#ifdef DEV_CARP
if (ifp->if_carp)
carp_carpdev_state(ifp->if_carp);
#endif
if (ifp->if_bridge) {
KASSERT(bstp_linkstate_p != NULL,("if_bridge bstp not loaded!"));
(*bstp_linkstate_p)(ifp, link_state);
}
if (ifp->if_lagg) {
KASSERT(lagg_linkstate_p != NULL,("if_lagg not loaded!"));
(*lagg_linkstate_p)(ifp, link_state);
}
devctl_notify("IFNET", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
if (pending > 1)
if_printf(ifp, "%d link states coalesced\n", pending);
if (log_link_state_change)
log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname,
(link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
+ CURVNET_RESTORE();
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splnet or equivalent.
*/
void
if_down(struct ifnet *ifp)
{
if_unroute(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splnet or equivalent.
*/
void
if_up(struct ifnet *ifp)
{
if_route(ifp, IFF_UP, AF_UNSPEC);
}
/*
* Flush an interface queue.
*/
static void
if_qflush(struct ifaltq *ifq)
{
struct mbuf *m, *n;
IFQ_LOCK(ifq);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(ifq))
ALTQ_PURGE(ifq);
#endif
n = ifq->ifq_head;
while ((m = n) != 0) {
n = m->m_act;
m_freem(m);
}
ifq->ifq_head = 0;
ifq->ifq_tail = 0;
ifq->ifq_len = 0;
IFQ_UNLOCK(ifq);
}
/*
* Handle interface watchdog timer routines. Called
* from softclock, we decrement timers (if set) and
* call the appropriate interface routine on expiration.
*
* XXXRW: Note that because timeouts run with Giant, if_watchdog() is called
* holding Giant. If we switch to an MPSAFE callout, we likely need to grab
* Giant before entering if_watchdog() on an IFF_NEEDSGIANT interface.
*/
static void
if_slowtimo(void *arg)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
int s = splimp();
IFNET_RLOCK();
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
- if (ifp->if_timer == 0 || --ifp->if_timer)
- continue;
- if (ifp->if_watchdog)
- (*ifp->if_watchdog)(ifp);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_NET(vnet_iter);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ if (ifp->if_timer == 0 || --ifp->if_timer)
+ continue;
+ if (ifp->if_watchdog)
+ (*ifp->if_watchdog)(ifp);
+ }
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IFNET_RUNLOCK();
splx(s);
timeout(if_slowtimo, (void *)0, hz / IFNET_SLOWHZ);
}
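/*
 * Illustrative sketch (not part of this revision): the legacy pattern
 * served by if_slowtimo().  A driver arms if_timer when it queues a
 * transmission and clears it on completion; if the counter ever reaches
 * zero above, the driver's if_watchdog resets the hardware.  New drivers
 * should use a private callout instead (hence the WARNING in if_attach()).
 */
#ifdef EXAMPLE_ONLY
static void
foo_watchdog(struct ifnet *ifp)
{
	if_printf(ifp, "watchdog timeout -- resetting\n");
	ifp->if_oerrors++;
	foo_init(ifp->if_softc);	/* hypothetical reset routine */
}
#endif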
/*
* Map interface name to
* interface structure pointer.
*/
struct ifnet *
ifunit(const char *name)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
break;
}
IFNET_RUNLOCK();
return (ifp);
}
/*
* Hardware specific interface ioctls.
*/
static int
ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
{
struct ifreq *ifr;
struct ifstat *ifs;
int error = 0;
int new_flags, temp_flags;
size_t namelen, onamelen;
char new_name[IFNAMSIZ];
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifr = (struct ifreq *)data;
switch (cmd) {
case SIOCGIFINDEX:
ifr->ifr_index = ifp->if_index;
break;
case SIOCGIFFLAGS:
temp_flags = ifp->if_flags | ifp->if_drv_flags;
ifr->ifr_flags = temp_flags & 0xffff;
ifr->ifr_flagshigh = temp_flags >> 16;
break;
case SIOCGIFCAP:
ifr->ifr_reqcap = ifp->if_capabilities;
ifr->ifr_curcap = ifp->if_capenable;
break;
#ifdef MAC
case SIOCGIFMAC:
error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCGIFMETRIC:
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFPHYS:
ifr->ifr_phys = ifp->if_physical;
break;
case SIOCSIFFLAGS:
error = priv_check(td, PRIV_NET_SETIFFLAGS);
if (error)
return (error);
/*
* Currently, no driver owned flags pass the IFF_CANTCHANGE
* check, so we don't need special handling here yet.
*/
new_flags = (ifr->ifr_flags & 0xffff) |
(ifr->ifr_flagshigh << 16);
if (ifp->if_flags & IFF_SMART) {
/* Smart drivers twiddle their own routes */
} else if (ifp->if_flags & IFF_UP &&
(new_flags & IFF_UP) == 0) {
int s = splimp();
if_down(ifp);
splx(s);
} else if (new_flags & IFF_UP &&
(ifp->if_flags & IFF_UP) == 0) {
int s = splimp();
if_up(ifp);
splx(s);
}
/* See if permanently promiscuous mode bit is about to flip */
if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
if (new_flags & IFF_PPROMISC)
ifp->if_flags |= IFF_PROMISC;
else if (ifp->if_pcount == 0)
ifp->if_flags &= ~IFF_PROMISC;
log(LOG_INFO, "%s: permanently promiscuous mode %s\n",
ifp->if_xname,
(new_flags & IFF_PPROMISC) ? "enabled" : "disabled");
}
ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
(new_flags &~ IFF_CANTCHANGE);
if (ifp->if_ioctl) {
IFF_LOCKGIANT(ifp);
(void) (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
}
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFCAP:
error = priv_check(td, PRIV_NET_SETIFCAP);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
if (ifr->ifr_reqcap & ~ifp->if_capabilities)
return (EINVAL);
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
#ifdef MAC
case SIOCSIFMAC:
error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
break;
#endif
case SIOCSIFNAME:
error = priv_check(td, PRIV_NET_SETIFNAME);
if (error)
return (error);
error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
if (error != 0)
return (error);
if (new_name[0] == '\0')
return (EINVAL);
if (ifunit(new_name) != NULL)
return (EEXIST);
/* Announce the departure of the interface. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
log(LOG_INFO, "%s: changing name to '%s'\n",
ifp->if_xname, new_name);
strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
ifa = ifp->if_addr;
IFA_LOCK(ifa);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
namelen = strlen(new_name);
onamelen = sdl->sdl_nlen;
/*
* Move the address if needed. This is safe because we
* allocate space for a name of length IFNAMSIZ when we
* create this in if_attach().
*/
if (namelen != onamelen) {
bcopy(sdl->sdl_data + onamelen,
sdl->sdl_data + namelen, sdl->sdl_alen);
}
bcopy(new_name, sdl->sdl_data, namelen);
sdl->sdl_nlen = namelen;
sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
bzero(sdl->sdl_data, onamelen);
while (namelen != 0)
sdl->sdl_data[--namelen] = 0xff;
IFA_UNLOCK(ifa);
EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
/* Announce the return of the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
break;
case SIOCSIFMETRIC:
error = priv_check(td, PRIV_NET_SETIFMETRIC);
if (error)
return (error);
ifp->if_metric = ifr->ifr_metric;
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYS:
error = priv_check(td, PRIV_NET_SETIFPHYS);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
{
u_long oldmtu = ifp->if_mtu;
error = priv_check(td, PRIV_NET_SETIFMTU);
if (error)
return (error);
if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
return (EINVAL);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
if (error == 0) {
getmicrotime(&ifp->if_lastchange);
rt_ifmsg(ifp);
}
/*
* If the link MTU changed, do network layer specific procedure.
*/
if (ifp->if_mtu != oldmtu) {
#ifdef INET6
nd6_setmtu(ifp);
#endif
}
break;
}
case SIOCADDMULTI:
case SIOCDELMULTI:
if (cmd == SIOCADDMULTI)
error = priv_check(td, PRIV_NET_ADDMULTI);
else
error = priv_check(td, PRIV_NET_DELMULTI);
if (error)
return (error);
/* Don't allow group membership on non-multicast interfaces. */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EOPNOTSUPP);
/* Don't let users screw up protocols' entries. */
if (ifr->ifr_addr.sa_family != AF_LINK)
return (EINVAL);
if (cmd == SIOCADDMULTI) {
struct ifmultiaddr *ifma;
/*
* Userland is only permitted to join groups once
* via the if_addmulti() KPI, because it cannot hold
* struct ifmultiaddr * between calls. It may also
* lose a race while we check if the membership
* already exists.
*/
IF_ADDR_LOCK(ifp);
ifma = if_findmulti(ifp, &ifr->ifr_addr);
IF_ADDR_UNLOCK(ifp);
if (ifma != NULL)
error = EADDRINUSE;
else
error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
} else {
error = if_delmulti(ifp, &ifr->ifr_addr);
}
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSLIFPHYADDR:
case SIOCSIFMEDIA:
case SIOCSIFGENERIC:
error = priv_check(td, PRIV_NET_HWIOCTL);
if (error)
return (error);
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
if (error == 0)
getmicrotime(&ifp->if_lastchange);
break;
case SIOCGIFSTATUS:
ifs = (struct ifstat *)data;
ifs->ascii[0] = '\0';
case SIOCGIFPSRCADDR:
case SIOCGIFPDSTADDR:
case SIOCGLIFPHYADDR:
case SIOCGIFMEDIA:
case SIOCGIFGENERIC:
if (ifp->if_ioctl == NULL)
return (EOPNOTSUPP);
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
break;
case SIOCSIFLLADDR:
error = priv_check(td, PRIV_NET_SETLLADDR);
if (error)
return (error);
error = if_setlladdr(ifp,
ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
break;
case SIOCAIFGROUP:
{
struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
error = priv_check(td, PRIV_NET_ADDIFGROUP);
if (error)
return (error);
if ((error = if_addgroup(ifp, ifgr->ifgr_group)))
return (error);
break;
}
case SIOCGIFGROUP:
if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp)))
return (error);
break;
case SIOCDIFGROUP:
{
struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr;
error = priv_check(td, PRIV_NET_DELIFGROUP);
if (error)
return (error);
if ((error = if_delgroup(ifp, ifgr->ifgr_group)))
return (error);
break;
}
default:
error = ENOIOCTL;
break;
}
return (error);
}
/*
* Interface ioctls.
*/
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
{
struct ifnet *ifp;
struct ifreq *ifr;
int error;
int oif_flags;
switch (cmd) {
case SIOCGIFCONF:
case OSIOCGIFCONF:
#ifdef __amd64__
case SIOCGIFCONF32:
#endif
return (ifconf(cmd, data));
}
ifr = (struct ifreq *)data;
switch (cmd) {
case SIOCIFCREATE:
case SIOCIFCREATE2:
error = priv_check(td, PRIV_NET_IFCREATE);
if (error)
return (error);
return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
case SIOCIFDESTROY:
error = priv_check(td, PRIV_NET_IFDESTROY);
if (error)
return (error);
return if_clone_destroy(ifr->ifr_name);
case SIOCIFGCLONERS:
return (if_clone_list((struct if_clonereq *)data));
case SIOCGIFGMEMB:
return (if_getgroupmembers((struct ifgroupreq *)data));
}
ifp = ifunit(ifr->ifr_name);
if (ifp == 0)
return (ENXIO);
error = ifhwioctl(cmd, ifp, data, td);
if (error != ENOIOCTL)
return (error);
oif_flags = ifp->if_flags;
if (so->so_proto == 0)
return (EOPNOTSUPP);
#ifndef COMPAT_43
error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd,
data,
ifp, td));
#else
{
int ocmd = cmd;
switch (cmd) {
case SIOCSIFDSTADDR:
case SIOCSIFADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
#if BYTE_ORDER != BIG_ENDIAN
if (ifr->ifr_addr.sa_family == 0 &&
ifr->ifr_addr.sa_len < 16) {
ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
ifr->ifr_addr.sa_len = 16;
}
#else
if (ifr->ifr_addr.sa_len == 0)
ifr->ifr_addr.sa_len = 16;
#endif
break;
case OSIOCGIFADDR:
cmd = SIOCGIFADDR;
break;
case OSIOCGIFDSTADDR:
cmd = SIOCGIFDSTADDR;
break;
case OSIOCGIFBRDADDR:
cmd = SIOCGIFBRDADDR;
break;
case OSIOCGIFNETMASK:
cmd = SIOCGIFNETMASK;
}
error = ((*so->so_proto->pr_usrreqs->pru_control)(so,
cmd,
data,
ifp, td));
switch (ocmd) {
case OSIOCGIFADDR:
case OSIOCGIFDSTADDR:
case OSIOCGIFBRDADDR:
case OSIOCGIFNETMASK:
*(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
}
}
#endif /* COMPAT_43 */
if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
DELAY(100);/* XXX: temporary workaround for fxp issue*/
if (ifp->if_flags & IFF_UP) {
int s = splimp();
in6_if_up(ifp);
splx(s);
}
#endif
}
return (error);
}
/*
* The code common to handling reference counted flags,
* e.g., in ifpromisc() and if_allmulti().
* The "pflag" argument can specify a permanent mode flag to check,
* such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
*
* Only to be used on stack-owned flags, not driver-owned flags.
*/
static int
if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
{
struct ifreq ifr;
int error;
int oldflags, oldcount;
/* Sanity checks to catch programming errors */
KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
("%s: setting driver-owned flag %d", __func__, flag));
if (onswitch)
KASSERT(*refcount >= 0,
("%s: increment negative refcount %d for flag %d",
__func__, *refcount, flag));
else
KASSERT(*refcount > 0,
("%s: decrement non-positive refcount %d for flag %d",
__func__, *refcount, flag));
/* In case this mode is permanent, just touch refcount */
if (ifp->if_flags & pflag) {
*refcount += onswitch ? 1 : -1;
return (0);
}
/* Save ifnet parameters in case if_ioctl() fails. */
oldcount = *refcount;
oldflags = ifp->if_flags;
/*
* See if we are not the only one, in which case touching the refcount
* is enough; only toggle the interface flag if we are the first or
* the last.
*/
if (onswitch) {
if ((*refcount)++)
return (0);
ifp->if_flags |= flag;
} else {
if (--(*refcount))
return (0);
ifp->if_flags &= ~flag;
}
/* Call down the driver since we've changed interface flags */
if (ifp->if_ioctl == NULL) {
error = EOPNOTSUPP;
goto recover;
}
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
IFF_UNLOCKGIANT(ifp);
if (error)
goto recover;
/* Notify userland that interface flags have changed */
rt_ifmsg(ifp);
return (0);
recover:
/* Recover after driver error */
*refcount = oldcount;
ifp->if_flags = oldflags;
return (error);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int error;
int oldflags = ifp->if_flags;
error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
&ifp->if_pcount, pswitch);
/* If promiscuous mode status has changed, log a message */
if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC))
log(LOG_INFO, "%s: promiscuous mode %s\n",
ifp->if_xname,
(ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
return (error);
}
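/*
 * Illustrative sketch (not part of this revision): consumers such as
 * bpf(4) must pair each "on" with exactly one "off"; only the first
 * enable and the last disable reach the driver via SIOCSIFFLAGS.
 */
#ifdef EXAMPLE_ONLY
static int
example_capture(struct ifnet *ifp)
{
	int error;

	error = ifpromisc(ifp, 1);	/* if_pcount 0 -> 1: driver ioctl */
	if (error != 0)
		return (error);
	/* ... receive traffic ... */
	return (ifpromisc(ifp, 0));	/* if_pcount 1 -> 0: driver ioctl */
}
#endif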
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, caddr_t data)
{
+ INIT_VNET_NET(curvnet);
struct ifconf *ifc = (struct ifconf *)data;
#ifdef __amd64__
struct ifconf32 *ifc32 = (struct ifconf32 *)data;
struct ifconf ifc_swab;
#endif
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr;
struct sbuf *sb;
int error, full = 0, valid_len, max_len;
#ifdef __amd64__
if (cmd == SIOCGIFCONF32) {
ifc_swab.ifc_len = ifc32->ifc_len;
ifc_swab.ifc_buf = (caddr_t)(uintptr_t)ifc32->ifc_buf;
ifc = &ifc_swab;
}
#endif
/* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */
max_len = MAXPHYS - 1;
/* Prevent hostile input from being able to crash the system */
if (ifc->ifc_len <= 0)
return (EINVAL);
again:
if (ifc->ifc_len <= max_len) {
max_len = ifc->ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
IFNET_RLOCK(); /* could sleep XXX */
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs;
/*
* Zero the ifr_name buffer to make sure we don't
* disclose the contents of the stack.
*/
memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name));
if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
>= sizeof(ifr.ifr_name)) {
sbuf_delete(sb);
IFNET_RUNLOCK();
return (ENAMETOOLONG);
}
addrs = 0;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (jailed(curthread->td_ucred) &&
prison_if(curthread->td_ucred, sa))
continue;
addrs++;
#ifdef COMPAT_43
if (cmd == OSIOCGIFCONF) {
struct osockaddr *osa =
(struct osockaddr *)&ifr.ifr_addr;
ifr.ifr_addr = *sa;
osa->sa_family = sa->sa_family;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else
#endif
if (sa->sa_len <= sizeof(*sa)) {
ifr.ifr_addr = *sa;
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
} else {
sbuf_bcat(sb, &ifr,
offsetof(struct ifreq, ifr_addr));
max_len += offsetof(struct ifreq, ifr_addr);
sbuf_bcat(sb, sa, sa->sa_len);
max_len += sa->sa_len;
}
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (!sbuf_overflowed(sb))
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
/*
* If we didn't allocate enough space (uncommon), try again. If
* we have already allocated as much space as we are allowed,
* return what we've got.
*/
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc->ifc_len = valid_len;
#ifdef __amd64__
if (cmd == SIOCGIFCONF32)
ifc32->ifc_len = valid_len;
#endif
sbuf_finish(sb);
error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
sbuf_delete(sb);
return (error);
}
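/*
 * Illustrative sketch (not part of this revision): the userland side of
 * ifconf().  The caller supplies a buffer; on return ifc_len holds the
 * number of valid bytes, truncated to whole records if the buffer was
 * too small.
 */
#ifdef EXAMPLE_ONLY_USERLAND
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <unistd.h>

static int
example_ifconf(char *buf, int len)
{
	struct ifconf ifc;
	int error, s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
		return (-1);
	ifc.ifc_len = len;
	ifc.ifc_buf = buf;
	error = ioctl(s, SIOCGIFCONF, &ifc);
	close(s);
	return (error);		/* on success ifc.ifc_len is valid */
}
#endif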
/*
* Just like ifpromisc(), but for all-multicast-reception mode.
*/
int
if_allmulti(struct ifnet *ifp, int onswitch)
{
return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
}
struct ifmultiaddr *
if_findmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
IF_ADDR_LOCK_ASSERT(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (sa->sa_family == AF_LINK) {
if (sa_dl_equal(ifma->ifma_addr, sa))
break;
} else {
if (sa_equal(ifma->ifma_addr, sa))
break;
}
}
return ifma;
}
/*
* Allocate a new ifmultiaddr and initialize based on passed arguments. We
* make copies of passed sockaddrs. The ifmultiaddr will not be added to
* the ifnet multicast address list here, so the caller must do that and
* other setup work (such as notifying the device driver). The reference
* count is initialized to 1.
*/
static struct ifmultiaddr *
if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
int mflags)
{
struct ifmultiaddr *ifma;
struct sockaddr *dupsa;
MALLOC(ifma, struct ifmultiaddr *, sizeof *ifma, M_IFMADDR, mflags |
M_ZERO);
if (ifma == NULL)
return (NULL);
MALLOC(dupsa, struct sockaddr *, sa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
FREE(ifma, M_IFMADDR);
return (NULL);
}
bcopy(sa, dupsa, sa->sa_len);
ifma->ifma_addr = dupsa;
ifma->ifma_ifp = ifp;
ifma->ifma_refcount = 1;
ifma->ifma_protospec = NULL;
if (llsa == NULL) {
ifma->ifma_lladdr = NULL;
return (ifma);
}
MALLOC(dupsa, struct sockaddr *, llsa->sa_len, M_IFMADDR, mflags);
if (dupsa == NULL) {
FREE(ifma->ifma_addr, M_IFMADDR);
FREE(ifma, M_IFMADDR);
return (NULL);
}
bcopy(llsa, dupsa, llsa->sa_len);
ifma->ifma_lladdr = dupsa;
return (ifma);
}
/*
* if_freemulti: free ifmultiaddr structure and possibly attached related
* addresses. The caller is responsible for implementing reference
* counting, notifying the driver, handling routing messages, and releasing
* any dependent link layer state.
*/
static void
if_freemulti(struct ifmultiaddr *ifma)
{
KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
ifma->ifma_refcount));
KASSERT(ifma->ifma_protospec == NULL,
("if_freemulti: protospec not NULL"));
if (ifma->ifma_lladdr != NULL)
FREE(ifma->ifma_lladdr, M_IFMADDR);
FREE(ifma->ifma_addr, M_IFMADDR);
FREE(ifma, M_IFMADDR);
}
/*
* Register an additional multicast address with a network interface.
*
* - If the address is already present, bump the reference count on the
* address and return.
* - If the address is not link-layer, look up a link layer address.
* - Allocate address structures for one or both addresses, and attach to the
* multicast address list on the interface. If automatically adding a link
* layer address, the protocol address will own a reference to the link
* layer address, to be freed when it is freed.
* - Notify the network device driver of an addition to the multicast address
* list.
*
* 'sa' points to caller-owned memory with the desired multicast address.
*
* 'retifma' will be used to return a pointer to the resulting multicast
* address reference, if desired.
*/
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
struct ifmultiaddr **retifma)
{
struct ifmultiaddr *ifma, *ll_ifma;
struct sockaddr *llsa;
int error;
/*
* If the address is already present, return a new reference to it;
* otherwise, allocate storage and set up a new address.
*/
IF_ADDR_LOCK(ifp);
ifma = if_findmulti(ifp, sa);
if (ifma != NULL) {
ifma->ifma_refcount++;
if (retifma != NULL)
*retifma = ifma;
IF_ADDR_UNLOCK(ifp);
return (0);
}
/*
* The address isn't already present; resolve the protocol address
* into a link layer address, then look that up and either bump its
* refcount or allocate an ifma for it as well. If 'llsa' was
* returned, we will need to free it later.
*/
llsa = NULL;
ll_ifma = NULL;
if (ifp->if_resolvemulti != NULL) {
error = ifp->if_resolvemulti(ifp, &llsa, sa);
if (error)
goto unlock_out;
}
/*
* Allocate the new address. Don't hook it up yet, as we may also
* need to allocate a link layer multicast address.
*/
ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
if (ifma == NULL) {
error = ENOMEM;
goto free_llsa_out;
}
/*
* If a link layer address is found, we'll need to see if it's
* already present in the address list, or allocate it as well.
* When this block finishes, the link layer address will be on the
* list.
*/
if (llsa != NULL) {
ll_ifma = if_findmulti(ifp, llsa);
if (ll_ifma == NULL) {
ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
if (ll_ifma == NULL) {
--ifma->ifma_refcount;
if_freemulti(ifma);
error = ENOMEM;
goto free_llsa_out;
}
TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
} else
ll_ifma->ifma_refcount++;
ifma->ifma_llifma = ll_ifma;
}
/*
* We now have a new multicast address, ifma, and possibly a new or
* referenced link layer address. Add the primary address to the
* ifnet address list.
*/
TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
if (retifma != NULL)
*retifma = ifma;
/*
* Must generate the message while holding the lock so that 'ifma'
* pointer is still valid.
*/
rt_newmaddrmsg(RTM_NEWMADDR, ifma);
IF_ADDR_UNLOCK(ifp);
/*
* We are certain we have added something, so call down to the
* interface to let them know about it.
*/
if (ifp->if_ioctl != NULL) {
IFF_LOCKGIANT(ifp);
(void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
IFF_UNLOCKGIANT(ifp);
}
if (llsa != NULL)
FREE(llsa, M_IFMADDR);
return (0);
free_llsa_out:
if (llsa != NULL)
FREE(llsa, M_IFMADDR);
unlock_out:
IF_ADDR_UNLOCK(ifp);
return (error);
}
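/*
 * Illustrative sketch (not part of this revision): a network-layer
 * caller joining a group; in_addmulti() does essentially this.  The
 * INET sockaddr setup shown is an assumption of the example.
 */
#ifdef EXAMPLE_ONLY
static int
example_join(struct ifnet *ifp, struct in_addr group,
    struct ifmultiaddr **ifmap)
{
	struct sockaddr_in sin;

	bzero(&sin, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr = group;
	/* On success *ifmap references the new or existing membership. */
	return (if_addmulti(ifp, (struct sockaddr *)&sin, ifmap));
}
#endif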
/*
* Delete a multicast group membership by network-layer group address.
*
* Returns ENOENT if the entry could not be found. If ifp no longer
* exists, results are undefined. This entry point should only be used
* from subsystems which do appropriate locking to hold ifp for the
* duration of the call.
* Network-layer protocol domains must use if_delmulti_ifma().
*/
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
struct ifmultiaddr *ifma;
int lastref;
#ifdef INVARIANTS
struct ifnet *oifp;
+ INIT_VNET_NET(ifp->if_vnet);
IFNET_RLOCK();
TAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
if (ifp != oifp)
ifp = NULL;
IFNET_RUNLOCK();
KASSERT(ifp != NULL, ("%s: ifnet went away", __func__));
#endif
if (ifp == NULL)
return (ENOENT);
IF_ADDR_LOCK(ifp);
lastref = 0;
ifma = if_findmulti(ifp, sa);
if (ifma != NULL)
lastref = if_delmulti_locked(ifp, ifma, 0);
IF_ADDR_UNLOCK(ifp);
if (ifma == NULL)
return (ENOENT);
if (lastref && ifp->if_ioctl != NULL) {
IFF_LOCKGIANT(ifp);
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
IFF_UNLOCKGIANT(ifp);
}
return (0);
}
/*
* Delete a multicast group membership by group membership pointer.
* Network-layer protocol domains must use this routine.
*
* It is safe to call this routine if the ifp disappeared. Callers should
* hold IFF_LOCKGIANT() to avoid a LOR in case the hardware needs to be
* reconfigured.
*/
void
if_delmulti_ifma(struct ifmultiaddr *ifma)
{
+#ifdef DIAGNOSTIC
+ INIT_VNET_NET(curvnet);
+#endif
struct ifnet *ifp;
int lastref;
ifp = ifma->ifma_ifp;
#ifdef DIAGNOSTIC
if (ifp == NULL) {
printf("%s: ifma_ifp seems to be detached\n", __func__);
} else {
struct ifnet *oifp;
IFNET_RLOCK();
TAILQ_FOREACH(oifp, &V_ifnet, if_link)
if (ifp == oifp)
break;
if (ifp != oifp) {
printf("%s: ifnet %p disappeared\n", __func__, ifp);
ifp = NULL;
}
IFNET_RUNLOCK();
}
#endif
/*
* If and only if the ifnet instance exists: Acquire the address lock.
*/
if (ifp != NULL)
IF_ADDR_LOCK(ifp);
lastref = if_delmulti_locked(ifp, ifma, 0);
if (ifp != NULL) {
/*
* If and only if the ifnet instance exists:
* Release the address lock.
* If the group was left: update the hardware hash filter.
*/
IF_ADDR_UNLOCK(ifp);
if (lastref && ifp->if_ioctl != NULL) {
IFF_LOCKGIANT(ifp);
(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
IFF_UNLOCKGIANT(ifp);
}
}
}
/*
* Perform deletion of network-layer and/or link-layer multicast address.
*
* Return 0 if the reference count was decremented.
* Return 1 if the final reference was released, indicating that the
* hardware hash filter should be reprogrammed.
*/
static int
if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
{
struct ifmultiaddr *ll_ifma;
if (ifp != NULL && ifma->ifma_ifp != NULL) {
KASSERT(ifma->ifma_ifp == ifp,
("%s: inconsistent ifp %p", __func__, ifp));
IF_ADDR_LOCK_ASSERT(ifp);
}
ifp = ifma->ifma_ifp;
/*
* If the ifnet is detaching, null out references to ifnet,
* so that upper protocol layers will notice, and not attempt
* to obtain locks for an ifnet which no longer exists. The
* routing socket announcement must happen before the ifnet
* instance is detached from the system.
*/
if (detaching) {
#ifdef DIAGNOSTIC
printf("%s: detaching ifnet instance %p\n", __func__, ifp);
#endif
/*
* ifp may already be nulled out if we are being reentered
* to delete the ll_ifma.
*/
if (ifp != NULL) {
rt_newmaddrmsg(RTM_DELMADDR, ifma);
ifma->ifma_ifp = NULL;
}
}
if (--ifma->ifma_refcount > 0)
return 0;
/*
* If this ifma is a network-layer ifma, a link-layer ifma may
* have been associated with it. Release it first if so.
*/
ll_ifma = ifma->ifma_llifma;
if (ll_ifma != NULL) {
KASSERT(ifma->ifma_lladdr != NULL,
("%s: llifma w/o lladdr", __func__));
if (detaching)
ll_ifma->ifma_ifp = NULL; /* XXX */
if (--ll_ifma->ifma_refcount == 0) {
if (ifp != NULL) {
TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma,
ifma_link);
}
if_freemulti(ll_ifma);
}
}
if (ifp != NULL)
TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
if_freemulti(ifma);
/*
* The last reference to this instance of struct ifmultiaddr
* was released; the hardware should be notified of this change.
*/
return 1;
}
/*
* Set the link layer address on an interface.
*
* At this time we only support certain types of interfaces,
* and we don't allow the length of the address to change.
*/
int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
struct sockaddr_dl *sdl;
struct ifaddr *ifa;
struct ifreq ifr;
ifa = ifp->if_addr;
if (ifa == NULL)
return (EINVAL);
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if (sdl == NULL)
return (EINVAL);
if (len != sdl->sdl_alen) /* don't allow length to change */
return (EINVAL);
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_XETHER:
case IFT_ISO88025:
case IFT_L2VLAN:
case IFT_BRIDGE:
case IFT_ARCNET:
case IFT_IEEE8023ADLAG:
bcopy(lladdr, LLADDR(sdl), len);
break;
default:
return (ENODEV);
}
/*
* If the interface is already up, we need
* to re-init it in order to reprogram its
* address filter.
*/
if ((ifp->if_flags & IFF_UP) != 0) {
if (ifp->if_ioctl) {
IFF_LOCKGIANT(ifp);
ifp->if_flags &= ~IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
ifp->if_flags |= IFF_UP;
ifr.ifr_flags = ifp->if_flags & 0xffff;
ifr.ifr_flagshigh = ifp->if_flags >> 16;
(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
IFF_UNLOCKGIANT(ifp);
}
#ifdef INET
/*
* Also send gratuitous ARPs to notify other nodes about
* the address change.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == AF_INET)
arp_ifinit(ifp, ifa);
}
#endif
}
return (0);
}
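/*
 * Illustrative sketch (not part of this revision): consumers such as
 * lagg(4) use if_setlladdr() to force a member's MAC; the length must
 * match the existing sdl_alen, e.g. ETHER_ADDR_LEN for Ethernet.
 */
#ifdef EXAMPLE_ONLY
static int
example_setmac(struct ifnet *ifp, const u_char *mac)
{
	return (if_setlladdr(ifp, mac, ETHER_ADDR_LEN));
}
#endif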
/*
* The name argument must be a pointer to storage which will last as
* long as the interface does. For physical devices, the result of
* device_get_name(dev) is a good choice and for pseudo-devices a
* static string works well.
*/
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
ifp->if_dname = name;
ifp->if_dunit = unit;
if (unit != IF_DUNIT_NONE)
snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
else
strlcpy(ifp->if_xname, name, IFNAMSIZ);
}
int
if_printf(struct ifnet *ifp, const char * fmt, ...)
{
va_list ap;
int retval;
retval = printf("%s: ", ifp->if_xname);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
/*
* When an interface is marked IFF_NEEDSGIANT, its if_start() routine cannot
* be called without Giant. However, we often can't acquire the Giant lock
* at those points; instead, we run it via a task queue that holds Giant via
* if_start_deferred.
*
* XXXRW: We need to make sure that the ifnet isn't fully detached while any
* if_start_deferred() tasks are still outstanding, or they will run after
* the free. This probably means waiting in if_detach().
*/
void
if_start(struct ifnet *ifp)
{
if (ifp->if_flags & IFF_NEEDSGIANT) {
if (mtx_owned(&Giant))
(*(ifp)->if_start)(ifp);
else
taskqueue_enqueue(taskqueue_swi_giant,
&ifp->if_starttask);
} else
(*(ifp)->if_start)(ifp);
}
static void
if_start_deferred(void *context, int pending)
{
struct ifnet *ifp;
GIANT_REQUIRED;
ifp = context;
(ifp->if_start)(ifp);
}
int
if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
{
int active = 0;
IF_LOCK(ifq);
if (_IF_QFULL(ifq)) {
_IF_DROP(ifq);
IF_UNLOCK(ifq);
m_freem(m);
return (0);
}
if (ifp != NULL) {
ifp->if_obytes += m->m_pkthdr.len + adjust;
if (m->m_flags & (M_BCAST|M_MCAST))
ifp->if_omcasts++;
active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
}
_IF_ENQUEUE(ifq, m);
IF_UNLOCK(ifq);
if (ifp != NULL && !active)
if_start(ifp);
return (1);
}
void
if_register_com_alloc(u_char type,
if_com_alloc_t *a, if_com_free_t *f)
{
KASSERT(if_com_alloc[type] == NULL,
("if_register_com_alloc: %d already registered", type));
KASSERT(if_com_free[type] == NULL,
("if_register_com_alloc: %d free already registered", type));
if_com_alloc[type] = a;
if_com_free[type] = f;
}
void
if_deregister_com_alloc(u_char type)
{
KASSERT(if_com_alloc[type] != NULL,
("if_deregister_com_alloc: %d not registered", type));
KASSERT(if_com_free[type] != NULL,
("if_deregister_com_alloc: %d free not registered", type));
if_com_alloc[type] = NULL;
if_com_free[type] = NULL;
}
Index: head/sys/net/if_bridge.c
===================================================================
--- head/sys/net/if_bridge.c (revision 183549)
+++ head/sys/net/if_bridge.c (revision 183550)
@@ -1,3420 +1,3425 @@
/* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */
/*
* Copyright 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Jason R. Thorpe for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp
*/
/*
* Network interface bridge support.
*
* TODO:
*
* - Currently only supports Ethernet-like interfaces (Ethernet,
* 802.11, VLANs on Ethernet, etc.) Figure out a nice way
* to bridge other types of interfaces (FDDI-FDDI, and maybe
* consider heterogeneous bridges).
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_carp.h"
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/protosw.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/socket.h> /* for net/if.h */
#include <sys/sockio.h>
#include <sys/ctype.h> /* string functions */
#include <sys/kernel.h>
#include <sys/random.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <vm/uma.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/vimage.h>
#include <net/bpf.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/pfil.h>
#include <netinet/in.h> /* for struct arpcom */
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#include <machine/in_cksum.h>
#include <netinet/if_ether.h> /* for struct arpcom */
#include <net/bridgestp.h>
#include <net/if_bridgevar.h>
#include <net/if_llc.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
/*
* Size of the route hash table. Must be a power of two.
*/
#ifndef BRIDGE_RTHASH_SIZE
#define BRIDGE_RTHASH_SIZE 1024
#endif
#define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1)
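/*
 * Illustrative note (not part of this revision): a power-of-two table
 * size lets the hash be reduced with a mask instead of a division, i.e.
 *
 *	idx = hash & BRIDGE_RTHASH_MASK;   /- same as hash % BRIDGE_RTHASH_SIZE -/
 */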
/*
* Maximum number of addresses to cache.
*/
#ifndef BRIDGE_RTABLE_MAX
#define BRIDGE_RTABLE_MAX 100
#endif
/*
* Timeout (in seconds) for entries learned dynamically.
*/
#ifndef BRIDGE_RTABLE_TIMEOUT
#define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */
#endif
/*
* Number of seconds between walks of the route list.
*/
#ifndef BRIDGE_RTABLE_PRUNE_PERIOD
#define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60)
#endif
/*
* List of capabilities to possibly mask on the member interface.
*/
#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM)
/*
* Bridge interface list entry.
*/
struct bridge_iflist {
LIST_ENTRY(bridge_iflist) bif_next;
struct ifnet *bif_ifp; /* member if */
struct bstp_port bif_stp; /* STP state */
uint32_t bif_flags; /* member if flags */
int bif_savedcaps; /* saved capabilities */
uint32_t bif_addrmax; /* max # of addresses */
uint32_t bif_addrcnt; /* cur. # of addresses */
uint32_t bif_addrexceeded;/* # of address violations */
};
/*
* Bridge route node.
*/
struct bridge_rtnode {
LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */
LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */
struct bridge_iflist *brt_dst; /* destination if */
unsigned long brt_expire; /* expiration time */
uint8_t brt_flags; /* address flags */
uint8_t brt_addr[ETHER_ADDR_LEN];
uint16_t brt_vlan; /* vlan id */
};
#define brt_ifp brt_dst->bif_ifp
/*
* Software state for each bridge.
*/
struct bridge_softc {
struct ifnet *sc_ifp; /* make this an interface */
LIST_ENTRY(bridge_softc) sc_list;
struct mtx sc_mtx;
struct cv sc_cv;
uint32_t sc_brtmax; /* max # of addresses */
uint32_t sc_brtcnt; /* cur. # of addresses */
uint32_t sc_brttimeout; /* rt timeout in seconds */
struct callout sc_brcallout; /* bridge callout */
uint32_t sc_iflist_ref; /* refcount for sc_iflist */
uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */
LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */
LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */
LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */
uint32_t sc_rthash_key; /* key for hash */
LIST_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */
struct bstp_state sc_stp; /* STP state */
uint32_t sc_brtexceeded; /* # of cache drops */
u_char sc_defaddr[6]; /* Default MAC address */
};
static struct mtx bridge_list_mtx;
eventhandler_tag bridge_detach_cookie = NULL;
int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD;
uma_zone_t bridge_rtnode_zone;
static int bridge_clone_create(struct if_clone *, int, caddr_t);
static void bridge_clone_destroy(struct ifnet *);
static int bridge_ioctl(struct ifnet *, u_long, caddr_t);
static void bridge_mutecaps(struct bridge_softc *);
static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *,
int);
static void bridge_ifdetach(void *arg __unused, struct ifnet *);
static void bridge_init(void *);
static void bridge_dummynet(struct mbuf *, struct ifnet *);
static void bridge_stop(struct ifnet *, int);
static void bridge_start(struct ifnet *);
static struct mbuf *bridge_input(struct ifnet *, struct mbuf *);
static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
static void bridge_enqueue(struct bridge_softc *, struct ifnet *,
struct mbuf *);
static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int);
static void bridge_forward(struct bridge_softc *, struct bridge_iflist *,
struct mbuf *m);
static void bridge_timer(void *);
static void bridge_broadcast(struct bridge_softc *, struct ifnet *,
struct mbuf *, int);
static void bridge_span(struct bridge_softc *, struct mbuf *);
static int bridge_rtupdate(struct bridge_softc *, const uint8_t *,
uint16_t, struct bridge_iflist *, int, uint8_t);
static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *,
uint16_t);
static void bridge_rttrim(struct bridge_softc *);
static void bridge_rtage(struct bridge_softc *);
static void bridge_rtflush(struct bridge_softc *, int);
static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *,
uint16_t);
static int bridge_rtable_init(struct bridge_softc *);
static void bridge_rtable_fini(struct bridge_softc *);
static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *);
static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *,
const uint8_t *, uint16_t);
static int bridge_rtnode_insert(struct bridge_softc *,
struct bridge_rtnode *);
static void bridge_rtnode_destroy(struct bridge_softc *,
struct bridge_rtnode *);
static void bridge_rtable_expire(struct ifnet *, int);
static void bridge_state_change(struct ifnet *, int);
static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *,
const char *name);
static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *,
struct ifnet *ifp);
static void bridge_delete_member(struct bridge_softc *,
struct bridge_iflist *, int);
static void bridge_delete_span(struct bridge_softc *,
struct bridge_iflist *);
static int bridge_ioctl_add(struct bridge_softc *, void *);
static int bridge_ioctl_del(struct bridge_softc *, void *);
static int bridge_ioctl_gifflags(struct bridge_softc *, void *);
static int bridge_ioctl_sifflags(struct bridge_softc *, void *);
static int bridge_ioctl_scache(struct bridge_softc *, void *);
static int bridge_ioctl_gcache(struct bridge_softc *, void *);
static int bridge_ioctl_gifs(struct bridge_softc *, void *);
static int bridge_ioctl_rts(struct bridge_softc *, void *);
static int bridge_ioctl_saddr(struct bridge_softc *, void *);
static int bridge_ioctl_sto(struct bridge_softc *, void *);
static int bridge_ioctl_gto(struct bridge_softc *, void *);
static int bridge_ioctl_daddr(struct bridge_softc *, void *);
static int bridge_ioctl_flush(struct bridge_softc *, void *);
static int bridge_ioctl_gpri(struct bridge_softc *, void *);
static int bridge_ioctl_spri(struct bridge_softc *, void *);
static int bridge_ioctl_ght(struct bridge_softc *, void *);
static int bridge_ioctl_sht(struct bridge_softc *, void *);
static int bridge_ioctl_gfd(struct bridge_softc *, void *);
static int bridge_ioctl_sfd(struct bridge_softc *, void *);
static int bridge_ioctl_gma(struct bridge_softc *, void *);
static int bridge_ioctl_sma(struct bridge_softc *, void *);
static int bridge_ioctl_sifprio(struct bridge_softc *, void *);
static int bridge_ioctl_sifcost(struct bridge_softc *, void *);
static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *);
static int bridge_ioctl_addspan(struct bridge_softc *, void *);
static int bridge_ioctl_delspan(struct bridge_softc *, void *);
static int bridge_ioctl_gbparam(struct bridge_softc *, void *);
static int bridge_ioctl_grte(struct bridge_softc *, void *);
static int bridge_ioctl_gifsstp(struct bridge_softc *, void *);
static int bridge_ioctl_sproto(struct bridge_softc *, void *);
static int bridge_ioctl_stxhc(struct bridge_softc *, void *);
static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *,
int);
static int bridge_ip_checkbasic(struct mbuf **mp);
#ifdef INET6
static int bridge_ip6_checkbasic(struct mbuf **mp);
#endif /* INET6 */
static int bridge_fragment(struct ifnet *, struct mbuf *,
struct ether_header *, int, struct llc *);
/* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */
#define VLANTAGOF(_m) \
(_m->m_flags & M_VLANTAG) ? EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1
static struct bstp_cb_ops bridge_ops = {
.bcb_state = bridge_state_change,
.bcb_rtage = bridge_rtable_expire
};
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge");
static int pfil_onlyip = 1; /* only pass IP[46] packets when pfil is enabled */
static int pfil_bridge = 1; /* run pfil hooks on the bridge interface */
static int pfil_member = 1; /* run pfil hooks on the member interface */
static int pfil_ipfw = 0; /* layer2 filter with ipfw */
static int pfil_ipfw_arp = 0; /* layer2 filter with ipfw */
static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for
locally destined packets */
static int log_stp = 0; /* log STP state changes */
static int bridge_inherit_mac = 0; /* share MAC with first bridge member */
SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW,
&pfil_onlyip, 0, "Only pass IP packets when pfil is enabled");
SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW,
&pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2");
SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW,
&pfil_bridge, 0, "Packet filter on the bridge interface");
SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW,
&pfil_member, 0, "Packet filter on the member interface");
SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW,
&pfil_local_phys, 0,
"Packet filter on the physical interface for locally destined packets");
SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RW,
&log_stp, 0, "Log STP state changes");
SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RW,
&bridge_inherit_mac, 0,
"Inherit MAC address from the first bridge member");
struct bridge_control {
int (*bc_func)(struct bridge_softc *, void *);
int bc_argsize;
int bc_flags;
};
#define BC_F_COPYIN 0x01 /* copy arguments in */
#define BC_F_COPYOUT 0x02 /* copy arguments out */
#define BC_F_SUSER 0x04 /* do super-user check */
const struct bridge_control bridge_control_table[] = {
{ bridge_ioctl_add, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_del, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gifflags, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_COPYOUT },
{ bridge_ioctl_sifflags, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_scache, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gcache, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_gifs, sizeof(struct ifbifconf),
BC_F_COPYIN|BC_F_COPYOUT },
{ bridge_ioctl_rts, sizeof(struct ifbaconf),
BC_F_COPYIN|BC_F_COPYOUT },
{ bridge_ioctl_saddr, sizeof(struct ifbareq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_sto, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gto, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_daddr, sizeof(struct ifbareq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_flush, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gpri, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_spri, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_ght, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_sht, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gfd, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_sfd, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gma, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_sma, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_sifprio, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_sifcost, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_addspan, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_delspan, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_gbparam, sizeof(struct ifbropreq),
BC_F_COPYOUT },
{ bridge_ioctl_grte, sizeof(struct ifbrparam),
BC_F_COPYOUT },
{ bridge_ioctl_gifsstp, sizeof(struct ifbpstpconf),
BC_F_COPYIN|BC_F_COPYOUT },
{ bridge_ioctl_sproto, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_stxhc, sizeof(struct ifbrparam),
BC_F_COPYIN|BC_F_SUSER },
{ bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq),
BC_F_COPYIN|BC_F_SUSER },
};
const int bridge_control_table_size =
sizeof(bridge_control_table) / sizeof(bridge_control_table[0]);
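/*
 * Illustrative note (assumption, not part of the driver): the BRDG*
 * command values defined in net/if_bridgevar.h are indices into
 * bridge_control_table, so the order of entries above must match those
 * definitions exactly; new commands can only be appended.
 */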
LIST_HEAD(, bridge_softc) bridge_list;
IFC_SIMPLE_DECLARE(bridge, 0);
static int
bridge_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
mtx_init(&bridge_list_mtx, "if_bridge list", NULL, MTX_DEF);
if_clone_attach(&bridge_cloner);
bridge_rtnode_zone = uma_zcreate("bridge_rtnode",
sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
LIST_INIT(&bridge_list);
bridge_input_p = bridge_input;
bridge_output_p = bridge_output;
bridge_dn_p = bridge_dummynet;
bridge_detach_cookie = EVENTHANDLER_REGISTER(
ifnet_departure_event, bridge_ifdetach, NULL,
EVENTHANDLER_PRI_ANY);
break;
case MOD_UNLOAD:
EVENTHANDLER_DEREGISTER(ifnet_departure_event,
bridge_detach_cookie);
if_clone_detach(&bridge_cloner);
uma_zdestroy(bridge_rtnode_zone);
bridge_input_p = NULL;
bridge_output_p = NULL;
bridge_dn_p = NULL;
mtx_destroy(&bridge_list_mtx);
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t bridge_mod = {
"if_bridge",
bridge_modevent,
0
};
DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1);
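/*
 * Illustrative note (assumption, not part of the driver): with the
 * module glue above, the driver can be loaded at runtime with
 * "kldload if_bridge" (or from loader.conf with if_bridge_load="YES");
 * the MODULE_DEPEND declaration pulls in bridgestp automatically.
 */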
/*
* handler for net.link.bridge.pfil_ipfw
*/
static int
sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS)
{
int enable = pfil_ipfw;
int error;
error = sysctl_handle_int(oidp, &enable, 0, req);
enable = (enable) ? 1 : 0;
if (enable != pfil_ipfw) {
pfil_ipfw = enable;
/*
* Disable pfil so that ipfw doesn't run twice; if the user
* really wants both, they can re-enable pfil_bridge and/or
* pfil_member. Also allow non-IP packets, as ipfw can filter by
* layer-2 type.
*/
if (pfil_ipfw) {
pfil_onlyip = 0;
pfil_bridge = 0;
pfil_member = 0;
}
}
return (error);
}
SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT|CTLFLAG_RW,
&pfil_ipfw, 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW");
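#if 0
/*
 * Illustrative userland sketch (assumption: not part of this file):
 * the handler above can be driven from a program with sysctlbyname(3);
 * note that enabling it also clears pfil_onlyip, pfil_bridge and
 * pfil_member as a side effect.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

static int
example_enable_l2_ipfw(void)
{
	int on = 1;

	/* Equivalent to "sysctl net.link.bridge.ipfw=1". */
	return (sysctlbyname("net.link.bridge.ipfw", NULL, NULL,
	    &on, sizeof(on)));
}
#endif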
/*
* bridge_clone_create:
*
* Create a new bridge instance.
*/
static int
bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct bridge_softc *sc, *sc2;
struct ifnet *bifp, *ifp;
int retry;
sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
free(sc, M_DEVBUF);
return (ENOSPC);
}
BRIDGE_LOCK_INIT(sc);
sc->sc_brtmax = BRIDGE_RTABLE_MAX;
sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT;
/* Initialize our routing table. */
bridge_rtable_init(sc);
callout_init_mtx(&sc->sc_brcallout, &sc->sc_mtx, 0);
LIST_INIT(&sc->sc_iflist);
LIST_INIT(&sc->sc_spanlist);
ifp->if_softc = sc;
if_initname(ifp, ifc->ifc_name, unit);
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_ioctl = bridge_ioctl;
ifp->if_start = bridge_start;
ifp->if_init = bridge_init;
ifp->if_type = IFT_BRIDGE;
IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
ifp->if_snd.ifq_drv_maxlen = ifqmaxlen;
IFQ_SET_READY(&ifp->if_snd);
/*
* Generate a random Ethernet address and mark it as locally
* administered.
*
* Since we are using random Ethernet addresses for the bridge,
* collisions are possible, so make sure this hardware address
* isn't already in use on another bridge.
*/
for (retry = 1; retry != 0;) {
arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1);
sc->sc_defaddr[0] &= ~1; /* clear multicast bit */
sc->sc_defaddr[0] |= 2; /* set the LAA bit */
retry = 0;
mtx_lock(&bridge_list_mtx);
LIST_FOREACH(sc2, &bridge_list, sc_list) {
bifp = sc2->sc_ifp;
if (memcmp(sc->sc_defaddr,
IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0)
retry = 1;
}
mtx_unlock(&bridge_list_mtx);
}
bstp_attach(&sc->sc_stp, &bridge_ops);
ether_ifattach(ifp, sc->sc_defaddr);
/* Now undo some of the damage... */
ifp->if_baudrate = 0;
ifp->if_type = IFT_BRIDGE;
mtx_lock(&bridge_list_mtx);
LIST_INSERT_HEAD(&bridge_list, sc, sc_list);
mtx_unlock(&bridge_list_mtx);
return (0);
}
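#if 0
/*
 * Illustrative userland sketch (assumption: not part of this file):
 * the cloner is what makes "ifconfig bridge0 create" work; the
 * equivalent programmatic form is a SIOCIFCREATE ioctl:
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
example_create_bridge(void)
{
	struct ifreq ifr;
	int error, s;

	if ((s = socket(AF_LOCAL, SOCK_DGRAM, 0)) < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "bridge0", sizeof(ifr.ifr_name));
	error = ioctl(s, SIOCIFCREATE, &ifr); /* invokes bridge_clone_create() */
	close(s);
	return (error);
}
#endif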
/*
* bridge_clone_destroy:
*
* Destroy a bridge instance.
*/
static void
bridge_clone_destroy(struct ifnet *ifp)
{
struct bridge_softc *sc = ifp->if_softc;
struct bridge_iflist *bif;
BRIDGE_LOCK(sc);
bridge_stop(ifp, 1);
ifp->if_flags &= ~IFF_UP;
while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL)
bridge_delete_member(sc, bif, 0);
while ((bif = LIST_FIRST(&sc->sc_spanlist)) != NULL) {
bridge_delete_span(sc, bif);
}
BRIDGE_UNLOCK(sc);
callout_drain(&sc->sc_brcallout);
mtx_lock(&bridge_list_mtx);
LIST_REMOVE(sc, sc_list);
mtx_unlock(&bridge_list_mtx);
bstp_detach(&sc->sc_stp);
ether_ifdetach(ifp);
if_free_type(ifp, IFT_ETHER);
/* Tear down the routing table. */
bridge_rtable_fini(sc);
BRIDGE_LOCK_DESTROY(sc);
free(sc, M_DEVBUF);
}
/*
* bridge_ioctl:
*
* Handle a control request from the operator.
*/
static int
bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct bridge_softc *sc = ifp->if_softc;
struct thread *td = curthread;
union {
struct ifbreq ifbreq;
struct ifbifconf ifbifconf;
struct ifbareq ifbareq;
struct ifbaconf ifbaconf;
struct ifbrparam ifbrparam;
struct ifbropreq ifbropreq;
} args;
struct ifdrv *ifd = (struct ifdrv *) data;
const struct bridge_control *bc;
int error = 0;
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
case SIOCGDRVSPEC:
case SIOCSDRVSPEC:
if (ifd->ifd_cmd >= bridge_control_table_size) {
error = EINVAL;
break;
}
bc = &bridge_control_table[ifd->ifd_cmd];
if (cmd == SIOCGDRVSPEC &&
(bc->bc_flags & BC_F_COPYOUT) == 0) {
error = EINVAL;
break;
}
else if (cmd == SIOCSDRVSPEC &&
(bc->bc_flags & BC_F_COPYOUT) != 0) {
error = EINVAL;
break;
}
if (bc->bc_flags & BC_F_SUSER) {
error = priv_check(td, PRIV_NET_BRIDGE);
if (error)
break;
}
if (ifd->ifd_len != bc->bc_argsize ||
ifd->ifd_len > sizeof(args)) {
error = EINVAL;
break;
}
bzero(&args, sizeof(args));
if (bc->bc_flags & BC_F_COPYIN) {
error = copyin(ifd->ifd_data, &args, ifd->ifd_len);
if (error)
break;
}
BRIDGE_LOCK(sc);
error = (*bc->bc_func)(sc, &args);
BRIDGE_UNLOCK(sc);
if (error)
break;
if (bc->bc_flags & BC_F_COPYOUT)
error = copyout(&args, ifd->ifd_data, ifd->ifd_len);
break;
case SIOCSIFFLAGS:
if (!(ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
/*
* If interface is marked down and it is running,
* then stop and disable it.
*/
BRIDGE_LOCK(sc);
bridge_stop(ifp, 1);
BRIDGE_UNLOCK(sc);
} else if ((ifp->if_flags & IFF_UP) &&
!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
/*
* If interface is marked up and it is stopped, then
* start it.
*/
(*ifp->if_init)(sc);
}
break;
case SIOCSIFMTU:
/* Do not allow the MTU to be changed on the bridge */
error = EINVAL;
break;
default:
/*
* drop the lock as ether_ioctl() will call bridge_start() and
* cause the lock to be recursed.
*/
error = ether_ioctl(ifp, cmd, data);
break;
}
return (error);
}
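#if 0
/*
 * Illustrative userland sketch (assumption: not part of this file):
 * bridge commands arrive through SIOCSDRVSPEC/SIOCGDRVSPEC carrying a
 * struct ifdrv whose ifd_cmd indexes bridge_control_table.  This is
 * roughly how ifconfig(8) adds a member interface:
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <net/if_bridgevar.h>
#include <string.h>

static int
example_add_member(int s, const char *bridge, const char *member)
{
	struct ifbreq req;
	struct ifdrv ifd;

	memset(&req, 0, sizeof(req));
	strlcpy(req.ifbr_ifsname, member, sizeof(req.ifbr_ifsname));

	memset(&ifd, 0, sizeof(ifd));
	strlcpy(ifd.ifd_name, bridge, sizeof(ifd.ifd_name));
	ifd.ifd_cmd = BRDGADD;		/* index of bridge_ioctl_add */
	ifd.ifd_len = sizeof(req);
	ifd.ifd_data = &req;

	/* BC_F_COPYIN|BC_F_SUSER: copied in and privilege-checked above. */
	return (ioctl(s, SIOCSDRVSPEC, &ifd));
}
#endif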
/*
* bridge_mutecaps:
*
* Clear or restore unwanted capabilities on the member interface
*/
static void
bridge_mutecaps(struct bridge_softc *sc)
{
struct bridge_iflist *bif;
int enabled, mask;
/* Initial bitmask of capabilities to test */
mask = BRIDGE_IFCAPS_MASK;
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
/* Every member must support it or it is disabled */
mask &= bif->bif_savedcaps;
}
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
enabled = bif->bif_ifp->if_capenable;
/* strip off mask bits and enable them again if allowed */
enabled &= ~BRIDGE_IFCAPS_MASK;
enabled |= mask;
/*
* Receive offload can only be enabled if all members also
* support send offload.
*/
if ((enabled & IFCAP_TSO) == 0)
enabled &= ~IFCAP_LRO;
bridge_set_ifcap(sc, bif, enabled);
}
}
static void
bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set)
{
struct ifnet *ifp = bif->bif_ifp;
struct ifreq ifr;
int error;
bzero(&ifr, sizeof(ifr));
ifr.ifr_reqcap = set;
if (ifp->if_capenable != set) {
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr);
IFF_UNLOCKGIANT(ifp);
if (error)
if_printf(sc->sc_ifp,
"error setting interface capabilities on %s\n",
ifp->if_xname);
}
}
/*
* bridge_lookup_member:
*
* Lookup a bridge member interface.
*/
static struct bridge_iflist *
bridge_lookup_member(struct bridge_softc *sc, const char *name)
{
struct bridge_iflist *bif;
struct ifnet *ifp;
BRIDGE_LOCK_ASSERT(sc);
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
ifp = bif->bif_ifp;
if (strcmp(ifp->if_xname, name) == 0)
return (bif);
}
return (NULL);
}
/*
* bridge_lookup_member_if:
*
* Lookup a bridge member interface by ifnet*.
*/
static struct bridge_iflist *
bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
{
struct bridge_iflist *bif;
BRIDGE_LOCK_ASSERT(sc);
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
if (bif->bif_ifp == member_ifp)
return (bif);
}
return (NULL);
}
/*
* bridge_delete_member:
*
* Delete the specified member interface.
*/
static void
bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
int gone)
{
struct ifnet *ifs = bif->bif_ifp;
struct ifnet *fif = NULL;
BRIDGE_LOCK_ASSERT(sc);
if (!gone) {
switch (ifs->if_type) {
case IFT_ETHER:
case IFT_L2VLAN:
/*
* Take the interface out of promiscuous mode.
*/
(void) ifpromisc(ifs, 0);
break;
case IFT_GIF:
break;
default:
#ifdef DIAGNOSTIC
panic("bridge_delete_member: impossible");
#endif
break;
}
/* re-enable any interface capabilities */
bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
}
if (bif->bif_flags & IFBIF_STP)
bstp_disable(&bif->bif_stp);
ifs->if_bridge = NULL;
BRIDGE_XLOCK(sc);
LIST_REMOVE(bif, bif_next);
BRIDGE_XDROP(sc);
/*
* If removing the interface that gave the bridge its MAC address, set
* the MAC address of the bridge to the address of the next member, or
* to its default address if no members are left.
*/
if (bridge_inherit_mac &&
!memcmp(IF_LLADDR(sc->sc_ifp), IF_LLADDR(ifs), ETHER_ADDR_LEN)) {
if (LIST_EMPTY(&sc->sc_iflist))
bcopy(sc->sc_defaddr,
IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
else {
fif = LIST_FIRST(&sc->sc_iflist)->bif_ifp;
bcopy(IF_LLADDR(fif),
IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
}
}
bridge_mutecaps(sc); /* recalculate, now that this interface is removed */
bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
KASSERT(bif->bif_addrcnt == 0,
("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));
BRIDGE_UNLOCK(sc);
bstp_destroy(&bif->bif_stp); /* prepare to free */
BRIDGE_LOCK(sc);
free(bif, M_DEVBUF);
}
/*
* bridge_delete_span:
*
* Delete the specified span interface.
*/
static void
bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif)
{
BRIDGE_LOCK_ASSERT(sc);
KASSERT(bif->bif_ifp->if_bridge == NULL,
("%s: not a span interface", __func__));
LIST_REMOVE(bif, bif_next);
free(bif, M_DEVBUF);
}
static int
bridge_ioctl_add(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif = NULL;
struct ifnet *ifs;
int error = 0;
ifs = ifunit(req->ifbr_ifsname);
if (ifs == NULL)
return (ENOENT);
if (ifs->if_ioctl == NULL) /* must be supported */
return (EINVAL);
/* If it's in the span list, it can't be a member. */
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifs == bif->bif_ifp)
return (EBUSY);
/* Allow the first Ethernet member to define the MTU */
if (ifs->if_type != IFT_GIF) {
if (LIST_EMPTY(&sc->sc_iflist))
sc->sc_ifp->if_mtu = ifs->if_mtu;
else if (sc->sc_ifp->if_mtu != ifs->if_mtu) {
if_printf(sc->sc_ifp, "invalid MTU for %s\n",
ifs->if_xname);
return (EINVAL);
}
}
if (ifs->if_bridge == sc)
return (EEXIST);
if (ifs->if_bridge != NULL)
return (EBUSY);
bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
if (bif == NULL)
return (ENOMEM);
bif->bif_ifp = ifs;
bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
bif->bif_savedcaps = ifs->if_capenable;
switch (ifs->if_type) {
case IFT_ETHER:
case IFT_L2VLAN:
/*
* Place the interface into promiscuous mode.
*/
error = ifpromisc(ifs, 1);
if (error)
goto out;
break;
case IFT_GIF:
break;
default:
error = EINVAL;
goto out;
}
/*
* Assign the interface's MAC address to the bridge if it's the first
* member and the MAC address of the bridge has not been changed from
* the default randomly generated one.
*/
if (bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) &&
!memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN))
bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN);
ifs->if_bridge = sc;
bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp);
/*
* XXX: XLOCK HERE!?!
*
* NOTE: insert_***HEAD*** should be safe for the traversals.
*/
LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next);
/* Set interface capabilities to the intersection set of all members */
bridge_mutecaps(sc);
out:
if (error) {
if (bif != NULL)
free(bif, M_DEVBUF);
}
return (error);
}
static int
bridge_ioctl_del(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
return (ENOENT);
bridge_delete_member(sc, bif, 0);
return (0);
}
static int
bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
struct bstp_port *bp;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
return (ENOENT);
bp = &bif->bif_stp;
req->ifbr_ifsflags = bif->bif_flags;
req->ifbr_state = bp->bp_state;
req->ifbr_priority = bp->bp_priority;
req->ifbr_path_cost = bp->bp_path_cost;
req->ifbr_portno = bif->bif_ifp->if_index & 0xfff;
req->ifbr_proto = bp->bp_protover;
req->ifbr_role = bp->bp_role;
req->ifbr_stpflags = bp->bp_flags;
req->ifbr_addrcnt = bif->bif_addrcnt;
req->ifbr_addrmax = bif->bif_addrmax;
req->ifbr_addrexceeded = bif->bif_addrexceeded;
/* Copy STP state options as flags */
if (bp->bp_operedge)
req->ifbr_ifsflags |= IFBIF_BSTP_EDGE;
if (bp->bp_flags & BSTP_PORT_AUTOEDGE)
req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE;
if (bp->bp_ptp_link)
req->ifbr_ifsflags |= IFBIF_BSTP_PTP;
if (bp->bp_flags & BSTP_PORT_AUTOPTP)
req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP;
if (bp->bp_flags & BSTP_PORT_ADMEDGE)
req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE;
if (bp->bp_flags & BSTP_PORT_ADMCOST)
req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST;
return (0);
}
static int
bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
struct bstp_port *bp;
int error;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
return (ENOENT);
bp = &bif->bif_stp;
if (req->ifbr_ifsflags & IFBIF_SPAN)
/* SPAN is read-only */
return (EINVAL);
if (req->ifbr_ifsflags & IFBIF_STP) {
if ((bif->bif_flags & IFBIF_STP) == 0) {
error = bstp_enable(&bif->bif_stp);
if (error)
return (error);
}
} else {
if ((bif->bif_flags & IFBIF_STP) != 0)
bstp_disable(&bif->bif_stp);
}
/* Pass on STP flags */
bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0);
bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0);
bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0);
bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 1 : 0);
/* Save the bits relating to the bridge */
bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK;
return (0);
}
static int
bridge_ioctl_scache(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
sc->sc_brtmax = param->ifbrp_csize;
bridge_rttrim(sc);
return (0);
}
static int
bridge_ioctl_gcache(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
param->ifbrp_csize = sc->sc_brtmax;
return (0);
}
static int
bridge_ioctl_gifs(struct bridge_softc *sc, void *arg)
{
struct ifbifconf *bifc = arg;
struct bridge_iflist *bif;
struct ifbreq breq;
char *buf, *outbuf;
int count, buflen, len, error = 0;
count = 0;
LIST_FOREACH(bif, &sc->sc_iflist, bif_next)
count++;
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
count++;
buflen = sizeof(breq) * count;
if (bifc->ifbic_len == 0) {
bifc->ifbic_len = buflen;
return (0);
}
BRIDGE_UNLOCK(sc);
outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
BRIDGE_LOCK(sc);
count = 0;
buf = outbuf;
len = min(bifc->ifbic_len, buflen);
bzero(&breq, sizeof(breq));
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
if (len < sizeof(breq))
break;
strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname,
sizeof(breq.ifbr_ifsname));
/* Fill in the ifbreq structure */
error = bridge_ioctl_gifflags(sc, &breq);
if (error)
break;
memcpy(buf, &breq, sizeof(breq));
count++;
buf += sizeof(breq);
len -= sizeof(breq);
}
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
if (len < sizeof(breq))
break;
strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname,
sizeof(breq.ifbr_ifsname));
breq.ifbr_ifsflags = bif->bif_flags;
breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff;
memcpy(buf, &breq, sizeof(breq));
count++;
buf += sizeof(breq);
len -= sizeof(breq);
}
BRIDGE_UNLOCK(sc);
bifc->ifbic_len = sizeof(breq) * count;
error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len);
BRIDGE_LOCK(sc);
free(outbuf, M_TEMP);
return (error);
}
static int
bridge_ioctl_rts(struct bridge_softc *sc, void *arg)
{
struct ifbaconf *bac = arg;
struct bridge_rtnode *brt;
struct ifbareq bareq;
char *buf, *outbuf;
int count, buflen, len, error = 0;
if (bac->ifbac_len == 0)
return (0);
count = 0;
LIST_FOREACH(brt, &sc->sc_rtlist, brt_list)
count++;
buflen = sizeof(bareq) * count;
BRIDGE_UNLOCK(sc);
outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
BRIDGE_LOCK(sc);
count = 0;
buf = outbuf;
len = min(bac->ifbac_len, buflen);
bzero(&bareq, sizeof(bareq));
LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) {
if (len < sizeof(bareq))
goto out;
strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname,
sizeof(bareq.ifba_ifsname));
memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr));
bareq.ifba_vlan = brt->brt_vlan;
if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
time_uptime < brt->brt_expire)
bareq.ifba_expire = brt->brt_expire - time_uptime;
else
bareq.ifba_expire = 0;
bareq.ifba_flags = brt->brt_flags;
memcpy(buf, &bareq, sizeof(bareq));
count++;
buf += sizeof(bareq);
len -= sizeof(bareq);
}
out:
BRIDGE_UNLOCK(sc);
bac->ifbac_len = sizeof(bareq) * count;
error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len);
BRIDGE_LOCK(sc);
free(outbuf, M_TEMP);
return (error);
}
static int
bridge_ioctl_saddr(struct bridge_softc *sc, void *arg)
{
struct ifbareq *req = arg;
struct bridge_iflist *bif;
int error;
bif = bridge_lookup_member(sc, req->ifba_ifsname);
if (bif == NULL)
return (ENOENT);
error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1,
req->ifba_flags);
return (error);
}
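#if 0
/*
 * Illustrative userland sketch (assumption: not part of this file):
 * installing a static forwarding entry through the BRDGSADDR command
 * handled above, much as "ifconfig bridge0 static em0 <addr>" would:
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <net/if_bridgevar.h>
#include <net/ethernet.h>
#include <stdint.h>
#include <string.h>

static int
example_add_static(int s, const uint8_t mac[ETHER_ADDR_LEN])
{
	struct ifbareq req;
	struct ifdrv ifd;

	memset(&req, 0, sizeof(req));
	strlcpy(req.ifba_ifsname, "em0", sizeof(req.ifba_ifsname));
	memcpy(req.ifba_dst, mac, ETHER_ADDR_LEN);
	req.ifba_flags = IFBAF_STATIC;	/* never aged out */

	memset(&ifd, 0, sizeof(ifd));
	strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name));
	ifd.ifd_cmd = BRDGSADDR;
	ifd.ifd_len = sizeof(req);
	ifd.ifd_data = &req;
	return (ioctl(s, SIOCSDRVSPEC, &ifd));
}
#endif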
static int
bridge_ioctl_sto(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
sc->sc_brttimeout = param->ifbrp_ctime;
return (0);
}
static int
bridge_ioctl_gto(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
param->ifbrp_ctime = sc->sc_brttimeout;
return (0);
}
static int
bridge_ioctl_daddr(struct bridge_softc *sc, void *arg)
{
struct ifbareq *req = arg;
return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan));
}
static int
bridge_ioctl_flush(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
bridge_rtflush(sc, req->ifbr_ifsflags);
return (0);
}
static int
bridge_ioctl_gpri(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
struct bstp_state *bs = &sc->sc_stp;
param->ifbrp_prio = bs->bs_bridge_priority;
return (0);
}
static int
bridge_ioctl_spri(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio));
}
static int
bridge_ioctl_ght(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
struct bstp_state *bs = &sc->sc_stp;
param->ifbrp_hellotime = bs->bs_bridge_htime >> 8;
return (0);
}
static int
bridge_ioctl_sht(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime));
}
static int
bridge_ioctl_gfd(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
struct bstp_state *bs = &sc->sc_stp;
param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8;
return (0);
}
static int
bridge_ioctl_sfd(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay));
}
static int
bridge_ioctl_gma(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
struct bstp_state *bs = &sc->sc_stp;
param->ifbrp_maxage = bs->bs_bridge_max_age >> 8;
return (0);
}
static int
bridge_ioctl_sma(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage));
}
static int
bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
return (ENOENT);
return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority));
}
static int
bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
return (ENOENT);
return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost));
}
static int
bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
bif = bridge_lookup_member(sc, req->ifbr_ifsname);
if (bif == NULL)
return (ENOENT);
bif->bif_addrmax = req->ifbr_addrmax;
return (0);
}
static int
bridge_ioctl_addspan(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif = NULL;
struct ifnet *ifs;
ifs = ifunit(req->ifbr_ifsname);
if (ifs == NULL)
return (ENOENT);
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifs == bif->bif_ifp)
return (EBUSY);
if (ifs->if_bridge != NULL)
return (EBUSY);
switch (ifs->if_type) {
case IFT_ETHER:
case IFT_GIF:
case IFT_L2VLAN:
break;
default:
return (EINVAL);
}
bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO);
if (bif == NULL)
return (ENOMEM);
bif->bif_ifp = ifs;
bif->bif_flags = IFBIF_SPAN;
LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next);
return (0);
}
static int
bridge_ioctl_delspan(struct bridge_softc *sc, void *arg)
{
struct ifbreq *req = arg;
struct bridge_iflist *bif;
struct ifnet *ifs;
ifs = ifunit(req->ifbr_ifsname);
if (ifs == NULL)
return (ENOENT);
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifs == bif->bif_ifp)
break;
if (bif == NULL)
return (ENOENT);
bridge_delete_span(sc, bif);
return (0);
}
static int
bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg)
{
struct ifbropreq *req = arg;
struct bstp_state *bs = &sc->sc_stp;
struct bstp_port *root_port;
req->ifbop_maxage = bs->bs_bridge_max_age >> 8;
req->ifbop_hellotime = bs->bs_bridge_htime >> 8;
req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8;
root_port = bs->bs_root_port;
if (root_port == NULL)
req->ifbop_root_port = 0;
else
req->ifbop_root_port = root_port->bp_ifp->if_index;
req->ifbop_holdcount = bs->bs_txholdcount;
req->ifbop_priority = bs->bs_bridge_priority;
req->ifbop_protocol = bs->bs_protover;
req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost;
req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id;
req->ifbop_designated_root = bs->bs_root_pv.pv_root_id;
req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id;
req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec;
req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec;
return (0);
}
static int
bridge_ioctl_grte(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
param->ifbrp_cexceeded = sc->sc_brtexceeded;
return (0);
}
static int
bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg)
{
struct ifbpstpconf *bifstp = arg;
struct bridge_iflist *bif;
struct bstp_port *bp;
struct ifbpstpreq bpreq;
char *buf, *outbuf;
int count, buflen, len, error = 0;
count = 0;
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
if ((bif->bif_flags & IFBIF_STP) != 0)
count++;
}
buflen = sizeof(bpreq) * count;
if (bifstp->ifbpstp_len == 0) {
bifstp->ifbpstp_len = buflen;
return (0);
}
BRIDGE_UNLOCK(sc);
outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
BRIDGE_LOCK(sc);
count = 0;
buf = outbuf;
len = min(bifstp->ifbpstp_len, buflen);
bzero(&bpreq, sizeof(bpreq));
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
if (len < sizeof(bpreq))
break;
if ((bif->bif_flags & IFBIF_STP) == 0)
continue;
bp = &bif->bif_stp;
bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff;
bpreq.ifbp_fwd_trans = bp->bp_forward_transitions;
bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost;
bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id;
bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id;
bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id;
memcpy(buf, &bpreq, sizeof(bpreq));
count++;
buf += sizeof(bpreq);
len -= sizeof(bpreq);
}
BRIDGE_UNLOCK(sc);
bifstp->ifbpstp_len = sizeof(bpreq) * count;
error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len);
BRIDGE_LOCK(sc);
free(outbuf, M_TEMP);
return (error);
}
static int
bridge_ioctl_sproto(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto));
}
static int
bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg)
{
struct ifbrparam *param = arg;
return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc));
}
/*
* bridge_ifdetach:
*
* Detach an interface from a bridge. Called when a member
* interface is detaching.
*/
static void
bridge_ifdetach(void *arg __unused, struct ifnet *ifp)
{
struct bridge_softc *sc = ifp->if_bridge;
struct bridge_iflist *bif;
/* Check if the interface is a bridge member */
if (sc != NULL) {
BRIDGE_LOCK(sc);
bif = bridge_lookup_member_if(sc, ifp);
if (bif != NULL)
bridge_delete_member(sc, bif, 1);
BRIDGE_UNLOCK(sc);
return;
}
/* Check if the interface is a span port */
mtx_lock(&bridge_list_mtx);
LIST_FOREACH(sc, &bridge_list, sc_list) {
BRIDGE_LOCK(sc);
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next)
if (ifp == bif->bif_ifp) {
bridge_delete_span(sc, bif);
break;
}
BRIDGE_UNLOCK(sc);
}
mtx_unlock(&bridge_list_mtx);
}
/*
* bridge_init:
*
* Initialize a bridge interface.
*/
static void
bridge_init(void *xsc)
{
struct bridge_softc *sc = (struct bridge_softc *)xsc;
struct ifnet *ifp = sc->sc_ifp;
if (ifp->if_drv_flags & IFF_DRV_RUNNING)
return;
BRIDGE_LOCK(sc);
callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz,
bridge_timer, sc);
ifp->if_drv_flags |= IFF_DRV_RUNNING;
bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */
BRIDGE_UNLOCK(sc);
}
/*
* bridge_stop:
*
* Stop the bridge interface.
*/
static void
bridge_stop(struct ifnet *ifp, int disable)
{
struct bridge_softc *sc = ifp->if_softc;
BRIDGE_LOCK_ASSERT(sc);
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
return;
callout_stop(&sc->sc_brcallout);
bstp_stop(&sc->sc_stp);
bridge_rtflush(sc, IFBF_FLUSHDYN);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}
/*
* bridge_enqueue:
*
* Enqueue a packet on a bridge member interface.
*
*/
static void
bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
{
int len, err = 0;
short mflags;
struct mbuf *m0;
len = m->m_pkthdr.len;
mflags = m->m_flags;
/* We may be sending a fragment so traverse the mbuf packet chain */
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
/*
* If the underlying interface can not do VLAN tag insertion
* itself then prepend an 802.1Q header to the frame here.
*/
if ((m->m_flags & M_VLANTAG) &&
(dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) {
m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
if (m == NULL) {
if_printf(dst_ifp,
"unable to prepend VLAN header\n");
dst_ifp->if_oerrors++;
continue;
}
m->m_flags &= ~M_VLANTAG;
}
if (err == 0)
IFQ_ENQUEUE(&dst_ifp->if_snd, m, err);
}
if (err == 0) {
sc->sc_ifp->if_opackets++;
sc->sc_ifp->if_obytes += len;
dst_ifp->if_obytes += len;
if (mflags & M_MCAST) {
sc->sc_ifp->if_omcasts++;
dst_ifp->if_omcasts++;
}
}
if ((dst_ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
(*dst_ifp->if_start)(dst_ifp);
}
/*
* bridge_dummynet:
*
* Receive a queued packet from dummynet and pass it on to the output
* interface.
*
* The mbuf has the Ethernet header already attached.
*/
static void
bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
{
struct bridge_softc *sc;
sc = ifp->if_bridge;
/*
* The packet didn't originate from a member interface. This should only
* ever happen if a member interface is removed while packets are
* queued for it.
*/
if (sc == NULL) {
m_freem(m);
return;
}
if (PFIL_HOOKED(&inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&inet6_pfil_hook)
#endif
) {
if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0)
return;
if (m == NULL)
return;
}
bridge_enqueue(sc, ifp, m);
}
/*
* bridge_output:
*
* Send output from a bridge member interface. This
* performs the bridging function for locally originated
* packets.
*
* The mbuf has the Ethernet header already attached. We must
* enqueue or free the mbuf before returning.
*/
static int
bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
struct rtentry *rt)
{
struct ether_header *eh;
struct ifnet *dst_if;
struct bridge_softc *sc;
uint16_t vlan;
if (m->m_len < ETHER_HDR_LEN) {
m = m_pullup(m, ETHER_HDR_LEN);
if (m == NULL)
return (0);
}
eh = mtod(m, struct ether_header *);
sc = ifp->if_bridge;
vlan = VLANTAGOF(m);
BRIDGE_LOCK(sc);
/*
* If bridge is down, but the original output interface is up,
* go ahead and send out that interface. Otherwise, the packet
* is dropped below.
*/
if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
dst_if = ifp;
goto sendunicast;
}
/*
* If the packet is a multicast, or we don't know a better way to
* get there, send to all interfaces.
*/
if (ETHER_IS_MULTICAST(eh->ether_dhost))
dst_if = NULL;
else
dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
if (dst_if == NULL) {
struct bridge_iflist *bif;
struct mbuf *mc;
int error = 0, used = 0;
bridge_span(sc, m);
BRIDGE_LOCK2REF(sc, error);
if (error) {
m_freem(m);
return (0);
}
LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
dst_if = bif->bif_ifp;
if (dst_if->if_type == IFT_GIF)
continue;
if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
continue;
/*
* If this is not the original output interface,
* and the interface is participating in spanning
* tree, make sure the port is in a state that
* allows forwarding.
*/
if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) &&
bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
continue;
if (LIST_NEXT(bif, bif_next) == NULL) {
used = 1;
mc = m;
} else {
mc = m_copypacket(m, M_DONTWAIT);
if (mc == NULL) {
sc->sc_ifp->if_oerrors++;
continue;
}
}
bridge_enqueue(sc, dst_if, mc);
}
if (used == 0)
m_freem(m);
BRIDGE_UNREF(sc);
return (0);
}
sendunicast:
/*
* XXX Spanning tree consideration here?
*/
bridge_span(sc, m);
if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) {
m_freem(m);
BRIDGE_UNLOCK(sc);
return (0);
}
BRIDGE_UNLOCK(sc);
bridge_enqueue(sc, dst_if, m);
return (0);
}
/*
* bridge_start:
*
* Start output on a bridge.
*
*/
static void
bridge_start(struct ifnet *ifp)
{
struct bridge_softc *sc;
struct mbuf *m;
struct ether_header *eh;
struct ifnet *dst_if;
sc = ifp->if_softc;
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
for (;;) {
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
break;
ETHER_BPF_MTAP(ifp, m);
eh = mtod(m, struct ether_header *);
dst_if = NULL;
BRIDGE_LOCK(sc);
if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1);
}
if (dst_if == NULL)
bridge_broadcast(sc, ifp, m, 0);
else {
BRIDGE_UNLOCK(sc);
bridge_enqueue(sc, dst_if, m);
}
}
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
}
/*
* bridge_forward:
*
* The forwarding function of the bridge.
*
* NOTE: Releases the lock on return.
*/
static void
bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif,
struct mbuf *m)
{
struct bridge_iflist *dbif;
struct ifnet *src_if, *dst_if, *ifp;
struct ether_header *eh;
uint16_t vlan;
uint8_t *dst;
int error;
src_if = m->m_pkthdr.rcvif;
ifp = sc->sc_ifp;
ifp->if_ipackets++;
ifp->if_ibytes += m->m_pkthdr.len;
vlan = VLANTAGOF(m);
if ((sbif->bif_flags & IFBIF_STP) &&
sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
goto drop;
eh = mtod(m, struct ether_header *);
dst = eh->ether_dhost;
/* If the interface is learning, record the address. */
if (sbif->bif_flags & IFBIF_LEARNING) {
error = bridge_rtupdate(sc, eh->ether_shost, vlan,
sbif, 0, IFBAF_DYNAMIC);
/*
* If the interface has an address limit then deny any source
* that is not in the cache.
*/
if (error && sbif->bif_addrmax)
goto drop;
}
if ((sbif->bif_flags & IFBIF_STP) != 0 &&
sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING)
goto drop;
/*
* At this point, the port either doesn't participate
* in spanning tree or it is in the forwarding state.
*/
/*
* If the packet is unicast, destined for someone on
* "this" side of the bridge, drop it.
*/
if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
dst_if = bridge_rtlookup(sc, dst, vlan);
if (src_if == dst_if)
goto drop;
} else {
/*
* Check if it's a reserved multicast address; any address
* listed in 802.1D section 7.12.6 may not be forwarded by the
* bridge.
* This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F
*/
if (dst[0] == 0x01 && dst[1] == 0x80 &&
dst[2] == 0xc2 && dst[3] == 0x00 &&
dst[4] == 0x00 && dst[5] <= 0x0f)
goto drop;
/* ...forward it to all interfaces. */
ifp->if_imcasts++;
dst_if = NULL;
}
/*
* If we have a destination interface which is a member of our bridge,
* OR this is a unicast packet, push it through the bpf(4) machinery.
* For broadcast or multicast packets, don't bother because it will
* be reinjected into ether_input. We do this before we pass the packets
* through the pfil(9) framework, as it is possible that pfil(9) will
* drop the packet, or possibly modify it, making it difficult to debug
* firewall issues on the bridge.
*/
if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0)
ETHER_BPF_MTAP(ifp, m);
/* run the packet filter */
if (PFIL_HOOKED(&inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&inet6_pfil_hook)
#endif
) {
BRIDGE_UNLOCK(sc);
if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0)
return;
if (m == NULL)
return;
BRIDGE_LOCK(sc);
}
if (dst_if == NULL) {
bridge_broadcast(sc, src_if, m, 1);
return;
}
/*
* At this point, we're dealing with a unicast frame
* going to a different interface.
*/
if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
goto drop;
dbif = bridge_lookup_member_if(sc, dst_if);
if (dbif == NULL)
/* Not a member of the bridge (anymore?) */
goto drop;
/* Private segments can not talk to each other */
if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)
goto drop;
if ((dbif->bif_flags & IFBIF_STP) &&
dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
goto drop;
BRIDGE_UNLOCK(sc);
if (PFIL_HOOKED(&inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&inet6_pfil_hook)
#endif
) {
if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
return;
if (m == NULL)
return;
}
bridge_enqueue(sc, dst_if, m);
return;
drop:
BRIDGE_UNLOCK(sc);
m_freem(m);
}
/*
* bridge_input:
*
* Receive input from a member interface. Queue the packet for
* bridging if it is not for us.
*/
static struct mbuf *
bridge_input(struct ifnet *ifp, struct mbuf *m)
{
struct bridge_softc *sc = ifp->if_bridge;
struct bridge_iflist *bif, *bif2;
struct ifnet *bifp;
struct ether_header *eh;
struct mbuf *mc, *mc2;
uint16_t vlan;
int error;
if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
return (m);
bifp = sc->sc_ifp;
vlan = VLANTAGOF(m);
/*
* Implement support for bridge monitoring. If this flag has been
* set on this interface, discard the packet once we push it through
* the bpf(4) machinery, but before we do, increment the byte and
* packet counters associated with this interface.
*/
if ((bifp->if_flags & IFF_MONITOR) != 0) {
m->m_pkthdr.rcvif = bifp;
ETHER_BPF_MTAP(bifp, m);
bifp->if_ipackets++;
bifp->if_ibytes += m->m_pkthdr.len;
m_freem(m);
return (NULL);
}
BRIDGE_LOCK(sc);
bif = bridge_lookup_member_if(sc, ifp);
if (bif == NULL) {
BRIDGE_UNLOCK(sc);
return (m);
}
eh = mtod(m, struct ether_header *);
bridge_span(sc, m);
if (m->m_flags & (M_BCAST|M_MCAST)) {
/* Tap off 802.1D packets; they do not get forwarded. */
if (memcmp(eh->ether_dhost, bstp_etheraddr,
ETHER_ADDR_LEN) == 0) {
m = bstp_input(&bif->bif_stp, ifp, m);
if (m == NULL) {
BRIDGE_UNLOCK(sc);
return (NULL);
}
}
if ((bif->bif_flags & IFBIF_STP) &&
bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
BRIDGE_UNLOCK(sc);
return (m);
}
/*
* Make a deep copy of the packet and enqueue the copy
* for bridge processing; return the original packet for
* local processing.
*/
mc = m_dup(m, M_DONTWAIT);
if (mc == NULL) {
BRIDGE_UNLOCK(sc);
return (m);
}
/* Perform the bridge forwarding function with the copy. */
bridge_forward(sc, bif, mc);
/*
* Reinject the mbuf as arriving on the bridge so we have a
* chance at claiming multicast packets. We can not loop back
* here from ether_input as a bridge is never a member of a
* bridge.
*/
KASSERT(bifp->if_bridge == NULL,
("loop created in bridge_input"));
mc2 = m_dup(m, M_DONTWAIT);
if (mc2 != NULL) {
/* Keep the layer3 header aligned */
int i = min(mc2->m_pkthdr.len, max_protohdr);
mc2 = m_copyup(mc2, i, ETHER_ALIGN);
}
if (mc2 != NULL) {
mc2->m_pkthdr.rcvif = bifp;
(*bifp->if_input)(bifp, mc2);
}
/* Return the original packet for local processing. */
return (m);
}
if ((bif->bif_flags & IFBIF_STP) &&
bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
BRIDGE_UNLOCK(sc);
return (m);
}
#ifdef DEV_CARP
# define OR_CARP_CHECK_WE_ARE_DST(iface) \
|| ((iface)->if_carp \
&& carp_forus((iface)->if_carp, eh->ether_dhost))
# define OR_CARP_CHECK_WE_ARE_SRC(iface) \
|| ((iface)->if_carp \
&& carp_forus((iface)->if_carp, eh->ether_shost))
#else
# define OR_CARP_CHECK_WE_ARE_DST(iface)
# define OR_CARP_CHECK_WE_ARE_SRC(iface)
#endif
#ifdef INET6
# define OR_PFIL_HOOKED_INET6 \
|| PFIL_HOOKED(&inet6_pfil_hook)
#else
# define OR_PFIL_HOOKED_INET6
#endif
#define GRAB_OUR_PACKETS(iface) \
if ((iface)->if_type == IFT_GIF) \
continue; \
/* It is destined for us. */ \
if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \
OR_CARP_CHECK_WE_ARE_DST((iface)) \
) { \
if ((iface)->if_type == IFT_BRIDGE) { \
ETHER_BPF_MTAP(iface, m); \
iface->if_ipackets++; \
/* Filter on the physical interface. */ \
if (pfil_local_phys && \
(PFIL_HOOKED(&inet_pfil_hook) \
OR_PFIL_HOOKED_INET6)) { \
if (bridge_pfil(&m, NULL, ifp, \
PFIL_IN) != 0 || m == NULL) { \
BRIDGE_UNLOCK(sc); \
return (NULL); \
} \
} \
} \
if (bif->bif_flags & IFBIF_LEARNING) { \
error = bridge_rtupdate(sc, eh->ether_shost, \
vlan, bif, 0, IFBAF_DYNAMIC); \
if (error && bif->bif_addrmax) { \
BRIDGE_UNLOCK(sc); \
m_freem(m); \
return (NULL); \
} \
} \
m->m_pkthdr.rcvif = iface; \
BRIDGE_UNLOCK(sc); \
return (m); \
} \
\
/* We just received a packet that we sent out. */ \
if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \
OR_CARP_CHECK_WE_ARE_SRC((iface)) \
) { \
BRIDGE_UNLOCK(sc); \
m_freem(m); \
return (NULL); \
}
/*
* Unicast. Make sure it's not for the bridge.
*/
do { GRAB_OUR_PACKETS(bifp) } while (0);
/*
* Check ifp first, giving it priority. This helps when the packet
* comes through an interface, such as a vlan(4), that shares a MAC
* with other interfaces on the same bridge. It also saves some CPU
* cycles when the destination interface and the input interface
* (i.e. ifp) are the same.
*/
do { GRAB_OUR_PACKETS(ifp) } while (0);
/* Now check all the bridge members. */
LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) {
GRAB_OUR_PACKETS(bif2->bif_ifp)
}
#undef OR_CARP_CHECK_WE_ARE_DST
#undef OR_CARP_CHECK_WE_ARE_SRC
#undef OR_PFIL_HOOKED_INET6
#undef GRAB_OUR_PACKETS
/* Perform the bridge forwarding function. */
bridge_forward(sc, bif, m);
return (NULL);
}
/*
* bridge_broadcast:
*
* Send a frame to all interfaces that are members of
* the bridge, except for the one on which the packet
* arrived.
*
* NOTE: Releases the lock on return.
*/
static void
bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
struct mbuf *m, int runfilt)
{
struct bridge_iflist *dbif, *sbif;
struct mbuf *mc;
struct ifnet *dst_if;
int error = 0, used = 0, i;
sbif = bridge_lookup_member_if(sc, src_if);
BRIDGE_LOCK2REF(sc, error);
if (error) {
m_freem(m);
return;
}
/* Filter on the bridge interface before broadcasting */
if (runfilt && (PFIL_HOOKED(&inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&inet6_pfil_hook)
#endif
)) {
if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0)
goto out;
if (m == NULL)
goto out;
}
LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) {
dst_if = dbif->bif_ifp;
if (dst_if == src_if)
continue;
/* Private segments can not talk to each other */
if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE))
continue;
if ((dbif->bif_flags & IFBIF_STP) &&
dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
continue;
if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 &&
(m->m_flags & (M_BCAST|M_MCAST)) == 0)
continue;
if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
continue;
if (LIST_NEXT(dbif, bif_next) == NULL) {
mc = m;
used = 1;
} else {
mc = m_dup(m, M_DONTWAIT);
if (mc == NULL) {
sc->sc_ifp->if_oerrors++;
continue;
}
}
/*
* Filter on the output interface. Pass a NULL bridge interface
* pointer so we do not redundantly filter on the bridge for
* each interface we broadcast on.
*/
if (runfilt && (PFIL_HOOKED(&inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&inet6_pfil_hook)
#endif
)) {
if (used == 0) {
/* Keep the layer3 header aligned */
i = min(mc->m_pkthdr.len, max_protohdr);
mc = m_copyup(mc, i, ETHER_ALIGN);
if (mc == NULL) {
sc->sc_ifp->if_oerrors++;
continue;
}
}
if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0)
continue;
if (mc == NULL)
continue;
}
bridge_enqueue(sc, dst_if, mc);
}
if (used == 0)
m_freem(m);
out:
BRIDGE_UNREF(sc);
}
/*
* bridge_span:
*
* Duplicate a packet out one or more interfaces that are in span mode;
* the original mbuf is unmodified.
*/
static void
bridge_span(struct bridge_softc *sc, struct mbuf *m)
{
struct bridge_iflist *bif;
struct ifnet *dst_if;
struct mbuf *mc;
if (LIST_EMPTY(&sc->sc_spanlist))
return;
LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) {
dst_if = bif->bif_ifp;
if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0)
continue;
mc = m_copypacket(m, M_DONTWAIT);
if (mc == NULL) {
sc->sc_ifp->if_oerrors++;
continue;
}
bridge_enqueue(sc, dst_if, mc);
}
}
/*
* bridge_rtupdate:
*
* Add a bridge routing entry.
*/
static int
bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan,
struct bridge_iflist *bif, int setflags, uint8_t flags)
{
struct bridge_rtnode *brt;
int error;
BRIDGE_LOCK_ASSERT(sc);
/* Check that the source address is valid and not multicast. */
if (ETHER_IS_MULTICAST(dst) ||
(dst[0] == 0 && dst[1] == 0 && dst[2] == 0 &&
dst[3] == 0 && dst[4] == 0 && dst[5] == 0))
return (EINVAL);
/* 802.1p frames map to vlan 1 */
if (vlan == 0)
vlan = 1;
/*
* A route for this destination might already exist. If so,
* update it, otherwise create a new one.
*/
if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) {
if (sc->sc_brtcnt >= sc->sc_brtmax) {
sc->sc_brtexceeded++;
return (ENOSPC);
}
/* Check per interface address limits (if enabled) */
if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) {
bif->bif_addrexceeded++;
return (ENOSPC);
}
/*
* Allocate a new bridge forwarding node, and
* initialize the expiration time and Ethernet
* address.
*/
brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO);
if (brt == NULL)
return (ENOMEM);
if (bif->bif_flags & IFBIF_STICKY)
brt->brt_flags = IFBAF_STICKY;
else
brt->brt_flags = IFBAF_DYNAMIC;
memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
brt->brt_vlan = vlan;
if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
uma_zfree(bridge_rtnode_zone, brt);
return (error);
}
brt->brt_dst = bif;
bif->bif_addrcnt++;
}
if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC &&
brt->brt_dst != bif) {
brt->brt_dst->bif_addrcnt--;
brt->brt_dst = bif;
brt->brt_dst->bif_addrcnt++;
}
if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
brt->brt_expire = time_uptime + sc->sc_brttimeout;
if (setflags)
brt->brt_flags = flags;
return (0);
}
/*
* bridge_rtlookup:
*
* Lookup the destination interface for an address.
*/
static struct ifnet *
bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
{
struct bridge_rtnode *brt;
BRIDGE_LOCK_ASSERT(sc);
if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL)
return (NULL);
return (brt->brt_ifp);
}
/*
* bridge_rttrim:
*
* Trim the routing table so that the number of
* entries does not exceed the configured
* maximum.
*/
static void
bridge_rttrim(struct bridge_softc *sc)
{
struct bridge_rtnode *brt, *nbrt;
BRIDGE_LOCK_ASSERT(sc);
/* Make sure we actually need to do this. */
if (sc->sc_brtcnt <= sc->sc_brtmax)
return;
/* Force an aging cycle; this might trim enough addresses. */
bridge_rtage(sc);
if (sc->sc_brtcnt <= sc->sc_brtmax)
return;
LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
bridge_rtnode_destroy(sc, brt);
if (sc->sc_brtcnt <= sc->sc_brtmax)
return;
}
}
}
/*
* bridge_timer:
*
* Aging timer for the bridge.
*/
static void
bridge_timer(void *arg)
{
struct bridge_softc *sc = arg;
BRIDGE_LOCK_ASSERT(sc);
bridge_rtage(sc);
if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING)
callout_reset(&sc->sc_brcallout,
bridge_rtable_prune_period * hz, bridge_timer, sc);
}
/*
* bridge_rtage:
*
* Perform an aging cycle.
*/
static void
bridge_rtage(struct bridge_softc *sc)
{
struct bridge_rtnode *brt, *nbrt;
BRIDGE_LOCK_ASSERT(sc);
LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
if (time_uptime >= brt->brt_expire)
bridge_rtnode_destroy(sc, brt);
}
}
}
/*
* bridge_rtflush:
*
* Remove all dynamic addresses from the bridge.
*/
static void
bridge_rtflush(struct bridge_softc *sc, int full)
{
struct bridge_rtnode *brt, *nbrt;
BRIDGE_LOCK_ASSERT(sc);
LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
bridge_rtnode_destroy(sc, brt);
}
}
/*
* bridge_rtdaddr:
*
* Remove an address from the table.
*/
static int
bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
{
struct bridge_rtnode *brt;
int found = 0;
BRIDGE_LOCK_ASSERT(sc);
/*
* If vlan is zero then we want to delete the entry for all vlans,
* so the lookup may return more than one match.
*/
while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) {
bridge_rtnode_destroy(sc, brt);
found = 1;
}
return (found ? 0 : ENOENT);
}
/*
* bridge_rtdelete:
*
* Delete routes to a specific member interface.
*/
static void
bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
{
struct bridge_rtnode *brt, *nbrt;
BRIDGE_LOCK_ASSERT(sc);
LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
if (brt->brt_ifp == ifp && (full ||
(brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
bridge_rtnode_destroy(sc, brt);
}
}
/*
* bridge_rtable_init:
*
* Initialize the route table for this bridge.
*/
static int
bridge_rtable_init(struct bridge_softc *sc)
{
int i;
sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
M_DEVBUF, M_NOWAIT);
if (sc->sc_rthash == NULL)
return (ENOMEM);
for (i = 0; i < BRIDGE_RTHASH_SIZE; i++)
LIST_INIT(&sc->sc_rthash[i]);
sc->sc_rthash_key = arc4random();
LIST_INIT(&sc->sc_rtlist);
return (0);
}
/*
* bridge_rtable_fini:
*
* Deconstruct the route table for this bridge.
*/
static void
bridge_rtable_fini(struct bridge_softc *sc)
{
KASSERT(sc->sc_brtcnt == 0,
("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt));
free(sc->sc_rthash, M_DEVBUF);
}
/*
* The following hash function is adapted from "Hash Functions" by Bob Jenkins
* ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
*/
#define mix(a, b, c) \
do { \
a -= b; a -= c; a ^= (c >> 13); \
b -= c; b -= a; b ^= (a << 8); \
c -= a; c -= b; c ^= (b >> 13); \
a -= b; a -= c; a ^= (c >> 12); \
b -= c; b -= a; b ^= (a << 16); \
c -= a; c -= b; c ^= (b >> 5); \
a -= b; a -= c; a ^= (c >> 3); \
b -= c; b -= a; b ^= (a << 10); \
c -= a; c -= b; c ^= (b >> 15); \
} while (/*CONSTCOND*/0)
static __inline uint32_t
bridge_rthash(struct bridge_softc *sc, const uint8_t *addr)
{
uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key;
b += addr[5] << 8;
b += addr[4];
a += addr[3] << 24;
a += addr[2] << 16;
a += addr[1] << 8;
a += addr[0];
mix(a, b, c);
return (c & BRIDGE_RTHASH_MASK);
}
#undef mix
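/*
 * [Illustrative sketch, not part of this revision.]  A minimal userspace
 * rendition of the mix()/bridge_rthash() scheme above, folding a 6-byte
 * MAC address into a bucket index.  EX_RTHASH_SIZE and the key passed in
 * main() are assumed stand-ins for BRIDGE_RTHASH_SIZE and sc_rthash_key.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_RTHASH_SIZE 1024u			/* assumed power of two */
#define EX_RTHASH_MASK (EX_RTHASH_SIZE - 1)

#define ex_mix(a, b, c) do {						\
	a -= b; a -= c; a ^= (c >> 13);					\
	b -= c; b -= a; b ^= (a << 8);					\
	c -= a; c -= b; c ^= (b >> 13);					\
	a -= b; a -= c; a ^= (c >> 12);					\
	b -= c; b -= a; b ^= (a << 16);					\
	c -= a; c -= b; c ^= (b >> 5);					\
	a -= b; a -= c; a ^= (c >> 3);					\
	b -= c; b -= a; b ^= (a << 10);					\
	c -= a; c -= b; c ^= (b >> 15);					\
} while (0)

static uint32_t
ex_rthash(const uint8_t *addr, uint32_t key)
{
	/* 0x9e3779b9 is the golden ratio; the key perturbs the result. */
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key;

	b += addr[5] << 8;
	b += addr[4];
	a += (uint32_t)addr[3] << 24;
	a += (uint32_t)addr[2] << 16;
	a += addr[1] << 8;
	a += addr[0];
	ex_mix(a, b, c);
	return (c & EX_RTHASH_MASK);
}

int
main(void)
{
	const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	printf("bucket %u\n", ex_rthash(mac, 0xdeadbeefu));
	return (0);
}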
static int
bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b)
{
int i, d;
for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) {
d = ((int)a[i]) - ((int)b[i]);
}
return (d);
}
/*
* bridge_rtnode_lookup:
*
* Look up a bridge route node for the specified destination. Compare the
* vlan id; if it is zero, just return the first address match.
*/
static struct bridge_rtnode *
bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
{
struct bridge_rtnode *brt;
uint32_t hash;
int dir;
BRIDGE_LOCK_ASSERT(sc);
hash = bridge_rthash(sc, addr);
LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) {
dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr);
if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0))
return (brt);
if (dir > 0)
return (NULL);
}
return (NULL);
}
/*
* bridge_rtnode_insert:
*
* Insert the specified bridge node into the route table. We
* assume the entry is not already in the table.
*/
static int
bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt)
{
struct bridge_rtnode *lbrt;
uint32_t hash;
int dir;
BRIDGE_LOCK_ASSERT(sc);
hash = bridge_rthash(sc, brt->brt_addr);
lbrt = LIST_FIRST(&sc->sc_rthash[hash]);
if (lbrt == NULL) {
LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash);
goto out;
}
do {
dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr);
if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan)
return (EEXIST);
if (dir > 0) {
LIST_INSERT_BEFORE(lbrt, brt, brt_hash);
goto out;
}
if (LIST_NEXT(lbrt, brt_hash) == NULL) {
LIST_INSERT_AFTER(lbrt, brt, brt_hash);
goto out;
}
lbrt = LIST_NEXT(lbrt, brt_hash);
} while (lbrt != NULL);
#ifdef DIAGNOSTIC
panic("bridge_rtnode_insert: impossible");
#endif
out:
LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list);
sc->sc_brtcnt++;
return (0);
}
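/*
 * [Illustrative sketch, not part of this revision.]  The hash chains above
 * keep entries sorted in descending address order, which is what lets both
 * bridge_rtnode_lookup() and bridge_rtnode_insert() stop early once they
 * walk past the slot a match would occupy.  A standalone model of that
 * invariant, using hypothetical ex_* names:
 */
#include <stdio.h>
#include <string.h>

struct ex_node {
	unsigned char	 addr[6];
	struct ex_node	*next;
};

/* Returns 0 on success, -1 if the address is already present (EEXIST). */
static int
ex_insert(struct ex_node **head, struct ex_node *n)
{
	struct ex_node **pp;

	for (pp = head; *pp != NULL; pp = &(*pp)->next) {
		int dir = memcmp(n->addr, (*pp)->addr, 6);

		if (dir == 0)
			return (-1);	/* duplicate */
		if (dir > 0)
			break;		/* larger keys go first */
	}
	n->next = *pp;
	*pp = n;
	return (0);
}

int
main(void)
{
	struct ex_node a = { { 1, 2, 3, 4, 5, 6 }, NULL };
	struct ex_node b = { { 9, 9, 9, 9, 9, 9 }, NULL };
	struct ex_node *head = NULL;

	ex_insert(&head, &a);
	ex_insert(&head, &b);
	/* Prints 9: the larger address sorted to the front. */
	printf("head starts with %u\n", head->addr[0]);
	return (0);
}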
/*
* bridge_rtnode_destroy:
*
* Destroy a bridge rtnode.
*/
static void
bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt)
{
BRIDGE_LOCK_ASSERT(sc);
LIST_REMOVE(brt, brt_hash);
LIST_REMOVE(brt, brt_list);
sc->sc_brtcnt--;
brt->brt_dst->bif_addrcnt--;
uma_zfree(bridge_rtnode_zone, brt);
}
/*
* bridge_rtable_expire:
*
* Set the expiry time for all routes on an interface.
*/
static void
bridge_rtable_expire(struct ifnet *ifp, int age)
{
struct bridge_softc *sc = ifp->if_bridge;
struct bridge_rtnode *brt;
BRIDGE_LOCK(sc);
/*
* If the age is zero then flush; otherwise cap all the expiry times at
* 'age' seconds for the interface.
*/
if (age == 0)
bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN);
else {
LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) {
/* Cap the expiry time to 'age' */
if (brt->brt_ifp == ifp &&
brt->brt_expire > time_uptime + age &&
(brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
brt->brt_expire = time_uptime + age;
}
}
BRIDGE_UNLOCK(sc);
}
/*
* bridge_state_change:
*
* Callback from the bridgestp code when a port changes states.
*/
static void
bridge_state_change(struct ifnet *ifp, int state)
{
struct bridge_softc *sc = ifp->if_bridge;
static const char *stpstates[] = {
"disabled",
"listening",
"learning",
"forwarding",
"blocking",
"discarding"
};
if (log_stp)
log(LOG_NOTICE, "%s: state changed to %s on %s\n",
sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname);
}
/*
* Send bridge packets through pfil if they are one of the types pfil can deal
* with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without
* question.) If bifp or ifp is NULL then packet filtering is skipped for
* that interface.
*/
static int
bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir)
{
int snap, error, i, hlen;
struct ether_header *eh1, eh2;
struct ip_fw_args args;
struct ip *ip;
struct llc llc1;
u_int16_t ether_type;
snap = 0;
error = -1; /* Default error unless explicitly set to 0 below */
#if 0
/* we may return with the IP fields swapped, ensure it's not shared */
KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__));
#endif
if (pfil_bridge == 0 && pfil_member == 0 && pfil_ipfw == 0)
return (0); /* filtering is disabled */
i = min((*mp)->m_pkthdr.len, max_protohdr);
if ((*mp)->m_len < i) {
*mp = m_pullup(*mp, i);
if (*mp == NULL) {
printf("%s: m_pullup failed\n", __func__);
return (-1);
}
}
eh1 = mtod(*mp, struct ether_header *);
ether_type = ntohs(eh1->ether_type);
/*
* Check for SNAP/LLC.
*/
if (ether_type < ETHERMTU) {
struct llc *llc2 = (struct llc *)(eh1 + 1);
if ((*mp)->m_len >= ETHER_HDR_LEN + 8 &&
llc2->llc_dsap == LLC_SNAP_LSAP &&
llc2->llc_ssap == LLC_SNAP_LSAP &&
llc2->llc_control == LLC_UI) {
ether_type = htons(llc2->llc_un.type_snap.ether_type);
snap = 1;
}
}
/*
* If we're trying to filter bridge traffic, don't look at anything
* other than IP and ARP traffic. If the filter doesn't understand
* IPv6, don't allow IPv6 through the bridge either. This is lame
* since if we really wanted, say, an AppleTalk filter, we are hosed,
* but of course we don't have an AppleTalk filter to begin with.
* (Note that since pfil doesn't understand ARP it will pass *ALL*
* ARP traffic.)
*/
switch (ether_type) {
case ETHERTYPE_ARP:
case ETHERTYPE_REVARP:
if (pfil_ipfw_arp == 0)
return (0); /* Automatically pass */
break;
case ETHERTYPE_IP:
#ifdef INET6
case ETHERTYPE_IPV6:
#endif /* INET6 */
break;
default:
/*
* Check to see if the user wants to pass non-IP
* packets; these would not be checked by pfil(9) and
* would pass unconditionally, so the default is to drop.
*/
if (pfil_onlyip)
goto bad;
}
/* Strip off the Ethernet header and keep a copy. */
m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2);
m_adj(*mp, ETHER_HDR_LEN);
/* Strip off snap header, if present */
if (snap) {
m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1);
m_adj(*mp, sizeof(struct llc));
}
/*
* Check the IP header for alignment and errors
*/
if (dir == PFIL_IN) {
switch (ether_type) {
case ETHERTYPE_IP:
error = bridge_ip_checkbasic(mp);
break;
#ifdef INET6
case ETHERTYPE_IPV6:
error = bridge_ip6_checkbasic(mp);
break;
#endif /* INET6 */
default:
error = 0;
}
if (error)
goto bad;
}
if (IPFW_LOADED && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) {
+ INIT_VNET_IPFW(curvnet);
+
error = -1;
args.rule = ip_dn_claim_rule(*mp);
if (args.rule != NULL && V_fw_one_pass)
goto ipfwpass; /* packet already partially processed */
args.m = *mp;
args.oif = ifp;
args.next_hop = NULL;
args.eh = &eh2;
args.inp = NULL; /* used by ipfw uid/gid/jail rules */
i = ip_fw_chk_ptr(&args);
*mp = args.m;
if (*mp == NULL)
return (error);
if (DUMMYNET_LOADED && (i == IP_FW_DUMMYNET)) {
/* put the Ethernet header back on */
M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT);
if (*mp == NULL)
return (error);
bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);
/*
* Pass the pkt to dummynet, which consumes it. The
* packet will return to us via bridge_dummynet().
*/
args.oif = ifp;
ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args);
return (error);
}
if (i != IP_FW_PASS) /* drop */
goto bad;
}
ipfwpass:
error = 0;
/*
* Run the packet through pfil
*/
switch (ether_type) {
case ETHERTYPE_IP:
/*
* Before calling the firewall, swap fields the same way
* IP does; here we assume the header is contiguous.
*/
ip = mtod(*mp, struct ip *);
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
/*
* Run pfil on the member interface and the bridge, both can
* be skipped by clearing pfil_member or pfil_bridge.
*
* Keep the order:
* in_if -> bridge_if -> out_if
*/
if (pfil_bridge && dir == PFIL_OUT && bifp != NULL)
error = pfil_run_hooks(&inet_pfil_hook, mp, bifp,
dir, NULL);
if (*mp == NULL || error != 0) /* filter may consume */
break;
if (pfil_member && ifp != NULL)
error = pfil_run_hooks(&inet_pfil_hook, mp, ifp,
dir, NULL);
if (*mp == NULL || error != 0) /* filter may consume */
break;
if (pfil_bridge && dir == PFIL_IN && bifp != NULL)
error = pfil_run_hooks(&inet_pfil_hook, mp, bifp,
dir, NULL);
if (*mp == NULL || error != 0) /* filter may consume */
break;
/* check if we need to fragment the packet */
if (pfil_member && ifp != NULL && dir == PFIL_OUT) {
i = (*mp)->m_pkthdr.len;
if (i > ifp->if_mtu) {
error = bridge_fragment(ifp, *mp, &eh2, snap,
&llc1);
return (error);
}
}
/* Recalculate the ip checksum and restore byte ordering */
ip = mtod(*mp, struct ip *);
hlen = ip->ip_hl << 2;
if (hlen < sizeof(struct ip))
goto bad;
if (hlen > (*mp)->m_len) {
if ((*mp = m_pullup(*mp, hlen)) == 0)
goto bad;
ip = mtod(*mp, struct ip *);
if (ip == NULL)
goto bad;
}
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (hlen == sizeof(struct ip))
ip->ip_sum = in_cksum_hdr(ip);
else
ip->ip_sum = in_cksum(*mp, hlen);
break;
#ifdef INET6
case ETHERTYPE_IPV6:
if (pfil_bridge && dir == PFIL_OUT && bifp != NULL)
error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp,
dir, NULL);
if (*mp == NULL || error != 0) /* filter may consume */
break;
if (pfil_member && ifp != NULL)
error = pfil_run_hooks(&inet6_pfil_hook, mp, ifp,
dir, NULL);
if (*mp == NULL || error != 0) /* filter may consume */
break;
if (pfil_bridge && dir == PFIL_IN && bifp != NULL)
error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp,
dir, NULL);
break;
#endif
default:
error = 0;
break;
}
if (*mp == NULL)
return (error);
if (error != 0)
goto bad;
error = -1;
/*
* Finally, put everything back the way it was and return
*/
if (snap) {
M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT);
if (*mp == NULL)
return (error);
bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc));
}
M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT);
if (*mp == NULL)
return (error);
bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN);
return (0);
bad:
m_freem(*mp);
*mp = NULL;
return (error);
}
/*
* Perform basic checks on the header size, since
* pfil assumes ip_input has already processed it.
* Cut-and-pasted from ip_input.c.
* Given how simple the IPv6 version is,
* does the IPv4 version really need to be
* this complicated?
*
* XXX Should we update ipstat here, or not?
* XXX Right now we update ipstat but not
* XXX csum_counter.
*/
static int
bridge_ip_checkbasic(struct mbuf **mp)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *m = *mp;
struct ip *ip;
int len, hlen;
u_short sum;
if (*mp == NULL)
return (-1);
if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
if ((m = m_copyup(m, sizeof(struct ip),
(max_linkhdr + 3) & ~3)) == NULL) {
/* XXXJRT new stat, please */
V_ipstat.ips_toosmall++;
goto bad;
}
} else if (__predict_false(m->m_len < sizeof (struct ip))) {
if ((m = m_pullup(m, sizeof (struct ip))) == NULL) {
V_ipstat.ips_toosmall++;
goto bad;
}
}
ip = mtod(m, struct ip *);
if (ip == NULL) goto bad;
if (ip->ip_v != IPVERSION) {
V_ipstat.ips_badvers++;
goto bad;
}
hlen = ip->ip_hl << 2;
if (hlen < sizeof(struct ip)) { /* minimum header length */
V_ipstat.ips_badhlen++;
goto bad;
}
if (hlen > m->m_len) {
if ((m = m_pullup(m, hlen)) == 0) {
V_ipstat.ips_badhlen++;
goto bad;
}
ip = mtod(m, struct ip *);
if (ip == NULL) goto bad;
}
if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
} else {
if (hlen == sizeof(struct ip)) {
sum = in_cksum_hdr(ip);
} else {
sum = in_cksum(m, hlen);
}
}
if (sum) {
V_ipstat.ips_badsum++;
goto bad;
}
/* Retrieve the packet length. */
len = ntohs(ip->ip_len);
/*
* Check for additional length bogosity
*/
if (len < hlen) {
V_ipstat.ips_badlen++;
goto bad;
}
/*
* Check that the amount of data in the buffers
* is at least as much as the IP header would have us expect.
* Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len < len) {
V_ipstat.ips_tooshort++;
goto bad;
}
/* Checks out, proceed */
*mp = m;
return (0);
bad:
*mp = m;
return (-1);
}
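/*
 * [Illustrative sketch, not part of this revision.]  The in_cksum_hdr()
 * step above is the RFC 1071 one's-complement sum over the IP header; a
 * header carrying a correct checksum re-sums to zero.  Standalone model
 * using a hypothetical ex_ip_cksum() helper:
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t
ex_ip_cksum(const uint8_t *hdr, int hlen)
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < hlen; i += 2)
		sum += (uint32_t)hdr[i] << 8 | hdr[i + 1];
	while (sum > 0xffff)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	/* A 20-byte header with its checksum field (bytes 10-11) zeroed. */
	uint8_t h[20] = {
		0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46, 0x40, 0x00,
		0x40, 0x06, 0x00, 0x00, 0xac, 0x10, 0x0a, 0x63,
		0xac, 0x10, 0x0a, 0x0c
	};
	uint16_t ck = ex_ip_cksum(h, sizeof(h));

	h[10] = ck >> 8;
	h[11] = ck & 0xff;
	/* The second sum must print 0 for the header to be valid. */
	printf("checksum 0x%04x, re-sum 0x%04x\n", ck, ex_ip_cksum(h, sizeof(h)));
	return (0);
}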
#ifdef INET6
/*
* Same as above, but for IPv6.
* Cut-and-pasted from ip6_input.c.
* XXX Should we update ip6stat, or not?
*/
static int
bridge_ip6_checkbasic(struct mbuf **mp)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
struct ip6_hdr *ip6;
/*
* If the IPv6 header is not aligned, slurp it up into a new
* mbuf with space for link headers, in the event we forward
* it. Otherwise, if it is aligned, make sure the entire base
* IPv6 header is in the first mbuf of the chain.
*/
if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) {
struct ifnet *inifp = m->m_pkthdr.rcvif;
if ((m = m_copyup(m, sizeof(struct ip6_hdr),
(max_linkhdr + 3) & ~3)) == NULL) {
/* XXXJRT new stat, please */
V_ip6stat.ip6s_toosmall++;
in6_ifstat_inc(inifp, ifs6_in_hdrerr);
goto bad;
}
} else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) {
struct ifnet *inifp = m->m_pkthdr.rcvif;
if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
V_ip6stat.ip6s_toosmall++;
in6_ifstat_inc(inifp, ifs6_in_hdrerr);
goto bad;
}
}
ip6 = mtod(m, struct ip6_hdr *);
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
V_ip6stat.ip6s_badvers++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
goto bad;
}
/* Checks out, proceed */
*mp = m;
return (0);
bad:
*mp = m;
return (-1);
}
#endif /* INET6 */
/*
* bridge_fragment:
*
* Return a fragmented mbuf chain.
*/
static int
bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh,
int snap, struct llc *llc)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *m0;
struct ip *ip;
int error = -1;
if (m->m_len < sizeof(struct ip) &&
(m = m_pullup(m, sizeof(struct ip))) == NULL)
goto out;
ip = mtod(m, struct ip *);
error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist,
CSUM_DELAY_IP);
if (error)
goto out;
/* walk the chain and re-add the Ethernet header */
for (m0 = m; m0; m0 = m0->m_nextpkt) {
if (error == 0) {
if (snap) {
M_PREPEND(m0, sizeof(struct llc), M_DONTWAIT);
if (m0 == NULL) {
error = ENOBUFS;
continue;
}
bcopy(llc, mtod(m0, caddr_t),
sizeof(struct llc));
}
M_PREPEND(m0, ETHER_HDR_LEN, M_DONTWAIT);
if (m0 == NULL) {
error = ENOBUFS;
continue;
}
bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN);
} else
m_freem(m);
}
if (error == 0)
V_ipstat.ips_fragmented++;
return (error);
out:
if (m != NULL)
m_freem(m);
return (error);
}
Index: head/sys/net/if_ef.c
===================================================================
--- head/sys/net/if_ef.c (revision 183549)
+++ head/sys/net/if_ef.c (revision 183550)
@@ -1,590 +1,598 @@
/*-
* Copyright (c) 1999, 2000 Boris Popov
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_ef.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/vimage.h>
#include <net/ethernet.h>
#include <net/if_llc.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/bpf.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#endif
#ifdef IPX
#include <netipx/ipx.h>
#include <netipx/ipx_if.h>
#endif
/* If none of the supported layers is enabled explicitly, enable them all */
#if !defined(ETHER_II) && !defined(ETHER_8023) && !defined(ETHER_8022) && \
!defined(ETHER_SNAP)
#define ETHER_II 1
#define ETHER_8023 1
#define ETHER_8022 1
#define ETHER_SNAP 1
#endif
/* internal frame types */
#define ETHER_FT_EII 0 /* Ethernet_II - default */
#define ETHER_FT_8023 1 /* 802.3 (Novell) */
#define ETHER_FT_8022 2 /* 802.2 */
#define ETHER_FT_SNAP 3 /* SNAP */
#define EF_NFT 4 /* total number of frame types */
#ifdef EF_DEBUG
#define EFDEBUG(format, args...) printf("%s: "format, __func__ ,## args)
#else
#define EFDEBUG(format, args...)
#endif
#define EFERROR(format, args...) printf("%s: "format, __func__ ,## args)
struct efnet {
struct ifnet *ef_ifp;
struct ifnet *ef_pifp;
int ef_frametype;
};
struct ef_link {
SLIST_ENTRY(ef_link) el_next;
struct ifnet *el_ifp; /* raw device for these clones */
struct efnet *el_units[EF_NFT]; /* our clones */
};
static SLIST_HEAD(ef_link_head, ef_link) efdev = {NULL};
static int efcount;
extern int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m);
extern int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp,
struct sockaddr *dst, short *tp, int *hlen);
/*
static void ef_reset (struct ifnet *);
*/
static int ef_attach(struct efnet *sc);
static int ef_detach(struct efnet *sc);
static void ef_init(void *);
static int ef_ioctl(struct ifnet *, u_long, caddr_t);
static void ef_start(struct ifnet *);
static int ef_input(struct ifnet*, struct ether_header *, struct mbuf *);
static int ef_output(struct ifnet *ifp, struct mbuf **mp,
struct sockaddr *dst, short *tp, int *hlen);
static int ef_load(void);
static int ef_unload(void);
/*
* Install the interface; most of the structure initialization is done in ef_clone()
*/
static int
ef_attach(struct efnet *sc)
{
struct ifnet *ifp = sc->ef_ifp;
ifp->if_start = ef_start;
ifp->if_watchdog = NULL;
ifp->if_init = ef_init;
ifp->if_snd.ifq_maxlen = IFQ_MAXLEN;
ifp->if_flags = (IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
/*
* Attach the interface
*/
ether_ifattach(ifp, IF_LLADDR(sc->ef_pifp));
ifp->if_resolvemulti = 0;
ifp->if_type = IFT_XETHER;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
EFDEBUG("%s: attached\n", ifp->if_xname);
return 1;
}
/*
* This is for _testing_only_; it just removes the interface from the interface list
*/
static int
ef_detach(struct efnet *sc)
{
struct ifnet *ifp = sc->ef_ifp;
int s;
s = splimp();
ether_ifdetach(ifp);
if_free(ifp);
splx(s);
return 0;
}
static void
ef_init(void *foo) {
return;
}
static int
ef_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct efnet *sc = ifp->if_softc;
struct ifaddr *ifa = (struct ifaddr*)data;
int s, error;
EFDEBUG("IOCTL %ld for %s\n", cmd, ifp->if_xname);
error = 0;
s = splimp();
switch (cmd) {
case SIOCSIFFLAGS:
error = 0;
break;
case SIOCSIFADDR:
if (sc->ef_frametype == ETHER_FT_8023 &&
ifa->ifa_addr->sa_family != AF_IPX) {
error = EAFNOSUPPORT;
break;
}
ifp->if_flags |= IFF_UP;
/* FALL THROUGH */
default:
error = ether_ioctl(ifp, cmd, data);
break;
}
splx(s);
return error;
}
/*
* Currently the packet is prepared in ether_output(), but this could be a
* better place to do it.
*/
static void
ef_start(struct ifnet *ifp)
{
struct efnet *sc = (struct efnet*)ifp->if_softc;
struct ifnet *p;
struct mbuf *m;
int error;
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
p = sc->ef_pifp;
EFDEBUG("\n");
for (;;) {
IF_DEQUEUE(&ifp->if_snd, m);
if (m == 0)
break;
BPF_MTAP(ifp, m);
IFQ_HANDOFF(p, m, error);
if (error) {
ifp->if_oerrors++;
continue;
}
ifp->if_opackets++;
}
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
return;
}
/*
* Inline functions add no procedure-call or parameter-passing overhead
* but simplify the code.
*/
static int __inline
ef_inputEII(struct mbuf *m, struct ether_header *eh, u_short ether_type)
{
int isr;
switch(ether_type) {
#ifdef IPX
case ETHERTYPE_IPX:
isr = NETISR_IPX;
break;
#endif
#ifdef INET
case ETHERTYPE_IP:
if ((m = ip_fastforward(m)) == NULL)
return (0);
isr = NETISR_IP;
break;
case ETHERTYPE_ARP:
isr = NETISR_ARP;
break;
#endif
default:
return (EPROTONOSUPPORT);
}
netisr_dispatch(isr, m);
return (0);
}
static int __inline
ef_inputSNAP(struct mbuf *m, struct ether_header *eh, struct llc* l,
u_short ether_type)
{
int isr;
switch(ether_type) {
#ifdef IPX
case ETHERTYPE_IPX:
m_adj(m, 8);
isr = NETISR_IPX;
break;
#endif
default:
return (EPROTONOSUPPORT);
}
netisr_dispatch(isr, m);
return (0);
}
static int __inline
ef_input8022(struct mbuf *m, struct ether_header *eh, struct llc* l,
u_short ether_type)
{
int isr;
switch(ether_type) {
#ifdef IPX
case 0xe0:
m_adj(m, 3);
isr = NETISR_IPX;
break;
#endif
default:
return (EPROTONOSUPPORT);
}
netisr_dispatch(isr, m);
return (0);
}
/*
* Called from ether_input()
*/
static int
ef_input(struct ifnet *ifp, struct ether_header *eh, struct mbuf *m)
{
u_short ether_type;
int ft = -1;
struct efnet *efp;
struct ifnet *eifp;
struct llc *l;
struct ef_link *efl;
int isr;
ether_type = ntohs(eh->ether_type);
l = NULL;
if (ether_type < ETHERMTU) {
l = mtod(m, struct llc*);
if (l->llc_dsap == 0xff && l->llc_ssap == 0xff) {
/*
* Novell's "802.3" frame
*/
ft = ETHER_FT_8023;
} else if (l->llc_dsap == 0xaa && l->llc_ssap == 0xaa) {
/*
* 802.2/SNAP
*/
ft = ETHER_FT_SNAP;
ether_type = ntohs(l->llc_un.type_snap.ether_type);
} else if (l->llc_dsap == l->llc_ssap) {
/*
* 802.3/802.2
*/
ft = ETHER_FT_8022;
ether_type = l->llc_ssap;
}
} else
ft = ETHER_FT_EII;
if (ft == -1) {
EFDEBUG("Unrecognised ether_type %x\n", ether_type);
return EPROTONOSUPPORT;
}
/*
* Check if an interface is configured for the given frame type
*/
efp = NULL;
SLIST_FOREACH(efl, &efdev, el_next) {
if (efl->el_ifp == ifp) {
efp = efl->el_units[ft];
break;
}
}
if (efp == NULL) {
EFDEBUG("Can't find if for %d\n", ft);
return EPROTONOSUPPORT;
}
eifp = efp->ef_ifp;
if ((eifp->if_flags & IFF_UP) == 0)
return EPROTONOSUPPORT;
eifp->if_ibytes += m->m_pkthdr.len + sizeof (*eh);
m->m_pkthdr.rcvif = eifp;
BPF_MTAP2(eifp, eh, ETHER_HDR_LEN, m);
/*
* Now we are ready to adjust the mbufs and pass them to the protocol handlers
*/
switch(ft) {
case ETHER_FT_EII:
return (ef_inputEII(m, eh, ether_type));
#ifdef IPX
case ETHER_FT_8023: /* only IPX can be here */
isr = NETISR_IPX;
break;
#endif
case ETHER_FT_SNAP:
return (ef_inputSNAP(m, eh, l, ether_type));
case ETHER_FT_8022:
return (ef_input8022(m, eh, l, ether_type));
default:
EFDEBUG("No support for frame %d and proto %04x\n",
ft, ether_type);
return (EPROTONOSUPPORT);
}
netisr_dispatch(isr, m);
return (0);
}
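/*
 * [Illustrative sketch, not part of this revision.]  The frame-type
 * detection in ef_input() reduces to a few byte tests on the type/length
 * field and the first LLC octets.  A standalone model with hypothetical
 * ex_* names:
 */
#include <stdint.h>
#include <stdio.h>

#define EX_ETHERMTU 1500

static const char *
ex_classify(uint16_t type_or_len, uint8_t dsap, uint8_t ssap)
{
	if (type_or_len >= EX_ETHERMTU)
		return ("Ethernet_II");		/* a real EtherType */
	if (dsap == 0xff && ssap == 0xff)
		return ("802.3 (Novell raw)");
	if (dsap == 0xaa && ssap == 0xaa)
		return ("802.2/SNAP");
	if (dsap == ssap)
		return ("802.3/802.2");
	return ("unrecognised");
}

int
main(void)
{
	printf("%s\n", ex_classify(0x0800, 0, 0));	/* Ethernet_II (IP) */
	printf("%s\n", ex_classify(64, 0xff, 0xff));	/* Novell raw 802.3 */
	printf("%s\n", ex_classify(64, 0xaa, 0xaa));	/* 802.2/SNAP */
	printf("%s\n", ex_classify(64, 0xe0, 0xe0));	/* 802.2 (IPX) */
	return (0);
}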
static int
ef_output(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst, short *tp,
int *hlen)
{
struct efnet *sc = (struct efnet*)ifp->if_softc;
struct mbuf *m = *mp;
u_char *cp;
short type;
if (ifp->if_type != IFT_XETHER)
return ENETDOWN;
switch (sc->ef_frametype) {
case ETHER_FT_EII:
#ifdef IPX
type = htons(ETHERTYPE_IPX);
#else
return EPFNOSUPPORT;
#endif
break;
case ETHER_FT_8023:
type = htons(m->m_pkthdr.len);
break;
case ETHER_FT_8022:
M_PREPEND(m, ETHER_HDR_LEN + 3, M_WAIT);
/*
* Ensure that the Ethernet header and the next three bytes
* fit into a single mbuf
*/
m = m_pullup(m, ETHER_HDR_LEN + 3);
if (m == NULL) {
*mp = NULL;
return ENOBUFS;
}
m_adj(m, ETHER_HDR_LEN);
type = htons(m->m_pkthdr.len);
cp = mtod(m, u_char *);
*cp++ = 0xE0;
*cp++ = 0xE0;
*cp++ = 0x03;
*hlen += 3;
break;
case ETHER_FT_SNAP:
M_PREPEND(m, 8, M_WAIT);
type = htons(m->m_pkthdr.len);
cp = mtod(m, u_char *);
bcopy("\xAA\xAA\x03\x00\x00\x00\x81\x37", cp, 8);
*hlen += 8;
break;
default:
return EPFNOSUPPORT;
}
*mp = m;
*tp = type;
return 0;
}
/*
* Create clone from the given interface
*/
static int
ef_clone(struct ef_link *efl, int ft)
{
struct efnet *efp;
struct ifnet *eifp;
struct ifnet *ifp = efl->el_ifp;
efp = (struct efnet*)malloc(sizeof(struct efnet), M_IFADDR,
M_WAITOK | M_ZERO);
if (efp == NULL)
return ENOMEM;
efp->ef_pifp = ifp;
efp->ef_frametype = ft;
eifp = efp->ef_ifp = if_alloc(IFT_ETHER);
if (eifp == NULL) {
free(efp, M_IFADDR);
return (ENOSPC);
}
snprintf(eifp->if_xname, IFNAMSIZ,
"%sf%d", ifp->if_xname, efp->ef_frametype);
eifp->if_dname = "ef";
eifp->if_dunit = IF_DUNIT_NONE;
eifp->if_softc = efp;
if (ifp->if_ioctl)
eifp->if_ioctl = ef_ioctl;
efl->el_units[ft] = efp;
return 0;
}
static int
ef_load(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
struct efnet *efp;
struct ef_link *efl = NULL, *efl_temp;
int error = 0, d;
- IFNET_RLOCK();
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
- if (ifp->if_type != IFT_ETHER) continue;
- EFDEBUG("Found interface %s\n", ifp->if_xname);
- efl = (struct ef_link*)malloc(sizeof(struct ef_link),
- M_IFADDR, M_WAITOK | M_ZERO);
- if (efl == NULL) {
- error = ENOMEM;
- break;
- }
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_NET(vnet_iter);
+ IFNET_RLOCK();
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ if (ifp->if_type != IFT_ETHER) continue;
+ EFDEBUG("Found interface %s\n", ifp->if_xname);
+ efl = (struct ef_link*)malloc(sizeof(struct ef_link),
+ M_IFADDR, M_WAITOK | M_ZERO);
+ if (efl == NULL) {
+ error = ENOMEM;
+ break;
+ }
- efl->el_ifp = ifp;
+ efl->el_ifp = ifp;
#ifdef ETHER_II
- error = ef_clone(efl, ETHER_FT_EII);
- if (error) break;
+ error = ef_clone(efl, ETHER_FT_EII);
+ if (error) break;
#endif
#ifdef ETHER_8023
- error = ef_clone(efl, ETHER_FT_8023);
- if (error) break;
+ error = ef_clone(efl, ETHER_FT_8023);
+ if (error) break;
#endif
#ifdef ETHER_8022
- error = ef_clone(efl, ETHER_FT_8022);
- if (error) break;
+ error = ef_clone(efl, ETHER_FT_8022);
+ if (error) break;
#endif
#ifdef ETHER_SNAP
- error = ef_clone(efl, ETHER_FT_SNAP);
- if (error) break;
+ error = ef_clone(efl, ETHER_FT_SNAP);
+ if (error) break;
#endif
- efcount++;
- SLIST_INSERT_HEAD(&efdev, efl, el_next);
+ efcount++;
+ SLIST_INSERT_HEAD(&efdev, efl, el_next);
+ }
+ IFNET_RUNLOCK();
+ CURVNET_RESTORE();
}
- IFNET_RUNLOCK();
+ VNET_LIST_RUNLOCK();
if (error) {
if (efl)
SLIST_INSERT_HEAD(&efdev, efl, el_next);
SLIST_FOREACH_SAFE(efl, &efdev, el_next, efl_temp) {
for (d = 0; d < EF_NFT; d++)
if (efl->el_units[d]) {
if (efl->el_units[d]->ef_pifp != NULL)
if_free(efl->el_units[d]->ef_pifp);
free(efl->el_units[d], M_IFADDR);
}
free(efl, M_IFADDR);
}
return error;
}
SLIST_FOREACH(efl, &efdev, el_next) {
for (d = 0; d < EF_NFT; d++) {
efp = efl->el_units[d];
if (efp)
ef_attach(efp);
}
}
ef_inputp = ef_input;
ef_outputp = ef_output;
EFDEBUG("Loaded\n");
return 0;
}
static int
ef_unload(void)
{
struct efnet *efp;
struct ef_link *efl;
int d;
ef_inputp = NULL;
ef_outputp = NULL;
SLIST_FOREACH(efl, &efdev, el_next) {
for (d = 0; d < EF_NFT; d++) {
efp = efl->el_units[d];
if (efp) {
ef_detach(efp);
}
}
}
EFDEBUG("Unloaded\n");
return 0;
}
static int
if_ef_modevent(module_t mod, int type, void *data)
{
switch ((modeventtype_t)type) {
case MOD_LOAD:
return ef_load();
case MOD_UNLOAD:
return ef_unload();
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t if_ef_mod = {
"if_ef", if_ef_modevent, NULL
};
DECLARE_MODULE(if_ef, if_ef_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
Index: head/sys/net/if_ethersubr.c
===================================================================
--- head/sys/net/if_ethersubr.c (revision 183549)
+++ head/sys/net/if_ethersubr.c (revision 183550)
@@ -1,1288 +1,1291 @@
/*-
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#include "opt_atalk.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipx.h"
#include "opt_mac.h"
#include "opt_netgraph.h"
#include "opt_carp.h"
#include "opt_mbuf_profiling.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mbuf.h>
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/if_llc.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if_bridgevar.h>
#include <net/if_vlan_var.h>
#include <net/pf_mtag.h>
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#endif
#ifdef INET6
#include <netinet6/nd6.h>
#endif
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#ifdef IPX
#include <netipx/ipx.h>
#include <netipx/ipx_if.h>
#endif
int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m);
int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp,
struct sockaddr *dst, short *tp, int *hlen);
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#include <netatalk/at_extern.h>
#define llc_snap_org_code llc_un.type_snap.org_code
#define llc_snap_ether_type llc_un.type_snap.ether_type
extern u_char at_org_code[3];
extern u_char aarp_org_code[3];
#endif /* NETATALK */
#include <security/mac/mac_framework.h>
#ifdef CTASSERT
CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2);
CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN);
#endif
/* netgraph node hooks for ng_ether(4) */
void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
void (*ng_ether_attach_p)(struct ifnet *ifp);
void (*ng_ether_detach_p)(struct ifnet *ifp);
void (*vlan_input_p)(struct ifnet *, struct mbuf *);
/* if_bridge(4) support */
struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *);
int (*bridge_output_p)(struct ifnet *, struct mbuf *,
struct sockaddr *, struct rtentry *);
void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
/* if_lagg(4) support */
struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *);
static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
static int ether_resolvemulti(struct ifnet *, struct sockaddr **,
struct sockaddr *);
/* XXX: should be in an arp support file, not here */
MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals");
#define ETHER_IS_BROADCAST(addr) \
(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
#define senderr(e) do { error = (e); goto bad;} while (0)
#if defined(INET) || defined(INET6)
int
ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst,
struct ip_fw **rule, int shared);
static int ether_ipfw;
#endif
/*
* Ethernet output routine.
* Encapsulate a packet of type family for the local net.
* Use trailer local net encapsulation if enough data in first
* packet leaves a multiple of 512 bytes of data in remainder.
*/
int
ether_output(struct ifnet *ifp, struct mbuf *m,
struct sockaddr *dst, struct rtentry *rt0)
{
short type;
int error, hdrcmplt = 0;
u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN];
struct ether_header *eh;
struct pf_mtag *t;
int loop_copy = 1;
int hlen; /* link layer header length */
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error)
senderr(error);
#endif
M_PROFILE(m);
if (ifp->if_flags & IFF_MONITOR)
senderr(ENETDOWN);
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)))
senderr(ENETDOWN);
hlen = ETHER_HDR_LEN;
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
error = arpresolve(ifp, rt0, m, dst, edst);
if (error)
return (error == EWOULDBLOCK ? 0 : error);
type = htons(ETHERTYPE_IP);
break;
case AF_ARP:
{
struct arphdr *ah;
ah = mtod(m, struct arphdr *);
ah->ar_hrd = htons(ARPHRD_ETHER);
loop_copy = 0; /* if this is for us, don't do it */
switch(ntohs(ah->ar_op)) {
case ARPOP_REVREQUEST:
case ARPOP_REVREPLY:
type = htons(ETHERTYPE_REVARP);
break;
case ARPOP_REQUEST:
case ARPOP_REPLY:
default:
type = htons(ETHERTYPE_ARP);
break;
}
if (m->m_flags & M_BCAST)
bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN);
else
bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN);
}
break;
#endif
#ifdef INET6
case AF_INET6:
error = nd6_storelladdr(ifp, rt0, m, dst, (u_char *)edst);
if (error)
return error;
type = htons(ETHERTYPE_IPV6);
break;
#endif
#ifdef IPX
case AF_IPX:
if (ef_outputp) {
error = ef_outputp(ifp, &m, dst, &type, &hlen);
if (error)
goto bad;
} else
type = htons(ETHERTYPE_IPX);
bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host),
(caddr_t)edst, sizeof (edst));
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
{
struct at_ifaddr *aa;
if ((aa = at_ifawithnet((struct sockaddr_at *)dst)) == NULL)
senderr(EHOSTUNREACH); /* XXX */
if (!aarpresolve(ifp, m, (struct sockaddr_at *)dst, edst))
return (0);
/*
* In the phase 2 case, we need to prepend an mbuf for the LLC header.
*/
if ( aa->aa_flags & AFA_PHASE2 ) {
struct llc llc;
M_PREPEND(m, LLC_SNAPFRAMELEN, M_DONTWAIT);
if (m == NULL)
senderr(ENOBUFS);
llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP;
llc.llc_control = LLC_UI;
bcopy(at_org_code, llc.llc_snap_org_code, sizeof(at_org_code));
llc.llc_snap_ether_type = htons( ETHERTYPE_AT );
bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN);
type = htons(m->m_pkthdr.len);
hlen = LLC_SNAPFRAMELEN + ETHER_HDR_LEN;
} else {
type = htons(ETHERTYPE_AT);
}
break;
}
#endif /* NETATALK */
case pseudo_AF_HDRCMPLT:
hdrcmplt = 1;
eh = (struct ether_header *)dst->sa_data;
(void)memcpy(esrc, eh->ether_shost, sizeof (esrc));
/* FALLTHROUGH */
case AF_UNSPEC:
loop_copy = 0; /* if this is for us, don't do it */
eh = (struct ether_header *)dst->sa_data;
(void)memcpy(edst, eh->ether_dhost, sizeof (edst));
type = eh->ether_type;
break;
default:
if_printf(ifp, "can't handle af%d\n", dst->sa_family);
senderr(EAFNOSUPPORT);
}
/*
* Add local net header. If no space in first mbuf,
* allocate another.
*/
M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT);
if (m == NULL)
senderr(ENOBUFS);
eh = mtod(m, struct ether_header *);
(void)memcpy(&eh->ether_type, &type,
sizeof(eh->ether_type));
(void)memcpy(eh->ether_dhost, edst, sizeof (edst));
if (hdrcmplt)
(void)memcpy(eh->ether_shost, esrc,
sizeof(eh->ether_shost));
else
(void)memcpy(eh->ether_shost, IF_LLADDR(ifp),
sizeof(eh->ether_shost));
/*
* If a simplex interface, and the packet is being sent to our
* Ethernet address or a broadcast address, loopback a copy.
* XXX To make a simplex device behave exactly like a duplex
* device, we should copy in the case of sending to our own
* ethernet address (thus letting the original actually appear
* on the wire). However, we don't do that here for security
* reasons and compatibility with the original behavior.
*/
if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy &&
((t = pf_find_mtag(m)) == NULL || !t->routed)) {
int csum_flags = 0;
if (m->m_pkthdr.csum_flags & CSUM_IP)
csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID);
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR);
if (m->m_flags & M_BCAST) {
struct mbuf *n;
/*
* Because if_simloop() modifies the packet, we need a
* writable copy through m_dup() instead of a readonly
* one as m_copy[m] would give us. The alternative would
* be to modify if_simloop() to handle the readonly mbuf,
* but performancewise it is mostly equivalent (trading
* extra data copying vs. extra locking).
*
* XXX This is a local workaround. A number of less
* often used kernel parts suffer from the same bug.
* See PR kern/105943 for a proposed general solution.
*/
if ((n = m_dup(m, M_DONTWAIT)) != NULL) {
n->m_pkthdr.csum_flags |= csum_flags;
if (csum_flags & CSUM_DATA_VALID)
n->m_pkthdr.csum_data = 0xffff;
(void)if_simloop(ifp, n, dst->sa_family, hlen);
} else
ifp->if_iqdrops++;
} else if (bcmp(eh->ether_dhost, eh->ether_shost,
ETHER_ADDR_LEN) == 0) {
m->m_pkthdr.csum_flags |= csum_flags;
if (csum_flags & CSUM_DATA_VALID)
m->m_pkthdr.csum_data = 0xffff;
(void) if_simloop(ifp, m, dst->sa_family, hlen);
return (0); /* XXX */
}
}
/*
* Bridges require special output handling.
*/
if (ifp->if_bridge) {
BRIDGE_OUTPUT(ifp, m, error);
return (error);
}
#ifdef DEV_CARP
if (ifp->if_carp &&
(error = carp_output(ifp, m, dst, NULL)))
goto bad;
#endif
/* Handle ng_ether(4) processing, if any */
if (IFP2AC(ifp)->ac_netgraph != NULL) {
KASSERT(ng_ether_output_p != NULL,
("ng_ether_output_p is NULL"));
if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
bad: if (m != NULL)
m_freem(m);
return (error);
}
if (m == NULL)
return (0);
}
/* Continue with link-layer output */
return ether_output_frame(ifp, m);
}
/*
* Ethernet link layer output routine to send a raw frame to the device.
*
* This assumes that the 14 byte Ethernet header is present and contiguous
* in the first mbuf (if BRIDGE'ing).
*/
int
ether_output_frame(struct ifnet *ifp, struct mbuf *m)
{
int error;
#if defined(INET) || defined(INET6)
+ INIT_VNET_NET(ifp->if_vnet);
struct ip_fw *rule = ip_dn_claim_rule(m);
if (IPFW_LOADED && V_ether_ipfw != 0) {
if (ether_ipfw_chk(&m, ifp, &rule, 0) == 0) {
if (m) {
m_freem(m);
return EACCES; /* pkt dropped */
} else
return 0; /* consumed e.g. in a pipe */
}
}
#endif
/*
* Queue message on interface, update output statistics if
* successful, and start output if interface not yet active.
*/
IFQ_HANDOFF(ifp, m, error);
return (error);
}
#if defined(INET) || defined(INET6)
/*
* ipfw processing for ethernet packets (in and out).
* The second parameter is NULL from ether_demux, and ifp from
* ether_output_frame.
*/
int
ether_ipfw_chk(struct mbuf **m0, struct ifnet *dst,
struct ip_fw **rule, int shared)
{
+ INIT_VNET_IPFW(dst->if_vnet);
struct ether_header *eh;
struct ether_header save_eh;
struct mbuf *m;
int i;
struct ip_fw_args args;
if (*rule != NULL && V_fw_one_pass)
return 1; /* dummynet packet, already partially processed */
/*
* We need some amount of data to be contiguous; if others also need
* the packet (shared == 1), it had better be in the first mbuf.
*/
m = *m0;
i = min( m->m_pkthdr.len, max_protohdr);
if ( shared || m->m_len < i) {
m = m_pullup(m, i);
if (m == NULL) {
*m0 = m;
return 0;
}
}
eh = mtod(m, struct ether_header *);
save_eh = *eh; /* save copy for restore below */
m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */
args.m = m; /* the packet we are looking at */
args.oif = dst; /* destination, if any */
args.rule = *rule; /* matching rule to restart */
args.next_hop = NULL; /* we do not support forward yet */
args.eh = &save_eh; /* MAC header for bridged/MAC packets */
args.inp = NULL; /* used by ipfw uid/gid/jail rules */
i = ip_fw_chk_ptr(&args);
m = args.m;
if (m != NULL) {
/*
* Restore Ethernet header, as needed, in case the
* mbuf chain was replaced by ipfw.
*/
M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT);
if (m == NULL) {
*m0 = m;
return 0;
}
if (eh != mtod(m, struct ether_header *))
bcopy(&save_eh, mtod(m, struct ether_header *),
ETHER_HDR_LEN);
}
*m0 = m;
*rule = args.rule;
if (i == IP_FW_DENY) /* drop */
return 0;
KASSERT(m != NULL, ("ether_ipfw_chk: m is NULL"));
if (i == IP_FW_PASS) /* a PASS rule. */
return 1;
if (DUMMYNET_LOADED && (i == IP_FW_DUMMYNET)) {
/*
* Pass the pkt to dummynet, which consumes it.
* If shared, make a copy and keep the original.
*/
if (shared) {
m = m_copypacket(m, M_DONTWAIT);
if (m == NULL)
return 0;
} else {
/*
* Pass the original to dummynet and
* nothing back to the caller
*/
*m0 = NULL ;
}
ip_dn_io_ptr(&m, dst ? DN_TO_ETH_OUT: DN_TO_ETH_DEMUX, &args);
return 0;
}
/*
* XXX at some point add support for divert/forward actions.
* If none of the above matches, we have to drop the pkt.
*/
return 0;
}
#endif
/*
* Process a received Ethernet packet; the packet is in the
* mbuf chain m with the ethernet header at the front.
*/
static void
ether_input(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
u_short etype;
if ((ifp->if_flags & IFF_UP) == 0) {
m_freem(m);
return;
}
#ifdef DIAGNOSTIC
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n");
m_freem(m);
return;
}
#endif
/*
* Do consistency checks to verify assumptions
* made by code past this point.
*/
if ((m->m_flags & M_PKTHDR) == 0) {
if_printf(ifp, "discard frame w/o packet header\n");
ifp->if_ierrors++;
m_freem(m);
return;
}
if (m->m_len < ETHER_HDR_LEN) {
/* XXX maybe should pullup? */
if_printf(ifp, "discard frame w/o leading ethernet "
"header (len %u pkt len %u)\n",
m->m_len, m->m_pkthdr.len);
ifp->if_ierrors++;
m_freem(m);
return;
}
eh = mtod(m, struct ether_header *);
etype = ntohs(eh->ether_type);
if (m->m_pkthdr.rcvif == NULL) {
if_printf(ifp, "discard frame w/o interface pointer\n");
ifp->if_ierrors++;
m_freem(m);
return;
}
#ifdef DIAGNOSTIC
if (m->m_pkthdr.rcvif != ifp) {
if_printf(ifp, "Warning, frame marked as received on %s\n",
m->m_pkthdr.rcvif->if_xname);
}
#endif
if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
if (ETHER_IS_BROADCAST(eh->ether_dhost))
m->m_flags |= M_BCAST;
else
m->m_flags |= M_MCAST;
ifp->if_imcasts++;
}
#ifdef MAC
/*
* Tag the mbuf with an appropriate MAC label before any other
* consumers can get to it.
*/
mac_ifnet_create_mbuf(ifp, m);
#endif
/*
* Give bpf a chance at the packet.
*/
ETHER_BPF_MTAP(ifp, m);
/*
* If the CRC is still on the packet, trim it off. We do this once
* and once only in case we are re-entered. Nothing else on the
* Ethernet receive path expects to see the FCS.
*/
if (m->m_flags & M_HASFCS) {
m_adj(m, -ETHER_CRC_LEN);
m->m_flags &= ~M_HASFCS;
}
ifp->if_ibytes += m->m_pkthdr.len;
/* Allow monitor mode to claim this frame, after stats are updated. */
if (ifp->if_flags & IFF_MONITOR) {
m_freem(m);
return;
}
/* Handle input from a lagg(4) port */
if (ifp->if_type == IFT_IEEE8023ADLAG) {
KASSERT(lagg_input_p != NULL,
("%s: if_lagg not loaded!", __func__));
m = (*lagg_input_p)(ifp, m);
if (m != NULL)
ifp = m->m_pkthdr.rcvif;
else
return;
}
/*
* If the hardware did not process an 802.1Q tag, do this now,
* to allow 802.1P priority frames to be passed to the main input
* path correctly.
* TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels.
*/
if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) {
struct ether_vlan_header *evl;
if (m->m_len < sizeof(*evl) &&
(m = m_pullup(m, sizeof(*evl))) == NULL) {
#ifdef DIAGNOSTIC
if_printf(ifp, "cannot pullup VLAN header\n");
#endif
ifp->if_ierrors++;
m_freem(m);
return;
}
evl = mtod(m, struct ether_vlan_header *);
m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
m->m_flags |= M_VLANTAG;
bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
ETHER_HDR_LEN - ETHER_TYPE_LEN);
m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
/* Allow ng_ether(4) to claim this frame. */
if (IFP2AC(ifp)->ac_netgraph != NULL) {
KASSERT(ng_ether_input_p != NULL,
("%s: ng_ether_input_p is NULL", __func__));
m->m_flags &= ~M_PROMISC;
(*ng_ether_input_p)(ifp, &m);
if (m == NULL)
return;
}
/*
* Allow if_bridge(4) to claim this frame.
* The BRIDGE_INPUT() macro will update ifp if the bridge changed it
* and the frame should be delivered locally.
*/
if (ifp->if_bridge != NULL) {
m->m_flags &= ~M_PROMISC;
BRIDGE_INPUT(ifp, m);
if (m == NULL)
return;
}
#ifdef DEV_CARP
/*
* Clear M_PROMISC on frame so that carp(4) will see it when the
* mbuf flows up to Layer 3.
* FreeBSD's implementation of carp(4) uses the inprotosw
* to dispatch IPPROTO_CARP. carp(4) also allocates its own
* Ethernet addresses of the form 00:00:5e:00:01:xx, which
* are outside the scope of the M_PROMISC test below.
* TODO: Maintain a hash table of ethernet addresses other than
* ether_dhost which may be active on this ifp.
*/
if (ifp->if_carp && carp_forus(ifp->if_carp, eh->ether_dhost)) {
m->m_flags &= ~M_PROMISC;
} else
#endif
{
/*
* If the frame received was not for our MAC address, set the
* M_PROMISC flag on the mbuf chain. The frame may need to
* be seen by the rest of the Ethernet input path in case of
* re-entry (e.g. bridge, vlan, netgraph) but should not be
* seen by upper protocol layers.
*/
if (!ETHER_IS_MULTICAST(eh->ether_dhost) &&
bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0)
m->m_flags |= M_PROMISC;
}
/* First chunk of an mbuf contains good entropy */
if (harvest.ethernet)
random_harvest(m, 16, 3, 0, RANDOM_NET);
ether_demux(ifp, m);
}
/*
* Upper layer processing for a received Ethernet packet.
*/
void
ether_demux(struct ifnet *ifp, struct mbuf *m)
{
struct ether_header *eh;
int isr;
u_short ether_type;
#if defined(NETATALK)
struct llc *l;
#endif
KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__));
#if defined(INET) || defined(INET6)
+ INIT_VNET_NET(ifp->if_vnet);
/*
* Allow dummynet and/or ipfw to claim the frame.
* Do not do this for PROMISC frames in case we are re-entered.
*/
if (IPFW_LOADED && V_ether_ipfw != 0 && !(m->m_flags & M_PROMISC)) {
struct ip_fw *rule = ip_dn_claim_rule(m);
if (ether_ipfw_chk(&m, NULL, &rule, 0) == 0) {
if (m)
m_freem(m); /* dropped; free mbuf chain */
return; /* consumed */
}
}
#endif
eh = mtod(m, struct ether_header *);
ether_type = ntohs(eh->ether_type);
/*
* If this frame has a VLAN tag other than 0, call vlan_input()
* if its module is loaded. Otherwise, drop.
*/
if ((m->m_flags & M_VLANTAG) &&
EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) {
if (ifp->if_vlantrunk == NULL) {
ifp->if_noproto++;
m_freem(m);
return;
}
KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!",
__func__));
/* Clear before possibly re-entering ether_input(). */
m->m_flags &= ~M_PROMISC;
(*vlan_input_p)(ifp, m);
return;
}
/*
* Pass promiscuously received frames to the upper layer if the user
* requested this by setting IFF_PPROMISC. Otherwise, drop them.
*/
if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) {
m_freem(m);
return;
}
/*
* Reset layer specific mbuf flags to avoid confusing upper layers.
* Strip off Ethernet header.
*/
m->m_flags &= ~M_VLANTAG;
m->m_flags &= ~(M_PROTOFLAGS);
m_adj(m, ETHER_HDR_LEN);
/*
* Dispatch frame to upper layer.
*/
switch (ether_type) {
#ifdef INET
case ETHERTYPE_IP:
if ((m = ip_fastforward(m)) == NULL)
return;
isr = NETISR_IP;
break;
case ETHERTYPE_ARP:
if (ifp->if_flags & IFF_NOARP) {
/* Discard packet if ARP is disabled on interface */
m_freem(m);
return;
}
isr = NETISR_ARP;
break;
#endif
#ifdef IPX
case ETHERTYPE_IPX:
if (ef_inputp && ef_inputp(ifp, eh, m) == 0)
return;
isr = NETISR_IPX;
break;
#endif
#ifdef INET6
case ETHERTYPE_IPV6:
isr = NETISR_IPV6;
break;
#endif
#ifdef NETATALK
case ETHERTYPE_AT:
isr = NETISR_ATALK1;
break;
case ETHERTYPE_AARP:
isr = NETISR_AARP;
break;
#endif /* NETATALK */
default:
#ifdef IPX
if (ef_inputp && ef_inputp(ifp, eh, m) == 0)
return;
#endif /* IPX */
#if defined(NETATALK)
if (ether_type > ETHERMTU)
goto discard;
l = mtod(m, struct llc *);
if (l->llc_dsap == LLC_SNAP_LSAP &&
l->llc_ssap == LLC_SNAP_LSAP &&
l->llc_control == LLC_UI) {
if (bcmp(&(l->llc_snap_org_code)[0], at_org_code,
sizeof(at_org_code)) == 0 &&
ntohs(l->llc_snap_ether_type) == ETHERTYPE_AT) {
m_adj(m, LLC_SNAPFRAMELEN);
isr = NETISR_ATALK2;
break;
}
if (bcmp(&(l->llc_snap_org_code)[0], aarp_org_code,
sizeof(aarp_org_code)) == 0 &&
ntohs(l->llc_snap_ether_type) == ETHERTYPE_AARP) {
m_adj(m, LLC_SNAPFRAMELEN);
isr = NETISR_AARP;
break;
}
}
#endif /* NETATALK */
goto discard;
}
netisr_dispatch(isr, m);
return;
discard:
/*
* Packet is to be discarded. If netgraph is present,
* hand the packet to it for last chance processing;
* otherwise dispose of it.
*/
if (IFP2AC(ifp)->ac_netgraph != NULL) {
KASSERT(ng_ether_input_orphan_p != NULL,
("ng_ether_input_orphan_p is NULL"));
/*
* Put back the ethernet header so netgraph has a
* consistent view of inbound packets.
*/
M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT);
(*ng_ether_input_orphan_p)(ifp, m);
return;
}
m_freem(m);
}
/*
* Convert Ethernet address to printable (loggable) representation.
* This routine is for compatibility; it's better to just use
*
* printf("%6D", <pointer to address>, ":");
*
* since there's no static buffer involved.
*/
char *
ether_sprintf(const u_char *ap)
{
static char etherbuf[18];
snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":");
return (etherbuf);
}
/*
* Perform common duties while attaching to interface list
*/
void
ether_ifattach(struct ifnet *ifp, const u_int8_t *lla)
{
int i;
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifp->if_addrlen = ETHER_ADDR_LEN;
ifp->if_hdrlen = ETHER_HDR_LEN;
if_attach(ifp);
ifp->if_mtu = ETHERMTU;
ifp->if_output = ether_output;
ifp->if_input = ether_input;
ifp->if_resolvemulti = ether_resolvemulti;
if (ifp->if_baudrate == 0)
ifp->if_baudrate = IF_Mbps(10); /* just a default */
ifp->if_broadcastaddr = etherbroadcastaddr;
ifa = ifp->if_addr;
KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__));
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
sdl->sdl_type = IFT_ETHER;
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lla, LLADDR(sdl), ifp->if_addrlen);
bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
if (ng_ether_attach_p != NULL)
(*ng_ether_attach_p)(ifp);
/* Announce Ethernet MAC address if non-zero. */
for (i = 0; i < ifp->if_addrlen; i++)
if (lla[i] != 0)
break;
if (i != ifp->if_addrlen)
if_printf(ifp, "Ethernet address: %6D\n", lla, ":");
}
/*
* Perform common duties while detaching an Ethernet interface
*/
void
ether_ifdetach(struct ifnet *ifp)
{
if (IFP2AC(ifp)->ac_netgraph != NULL) {
KASSERT(ng_ether_detach_p != NULL,
("ng_ether_detach_p is NULL"));
(*ng_ether_detach_p)(ifp);
}
bpfdetach(ifp);
if_detach(ifp);
}
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet");
#if defined(INET) || defined(INET6)
-SYSCTL_INT(_net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW,
- &ether_ipfw,0,"Pass ether pkts through firewall");
+SYSCTL_V_INT(V_NET, vnet_net, _net_link_ether, OID_AUTO, ipfw, CTLFLAG_RW,
+ ether_ipfw, 0, "Pass ether pkts through firewall");
#endif
#if 0
/*
* This is for reference. We have a table-driven version
* of the little-endian crc32 generator, which is faster
* than the double-loop.
*/
uint32_t
ether_crc32_le(const uint8_t *buf, size_t len)
{
size_t i;
uint32_t crc, carry;
int bit;
uint8_t data;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
carry = (crc ^ data) & 1;
crc >>= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_LE);
}
}
return (crc);
}
#else
uint32_t
ether_crc32_le(const uint8_t *buf, size_t len)
{
static const uint32_t crctab[] = {
0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
};
size_t i;
uint32_t crc;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
crc ^= buf[i];
crc = (crc >> 4) ^ crctab[crc & 0xf];
crc = (crc >> 4) ^ crctab[crc & 0xf];
}
return (crc);
}
#endif
uint32_t
ether_crc32_be(const uint8_t *buf, size_t len)
{
size_t i;
uint32_t crc, carry;
int bit;
uint8_t data;
crc = 0xffffffff; /* initial value */
for (i = 0; i < len; i++) {
for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) {
carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01);
crc <<= 1;
if (carry)
crc = (crc ^ ETHER_CRC_POLY_BE) | carry;
}
}
return (crc);
}
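/*
 * [Illustrative sketch, not part of this revision.]  The 16-entry table in
 * ether_crc32_le() is the nibble-at-a-time form of the reflected CRC-32
 * polynomial 0xedb88320, so it computes the same value as the bit-at-a-time
 * loop in the #if 0 block above.  Standalone cross-check with hypothetical
 * ex_* names:
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define EX_CRC_POLY_LE 0xedb88320u

static uint32_t
ex_crc32_le_bitwise(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xffffffff;
	size_t i;
	int bit;
	uint8_t data;

	for (i = 0; i < len; i++)
		for (data = buf[i], bit = 0; bit < 8; bit++, data >>= 1) {
			uint32_t carry = (crc ^ data) & 1;

			crc >>= 1;
			if (carry)
				crc ^= EX_CRC_POLY_LE;
		}
	return (crc);
}

static uint32_t
ex_crc32_le_nibble(const uint8_t *buf, size_t len)
{
	static const uint32_t tab[16] = {
		0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
		0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
		0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
		0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
	};
	uint32_t crc = 0xffffffff;
	size_t i;

	for (i = 0; i < len; i++) {
		crc ^= buf[i];
		crc = (crc >> 4) ^ tab[crc & 0xf];
		crc = (crc >> 4) ^ tab[crc & 0xf];
	}
	return (crc);
}

int
main(void)
{
	const uint8_t mac[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

	/* The two results must match; drivers hash multicast MACs this way. */
	printf("bitwise 0x%08x nibble 0x%08x\n",
	    ex_crc32_le_bitwise(mac, sizeof(mac)),
	    ex_crc32_le_nibble(mac, sizeof(mac)));
	return (0);
}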
int
ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct ifaddr *ifa = (struct ifaddr *) data;
struct ifreq *ifr = (struct ifreq *) data;
int error = 0;
switch (command) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
ifp->if_init(ifp->if_softc); /* before arpwhohas */
arp_ifinit(ifp, ifa);
break;
#endif
#ifdef IPX
/*
* XXX - This code is probably wrong
*/
case AF_IPX:
{
struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr);
if (ipx_nullhost(*ina))
ina->x_host =
*(union ipx_host *)
IF_LLADDR(ifp);
else {
bcopy((caddr_t) ina->x_host.c_host,
(caddr_t) IF_LLADDR(ifp),
ETHER_ADDR_LEN);
}
/*
* Set new address
*/
ifp->if_init(ifp->if_softc);
break;
}
#endif
default:
ifp->if_init(ifp->if_softc);
break;
}
break;
case SIOCGIFADDR:
{
struct sockaddr *sa;
sa = (struct sockaddr *) & ifr->ifr_data;
bcopy(IF_LLADDR(ifp),
(caddr_t) sa->sa_data, ETHER_ADDR_LEN);
}
break;
case SIOCSIFMTU:
/*
* Set the interface MTU.
*/
if (ifr->ifr_mtu > ETHERMTU) {
error = EINVAL;
} else {
ifp->if_mtu = ifr->ifr_mtu;
}
break;
default:
error = EINVAL; /* XXX netbsd has ENOTTY??? */
break;
}
return (error);
}
static int
ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
struct sockaddr *sa)
{
struct sockaddr_dl *sdl;
#ifdef INET
struct sockaddr_in *sin;
#endif
#ifdef INET6
struct sockaddr_in6 *sin6;
#endif
u_char *e_addr;
switch(sa->sa_family) {
case AF_LINK:
/*
* No mapping needed. Just check that it's a valid MC address.
*/
sdl = (struct sockaddr_dl *)sa;
e_addr = LLADDR(sdl);
if (!ETHER_IS_MULTICAST(e_addr))
return EADDRNOTAVAIL;
*llsa = 0;
return 0;
#ifdef INET
case AF_INET:
sin = (struct sockaddr_in *)sa;
if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
return EADDRNOTAVAIL;
MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR,
M_NOWAIT|M_ZERO);
if (sdl == NULL)
return ENOMEM;
sdl->sdl_len = sizeof *sdl;
sdl->sdl_family = AF_LINK;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = IFT_ETHER;
sdl->sdl_alen = ETHER_ADDR_LEN;
e_addr = LLADDR(sdl);
ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr);
*llsa = (struct sockaddr *)sdl;
return 0;
#endif
#ifdef INET6
case AF_INET6:
sin6 = (struct sockaddr_in6 *)sa;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
/*
* An IP6 address of 0 means listen to all
* of the Ethernet multicast address used for IP6.
* (This is used for multicast routers.)
*/
ifp->if_flags |= IFF_ALLMULTI;
*llsa = 0;
return 0;
}
if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
return EADDRNOTAVAIL;
MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR,
M_NOWAIT|M_ZERO);
if (sdl == NULL)
return (ENOMEM);
sdl->sdl_len = sizeof *sdl;
sdl->sdl_family = AF_LINK;
sdl->sdl_index = ifp->if_index;
sdl->sdl_type = IFT_ETHER;
sdl->sdl_alen = ETHER_ADDR_LEN;
e_addr = LLADDR(sdl);
ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr);
*llsa = (struct sockaddr *)sdl;
return 0;
#endif
default:
/*
* Well, the text isn't quite right, but it's the name
* that counts...
*/
return EAFNOSUPPORT;
}
}
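/*
 * [Illustrative sketch, not part of this revision.]  ETHER_MAP_IP_MULTICAST
 * above maps an IPv4 group address to 01:00:5e plus the low 23 bits of the
 * address, per RFC 1112.  Standalone model with a hypothetical helper:
 */
#include <stdint.h>
#include <stdio.h>

static void
ex_map_ip_multicast(uint32_t group, uint8_t mac[6])	/* host byte order */
{
	mac[0] = 0x01; mac[1] = 0x00; mac[2] = 0x5e;
	mac[3] = (group >> 16) & 0x7f;		/* only 23 bits survive */
	mac[4] = (group >> 8) & 0xff;
	mac[5] = group & 0xff;
}

int
main(void)
{
	uint8_t mac[6];

	ex_map_ip_multicast(0xe0000001u, mac);		/* 224.0.0.1 */
	/* Prints 01:00:5e:00:00:01. */
	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return (0);
}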
static void*
ether_alloc(u_char type, struct ifnet *ifp)
{
struct arpcom *ac;
ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO);
ac->ac_ifp = ifp;
return (ac);
}
static void
ether_free(void *com, u_char type)
{
free(com, M_ARPCOM);
}
static int
ether_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free);
break;
case MOD_UNLOAD:
if_deregister_com_alloc(IFT_ETHER);
break;
default:
return EOPNOTSUPP;
}
return (0);
}
static moduledata_t ether_mod = {
"ether",
ether_modevent,
0
};
void
ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen)
{
struct ether_vlan_header vlan;
struct mbuf mv, mb;
KASSERT((m->m_flags & M_VLANTAG) != 0,
("%s: vlan information not present", __func__));
KASSERT(m->m_len >= sizeof(struct ether_header),
("%s: mbuf not large enough for header", __func__));
bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header));
vlan.evl_proto = vlan.evl_encap_proto;
vlan.evl_encap_proto = htons(ETHERTYPE_VLAN);
vlan.evl_tag = htons(m->m_pkthdr.ether_vtag);
m->m_len -= sizeof(struct ether_header);
m->m_data += sizeof(struct ether_header);
/*
* If a data link has been supplied by the caller, then we will need to
* re-create a stack allocated mbuf chain with the following structure:
*
* (1) mbuf #1 will contain the supplied data link
* (2) mbuf #2 will contain the vlan header
* (3) mbuf #3 will contain the original mbuf's packet data
*
* Otherwise, submit the packet and vlan header via bpf_mtap2().
*/
if (data != NULL) {
mv.m_next = m;
mv.m_data = (caddr_t)&vlan;
mv.m_len = sizeof(vlan);
mb.m_next = &mv;
mb.m_data = data;
mb.m_len = dlen;
bpf_mtap(bp, &mb);
} else
bpf_mtap2(bp, &vlan, sizeof(vlan), m);
m->m_len += sizeof(struct ether_header);
m->m_data -= sizeof(struct ether_header);
}
struct mbuf *
ether_vlanencap(struct mbuf *m, uint16_t tag)
{
struct ether_vlan_header *evl;
M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
if (m == NULL)
return (NULL);
/* M_PREPEND takes care of m_len, m_pkthdr.len for us */
if (m->m_len < sizeof(*evl)) {
m = m_pullup(m, sizeof(*evl));
if (m == NULL)
return (NULL);
}
/*
* Transform the Ethernet header into an Ethernet header
* with 802.1Q encapsulation.
*/
evl = mtod(m, struct ether_vlan_header *);
bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
(char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
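/*
 * The bcopy() above slides the 12 bytes of destination and source
 * MAC addresses (ETHER_HDR_LEN - ETHER_TYPE_LEN) down over the newly
 * prepended space, leaving a 4-byte gap between the addresses and
 * the original type field where the 802.1Q tag is written below.
 */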
evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
evl->evl_tag = htons(tag);
return (m);
}
DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY);
MODULE_VERSION(ether, 1);
Index: head/sys/net/if_faith.c
===================================================================
--- head/sys/net/if_faith.c (revision 183549)
+++ head/sys/net/if_faith.c (revision 183550)
@@ -1,348 +1,349 @@
/* $KAME: if_faith.c,v 1.23 2001/12/17 13:55:29 sumikawa Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* derived from
* @(#)if_loop.c 8.1 (Berkeley) 6/10/93
* Id: if_loop.c,v 1.22 1996/06/19 16:24:10 wollman Exp
*/
/*
* Loopback interface driver for protocol testing and timing.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/bpf.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#endif
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#define FAITHNAME "faith"
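/*
 * faith(4) captures IPv6 TCP traffic destined for a configured prefix
 * and loops it back into the stack, where faithd(8) relays it to an
 * IPv4 destination (IPv6-to-IPv4 transport relay translation; see
 * RFC 3142).
 */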
struct faith_softc {
struct ifnet *sc_ifp;
};
static int faithioctl(struct ifnet *, u_long, caddr_t);
int faithoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
static void faithrtrequest(int, struct rtentry *, struct rt_addrinfo *);
#ifdef INET6
static int faithprefix(struct in6_addr *);
#endif
static int faithmodevent(module_t, int, void *);
static MALLOC_DEFINE(M_FAITH, FAITHNAME, "Firewall Assisted Tunnel Interface");
static int faith_clone_create(struct if_clone *, int, caddr_t);
static void faith_clone_destroy(struct ifnet *);
IFC_SIMPLE_DECLARE(faith, 0);
#define FAITHMTU 1500
static int
faithmodevent(mod, type, data)
module_t mod;
int type;
void *data;
{
switch (type) {
case MOD_LOAD:
if_clone_attach(&faith_cloner);
#ifdef INET6
faithprefix_p = faithprefix;
#endif
break;
case MOD_UNLOAD:
#ifdef INET6
faithprefix_p = NULL;
#endif
if_clone_detach(&faith_cloner);
break;
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t faith_mod = {
"if_faith",
faithmodevent,
0
};
DECLARE_MODULE(if_faith, faith_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_faith, 1);
static int
faith_clone_create(ifc, unit, params)
struct if_clone *ifc;
int unit;
caddr_t params;
{
struct ifnet *ifp;
struct faith_softc *sc;
sc = malloc(sizeof(struct faith_softc), M_FAITH, M_WAITOK | M_ZERO);
ifp = sc->sc_ifp = if_alloc(IFT_FAITH);
if (ifp == NULL) {
free(sc, M_FAITH);
return (ENOSPC);
}
ifp->if_softc = sc;
if_initname(sc->sc_ifp, ifc->ifc_name, unit);
ifp->if_mtu = FAITHMTU;
/* Change to BROADCAST experimentally to announce its prefix. */
ifp->if_flags = /* IFF_LOOPBACK */ IFF_BROADCAST | IFF_MULTICAST;
ifp->if_ioctl = faithioctl;
ifp->if_output = faithoutput;
ifp->if_hdrlen = 0;
ifp->if_addrlen = 0;
ifp->if_snd.ifq_maxlen = ifqmaxlen;
if_attach(ifp);
bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
return (0);
}
static void
faith_clone_destroy(ifp)
struct ifnet *ifp;
{
struct faith_softc *sc = ifp->if_softc;
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
free(sc, M_FAITH);
}
int
faithoutput(ifp, m, dst, rt)
struct ifnet *ifp;
struct mbuf *m;
struct sockaddr *dst;
struct rtentry *rt;
{
int isr;
u_int32_t af;
M_ASSERTPKTHDR(m);
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
if (bpf_peers_present(ifp->if_bpf)) {
af = dst->sa_family;
bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m);
}
if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
m_freem(m);
return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
}
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
isr = NETISR_IPV6;
break;
#endif
default:
m_freem(m);
return EAFNOSUPPORT;
}
/* XXX do we need more sanity checks? */
m->m_pkthdr.rcvif = ifp;
ifp->if_ipackets++;
ifp->if_ibytes += m->m_pkthdr.len;
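/*
 * Re-queue the packet into the protocol input path: output on a
 * faith interface immediately reappears as input, where faithd(8)
 * can pick the connection up.
 */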
netisr_dispatch(isr, m);
return (0);
}
/* ARGSUSED */
static void
faithrtrequest(cmd, rt, info)
int cmd;
struct rtentry *rt;
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*
* Process an ioctl request.
*/
/* ARGSUSED */
static int
faithioctl(ifp, cmd, data)
struct ifnet *ifp;
u_long cmd;
caddr_t data;
{
struct ifaddr *ifa;
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (cmd) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifa = (struct ifaddr *)data;
ifa->ifa_rtrequest = faithrtrequest;
/*
* Everything else is done at a higher level.
*/
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == 0) {
error = EAFNOSUPPORT; /* XXX */
break;
}
switch (ifr->ifr_addr.sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
#ifdef SIOCSIFMTU
case SIOCSIFMTU:
ifp->if_mtu = ifr->ifr_mtu;
break;
#endif
case SIOCSIFFLAGS:
break;
default:
error = EINVAL;
}
return (error);
}
#ifdef INET6
/*
* XXX could be slow
* XXX could be layer violation to call sys/net from sys/netinet6
*/
static int
faithprefix(in6)
struct in6_addr *in6;
{
+ INIT_VNET_INET6(curvnet);
struct rtentry *rt;
struct sockaddr_in6 sin6;
int ret;
if (V_ip6_keepfaith == 0)
return 0;
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *in6;
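/*
 * Look up the route for this destination; the address is considered
 * a FAITH prefix only if the route resolves to a faith interface
 * that is up.
 */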
rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL);
if (rt && rt->rt_ifp && rt->rt_ifp->if_type == IFT_FAITH &&
(rt->rt_ifp->if_flags & IFF_UP) != 0)
ret = 1;
else
ret = 0;
if (rt)
RTFREE_LOCKED(rt);
return ret;
}
#endif
Index: head/sys/net/if_gif.c
===================================================================
--- head/sys/net/if_gif.c (revision 183549)
+++ head/sys/net/if_gif.c (revision 183550)
@@ -1,982 +1,993 @@
/* $FreeBSD$ */
/* $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/conf.h>
#include <sys/vimage.h>
#include <machine/cpu.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/bpf.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/in_gif.h>
#include <netinet/ip_var.h>
#endif /* INET */
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_gif.h>
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <netinet/ip_encap.h>
#include <net/ethernet.h>
#include <net/if_bridgevar.h>
#include <net/if_gif.h>
#include <security/mac/mac_framework.h>
#define GIFNAME "gif"
/*
* gif_mtx protects the global gif_softc_list.
*/
static struct mtx gif_mtx;
static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface");
static LIST_HEAD(, gif_softc) gif_softc_list;
void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af);
void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af);
void (*ng_gif_attach_p)(struct ifnet *ifp);
void (*ng_gif_detach_p)(struct ifnet *ifp);
static void gif_start(struct ifnet *);
static int gif_clone_create(struct if_clone *, int, caddr_t);
static void gif_clone_destroy(struct ifnet *);
IFC_SIMPLE_DECLARE(gif, 0);
static int gifmodevent(module_t, int, void *);
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0,
"Generic Tunnel Interface");
#ifndef MAX_GIF_NEST
/*
* This macro controls the default upper limit on nesting of gif tunnels.
* Since setting a large value here with a careless configuration
* may cause a system crash, we don't allow any nesting by default.
* If you need to configure nested gif tunnels, you can define this macro
* in your kernel configuration file. However, if you do so, please be
* careful to configure the tunnels so that they won't form a loop.
*/
#define MAX_GIF_NEST 1
#endif
+#ifndef VIMAGE
static int max_gif_nesting = MAX_GIF_NEST;
-SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW,
- &max_gif_nesting, 0, "Max nested tunnels");
+#endif
+SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, max_nesting,
+ CTLFLAG_RW, max_gif_nesting, 0, "Max nested tunnels");
+#ifdef INET6
+SYSCTL_DECL(_net_inet6_ip6);
+SYSCTL_V_INT(V_NET, vnet_gif, _net_inet6_ip6, IPV6CTL_GIF_HLIM,
+ gifhlim, CTLFLAG_RW, ip6_gif_hlim, 0, "");
+#endif
+
/*
* By default, we disallow creation of multiple tunnels between the same
* pair of addresses. Some applications require this functionality, so
* we allow control over this check here.
*/
#ifdef XBONEHACK
static int parallel_tunnels = 1;
#else
static int parallel_tunnels = 0;
#endif
-SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW,
- &parallel_tunnels, 0, "Allow parallel tunnels?");
+SYSCTL_V_INT(V_NET, vnet_gif, _net_link_gif, OID_AUTO, parallel_tunnels,
+ CTLFLAG_RW, parallel_tunnels, 0, "Allow parallel tunnels?");
/* copy from src/sys/net/if_ethersubr.c */
static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
#ifndef ETHER_IS_BROADCAST
#define ETHER_IS_BROADCAST(addr) \
(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
#endif
static int
gif_clone_create(ifc, unit, params)
struct if_clone *ifc;
int unit;
caddr_t params;
{
+ INIT_VNET_GIF(curvnet);
struct gif_softc *sc;
sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO);
sc->gif_fibnum = curthread->td_proc->p_fibnum;
GIF2IFP(sc) = if_alloc(IFT_GIF);
if (GIF2IFP(sc) == NULL) {
free(sc, M_GIF);
return (ENOSPC);
}
GIF_LOCK_INIT(sc);
GIF2IFP(sc)->if_softc = sc;
if_initname(GIF2IFP(sc), ifc->ifc_name, unit);
sc->encap_cookie4 = sc->encap_cookie6 = NULL;
GIF2IFP(sc)->if_addrlen = 0;
GIF2IFP(sc)->if_mtu = GIF_MTU;
GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
#if 0
/* turn off ingress filter */
GIF2IFP(sc)->if_flags |= IFF_LINK2;
#endif
GIF2IFP(sc)->if_ioctl = gif_ioctl;
GIF2IFP(sc)->if_start = gif_start;
GIF2IFP(sc)->if_output = gif_output;
GIF2IFP(sc)->if_snd.ifq_maxlen = IFQ_MAXLEN;
if_attach(GIF2IFP(sc));
bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t));
if (ng_gif_attach_p != NULL)
(*ng_gif_attach_p)(GIF2IFP(sc));
mtx_lock(&gif_mtx);
LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list);
mtx_unlock(&gif_mtx);
return (0);
}
static void
gif_clone_destroy(ifp)
struct ifnet *ifp;
{
int err;
struct gif_softc *sc = ifp->if_softc;
mtx_lock(&gif_mtx);
LIST_REMOVE(sc, gif_list);
mtx_unlock(&gif_mtx);
gif_delete_tunnel(ifp);
#ifdef INET6
if (sc->encap_cookie6 != NULL) {
err = encap_detach(sc->encap_cookie6);
KASSERT(err == 0, ("Unexpected error detaching encap_cookie6"));
}
#endif
#ifdef INET
if (sc->encap_cookie4 != NULL) {
err = encap_detach(sc->encap_cookie4);
KASSERT(err == 0, ("Unexpected error detaching encap_cookie4"));
}
#endif
if (ng_gif_detach_p != NULL)
(*ng_gif_detach_p)(ifp);
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
GIF_LOCK_DESTROY(sc);
free(sc, M_GIF);
}
static int
gifmodevent(mod, type, data)
module_t mod;
int type;
void *data;
{
switch (type) {
case MOD_LOAD:
mtx_init(&gif_mtx, "gif_mtx", NULL, MTX_DEF);
LIST_INIT(&V_gif_softc_list);
if_clone_attach(&gif_cloner);
#ifdef INET6
V_ip6_gif_hlim = GIF_HLIM;
#endif
break;
case MOD_UNLOAD:
if_clone_detach(&gif_cloner);
mtx_destroy(&gif_mtx);
#ifdef INET6
V_ip6_gif_hlim = 0;
#endif
break;
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t gif_mod = {
"if_gif",
gifmodevent,
0
};
DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_gif, 1);
int
gif_encapcheck(m, off, proto, arg)
const struct mbuf *m;
int off;
int proto;
void *arg;
{
struct ip ip;
struct gif_softc *sc;
sc = (struct gif_softc *)arg;
if (sc == NULL)
return 0;
if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0)
return 0;
/* no physical address */
if (!sc->gif_psrc || !sc->gif_pdst)
return 0;
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
break;
#endif
#ifdef INET6
case IPPROTO_IPV6:
break;
#endif
case IPPROTO_ETHERIP:
break;
default:
return 0;
}
/* Bail on short packets */
if (m->m_pkthdr.len < sizeof(ip))
return 0;
m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
switch (ip.ip_v) {
#ifdef INET
case 4:
if (sc->gif_psrc->sa_family != AF_INET ||
sc->gif_pdst->sa_family != AF_INET)
return 0;
return gif_encapcheck4(m, off, proto, arg);
#endif
#ifdef INET6
case 6:
if (m->m_pkthdr.len < sizeof(struct ip6_hdr))
return 0;
if (sc->gif_psrc->sa_family != AF_INET6 ||
sc->gif_pdst->sa_family != AF_INET6)
return 0;
return gif_encapcheck6(m, off, proto, arg);
#endif
default:
return 0;
}
}
static void
gif_start(struct ifnet *ifp)
{
struct gif_softc *sc;
struct mbuf *m;
sc = ifp->if_softc;
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
for (;;) {
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == 0)
break;
gif_output(ifp, m, sc->gif_pdst, NULL);
}
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
return;
}
int
gif_output(ifp, m, dst, rt)
struct ifnet *ifp;
struct mbuf *m;
struct sockaddr *dst;
struct rtentry *rt; /* added in net2 */
{
+ INIT_VNET_GIF(ifp->if_vnet);
struct gif_softc *sc = ifp->if_softc;
struct m_tag *mtag;
int error = 0;
int gif_called;
u_int32_t af;
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error) {
m_freem(m);
goto end;
}
#endif
/*
* gif may cause infinite recursive calls when misconfigured.
* We'll prevent this by detecting loops.
*
* A high nesting level may cause stack exhaustion.
* We'll prevent this by introducing an upper limit.
*/
gif_called = 1;
mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, NULL);
while (mtag != NULL) {
if (*(struct ifnet **)(mtag + 1) == ifp) {
log(LOG_NOTICE,
"gif_output: loop detected on %s\n",
(*(struct ifnet **)(mtag + 1))->if_xname);
m_freem(m);
error = EIO; /* is there a better errno? */
goto end;
}
mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag);
gif_called++;
}
if (gif_called > V_max_gif_nesting) {
log(LOG_NOTICE,
"gif_output: recursively called too many times(%d)\n",
gif_called);
m_freem(m);
error = EIO; /* is there a better errno? */
goto end;
}
mtag = m_tag_alloc(MTAG_GIF, MTAG_GIF_CALLED, sizeof(struct ifnet *),
M_NOWAIT);
if (mtag == NULL) {
m_freem(m);
error = ENOMEM;
goto end;
}
*(struct ifnet **)(mtag + 1) = ifp;
m_tag_prepend(m, mtag);
m->m_flags &= ~(M_BCAST|M_MCAST);
GIF_LOCK(sc);
if (!(ifp->if_flags & IFF_UP) ||
sc->gif_psrc == NULL || sc->gif_pdst == NULL) {
GIF_UNLOCK(sc);
m_freem(m);
error = ENETDOWN;
goto end;
}
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
af = dst->sa_family;
BPF_MTAP2(ifp, &af, sizeof(af), m);
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
/* override to IPPROTO_ETHERIP for bridged traffic */
if (ifp->if_bridge)
af = AF_LINK;
M_SETFIB(m, sc->gif_fibnum);
/* inner AF-specific encapsulation */
/* XXX should we check if our outer source is legal? */
/* dispatch to output logic based on outer AF */
switch (sc->gif_psrc->sa_family) {
#ifdef INET
case AF_INET:
error = in_gif_output(ifp, af, m);
break;
#endif
#ifdef INET6
case AF_INET6:
error = in6_gif_output(ifp, af, m);
break;
#endif
default:
m_freem(m);
error = ENETDOWN;
}
GIF_UNLOCK(sc);
end:
if (error)
ifp->if_oerrors++;
return (error);
}
void
gif_input(m, af, ifp)
struct mbuf *m;
int af;
struct ifnet *ifp;
{
int isr, n;
struct etherip_header *eip;
struct ether_header *eh;
struct ifnet *oldifp;
if (ifp == NULL) {
/* just in case */
m_freem(m);
return;
}
m->m_pkthdr.rcvif = ifp;
#ifdef MAC
mac_ifnet_create_mbuf(ifp, m);
#endif
if (bpf_peers_present(ifp->if_bpf)) {
u_int32_t af1 = af;
bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m);
}
if (ng_gif_input_p != NULL) {
(*ng_gif_input_p)(ifp, &m, af);
if (m == NULL)
return;
}
/*
* Put the packet on the network layer input queue according to the
* specified address family.
* Note: older versions of gif_input directly called network layer
* input functions, e.g. ip6_input, here. We changed the policy to
* prevent too many recursive calls of such input functions, which
* might cause a kernel panic. The change may introduce another
* problem: if the input queue is full, packets are discarded.
* But kernel stack overflows really did happen, while a full input
* queue occurs only rarely, so we changed the policy.
*/
switch (af) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
isr = NETISR_IPV6;
break;
#endif
case AF_LINK:
n = sizeof(struct etherip_header) + sizeof(struct ether_header);
if (n > m->m_len) {
m = m_pullup(m, n);
if (m == NULL) {
ifp->if_ierrors++;
return;
}
}
eip = mtod(m, struct etherip_header *);
if (eip->eip_ver !=
(ETHERIP_VERSION & ETHERIP_VER_VERS_MASK)) {
/* discard unknown versions */
m_freem(m);
return;
}
m_adj(m, sizeof(struct etherip_header));
m->m_flags &= ~(M_BCAST|M_MCAST);
m->m_pkthdr.rcvif = ifp;
if (ifp->if_bridge) {
oldifp = ifp;
eh = mtod(m, struct ether_header *);
if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
if (ETHER_IS_BROADCAST(eh->ether_dhost))
m->m_flags |= M_BCAST;
else
m->m_flags |= M_MCAST;
ifp->if_imcasts++;
}
BRIDGE_INPUT(ifp, m);
if (m != NULL && ifp != oldifp) {
/*
* The bridge gave us back itself or one of the
* members to which the frame is addressed.
*/
ether_demux(ifp, m);
return;
}
}
if (m != NULL)
m_freem(m);
return;
default:
if (ng_gif_input_orphan_p != NULL)
(*ng_gif_input_orphan_p)(ifp, m, af);
else
m_freem(m);
return;
}
ifp->if_ipackets++;
ifp->if_ibytes += m->m_pkthdr.len;
netisr_dispatch(isr, m);
}
/* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */
int
gif_ioctl(ifp, cmd, data)
struct ifnet *ifp;
u_long cmd;
caddr_t data;
{
struct gif_softc *sc = ifp->if_softc;
struct ifreq *ifr = (struct ifreq*)data;
int error = 0, size;
struct sockaddr *dst, *src;
#ifdef SIOCSIFMTU /* xxx */
u_long mtu;
#endif
switch (cmd) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
break;
case SIOCSIFDSTADDR:
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
#ifdef SIOCSIFMTU /* xxx */
case SIOCGIFMTU:
break;
case SIOCSIFMTU:
mtu = ifr->ifr_mtu;
if (mtu < GIF_MTU_MIN || mtu > GIF_MTU_MAX)
return (EINVAL);
ifp->if_mtu = mtu;
break;
#endif /* SIOCSIFMTU */
#ifdef INET
case SIOCSIFPHYADDR:
#endif
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif /* INET6 */
case SIOCSLIFPHYADDR:
switch (cmd) {
#ifdef INET
case SIOCSIFPHYADDR:
src = (struct sockaddr *)
&(((struct in_aliasreq *)data)->ifra_addr);
dst = (struct sockaddr *)
&(((struct in_aliasreq *)data)->ifra_dstaddr);
break;
#endif
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
src = (struct sockaddr *)
&(((struct in6_aliasreq *)data)->ifra_addr);
dst = (struct sockaddr *)
&(((struct in6_aliasreq *)data)->ifra_dstaddr);
break;
#endif
case SIOCSLIFPHYADDR:
src = (struct sockaddr *)
&(((struct if_laddrreq *)data)->addr);
dst = (struct sockaddr *)
&(((struct if_laddrreq *)data)->dstaddr);
break;
default:
return EINVAL;
}
/* sa_family must be equal */
if (src->sa_family != dst->sa_family)
return EINVAL;
/* validate sa_len */
switch (src->sa_family) {
#ifdef INET
case AF_INET:
if (src->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
break;
#endif
#ifdef INET6
case AF_INET6:
if (src->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
break;
#endif
default:
return EAFNOSUPPORT;
}
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
if (dst->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
break;
#endif
#ifdef INET6
case AF_INET6:
if (dst->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
break;
#endif
default:
return EAFNOSUPPORT;
}
/* check sa_family looks sane for the cmd */
switch (cmd) {
case SIOCSIFPHYADDR:
if (src->sa_family == AF_INET)
break;
return EAFNOSUPPORT;
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
if (src->sa_family == AF_INET6)
break;
return EAFNOSUPPORT;
#endif /* INET6 */
case SIOCSLIFPHYADDR:
/* checks done in the above */
break;
}
error = gif_set_tunnel(GIF2IFP(sc), src, dst);
break;
#ifdef SIOCDIFPHYADDR
case SIOCDIFPHYADDR:
gif_delete_tunnel(GIF2IFP(sc));
break;
#endif
case SIOCGIFPSRCADDR:
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
#endif /* INET6 */
if (sc->gif_psrc == NULL) {
error = EADDRNOTAVAIL;
goto bad;
}
src = sc->gif_psrc;
switch (cmd) {
#ifdef INET
case SIOCGIFPSRCADDR:
dst = &ifr->ifr_addr;
size = sizeof(ifr->ifr_addr);
break;
#endif /* INET */
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
dst = (struct sockaddr *)
&(((struct in6_ifreq *)data)->ifr_addr);
size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
break;
#endif /* INET6 */
default:
error = EADDRNOTAVAIL;
goto bad;
}
if (src->sa_len > size)
return EINVAL;
bcopy((caddr_t)src, (caddr_t)dst, src->sa_len);
#ifdef INET6
if (dst->sa_family == AF_INET6) {
error = sa6_recoverscope((struct sockaddr_in6 *)dst);
if (error != 0)
return (error);
}
#endif
break;
case SIOCGIFPDSTADDR:
#ifdef INET6
case SIOCGIFPDSTADDR_IN6:
#endif /* INET6 */
if (sc->gif_pdst == NULL) {
error = EADDRNOTAVAIL;
goto bad;
}
src = sc->gif_pdst;
switch (cmd) {
#ifdef INET
case SIOCGIFPDSTADDR:
dst = &ifr->ifr_addr;
size = sizeof(ifr->ifr_addr);
break;
#endif /* INET */
#ifdef INET6
case SIOCGIFPDSTADDR_IN6:
dst = (struct sockaddr *)
&(((struct in6_ifreq *)data)->ifr_addr);
size = sizeof(((struct in6_ifreq *)data)->ifr_addr);
break;
#endif /* INET6 */
default:
error = EADDRNOTAVAIL;
goto bad;
}
if (src->sa_len > size)
return EINVAL;
bcopy((caddr_t)src, (caddr_t)dst, src->sa_len);
#ifdef INET6
if (dst->sa_family == AF_INET6) {
error = sa6_recoverscope((struct sockaddr_in6 *)dst);
if (error != 0)
return (error);
}
#endif
break;
case SIOCGLIFPHYADDR:
if (sc->gif_psrc == NULL || sc->gif_pdst == NULL) {
error = EADDRNOTAVAIL;
goto bad;
}
/* copy src */
src = sc->gif_psrc;
dst = (struct sockaddr *)
&(((struct if_laddrreq *)data)->addr);
size = sizeof(((struct if_laddrreq *)data)->addr);
if (src->sa_len > size)
return EINVAL;
bcopy((caddr_t)src, (caddr_t)dst, src->sa_len);
/* copy dst */
src = sc->gif_pdst;
dst = (struct sockaddr *)
&(((struct if_laddrreq *)data)->dstaddr);
size = sizeof(((struct if_laddrreq *)data)->dstaddr);
if (src->sa_len > size)
return EINVAL;
bcopy((caddr_t)src, (caddr_t)dst, src->sa_len);
break;
case SIOCSIFFLAGS:
/* if_ioctl() takes care of it */
break;
default:
error = EINVAL;
break;
}
bad:
return error;
}
/*
* XXXRW: There's a general event-ordering issue here: the code to check
* if a given tunnel is already present happens before we perform a
* potentially blocking setup of the tunnel. This code needs to be
* re-ordered so that the check and replacement can be atomic using
* a mutex.
*/
int
gif_set_tunnel(ifp, src, dst)
struct ifnet *ifp;
struct sockaddr *src;
struct sockaddr *dst;
{
+ INIT_VNET_GIF(ifp->if_vnet);
struct gif_softc *sc = ifp->if_softc;
struct gif_softc *sc2;
struct sockaddr *osrc, *odst, *sa;
int error = 0;
mtx_lock(&gif_mtx);
LIST_FOREACH(sc2, &V_gif_softc_list, gif_list) {
if (sc2 == sc)
continue;
if (!sc2->gif_pdst || !sc2->gif_psrc)
continue;
if (sc2->gif_pdst->sa_family != dst->sa_family ||
sc2->gif_pdst->sa_len != dst->sa_len ||
sc2->gif_psrc->sa_family != src->sa_family ||
sc2->gif_psrc->sa_len != src->sa_len)
continue;
/*
* Disallow parallel tunnels unless instructed
* otherwise.
*/
if (!V_parallel_tunnels &&
bcmp(sc2->gif_pdst, dst, dst->sa_len) == 0 &&
bcmp(sc2->gif_psrc, src, src->sa_len) == 0) {
error = EADDRNOTAVAIL;
mtx_unlock(&gif_mtx);
goto bad;
}
/* XXX both ends must be valid? (I mean, not 0.0.0.0) */
}
mtx_unlock(&gif_mtx);
/* XXX we can detach from both, but be polite just in case */
if (sc->gif_psrc)
switch (sc->gif_psrc->sa_family) {
#ifdef INET
case AF_INET:
(void)in_gif_detach(sc);
break;
#endif
#ifdef INET6
case AF_INET6:
(void)in6_gif_detach(sc);
break;
#endif
}
osrc = sc->gif_psrc;
sa = (struct sockaddr *)malloc(src->sa_len, M_IFADDR, M_WAITOK);
bcopy((caddr_t)src, (caddr_t)sa, src->sa_len);
sc->gif_psrc = sa;
odst = sc->gif_pdst;
sa = (struct sockaddr *)malloc(dst->sa_len, M_IFADDR, M_WAITOK);
bcopy((caddr_t)dst, (caddr_t)sa, dst->sa_len);
sc->gif_pdst = sa;
switch (sc->gif_psrc->sa_family) {
#ifdef INET
case AF_INET:
error = in_gif_attach(sc);
break;
#endif
#ifdef INET6
case AF_INET6:
/*
* Check validity of the scope zone ID of the addresses, and
* convert it into the kernel internal form if necessary.
*/
error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_psrc, 0);
if (error != 0)
break;
error = sa6_embedscope((struct sockaddr_in6 *)sc->gif_pdst, 0);
if (error != 0)
break;
error = in6_gif_attach(sc);
break;
#endif
}
if (error) {
/* rollback */
free((caddr_t)sc->gif_psrc, M_IFADDR);
free((caddr_t)sc->gif_pdst, M_IFADDR);
sc->gif_psrc = osrc;
sc->gif_pdst = odst;
goto bad;
}
if (osrc)
free((caddr_t)osrc, M_IFADDR);
if (odst)
free((caddr_t)odst, M_IFADDR);
bad:
if (sc->gif_psrc && sc->gif_pdst)
ifp->if_drv_flags |= IFF_DRV_RUNNING;
else
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
return error;
}
void
gif_delete_tunnel(ifp)
struct ifnet *ifp;
{
struct gif_softc *sc = ifp->if_softc;
if (sc->gif_psrc) {
free((caddr_t)sc->gif_psrc, M_IFADDR);
sc->gif_psrc = NULL;
}
if (sc->gif_pdst) {
free((caddr_t)sc->gif_pdst, M_IFADDR);
sc->gif_pdst = NULL;
}
/* it is safe to detach from both */
#ifdef INET
(void)in_gif_detach(sc);
#endif
#ifdef INET6
(void)in6_gif_detach(sc);
#endif
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}
Index: head/sys/net/if_gif.h
===================================================================
--- head/sys/net/if_gif.h (revision 183549)
+++ head/sys/net/if_gif.h (revision 183550)
@@ -1,115 +1,139 @@
/* $FreeBSD$ */
/* $KAME: if_gif.h,v 1.17 2000/09/11 11:36:41 sumikawa Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* if_gif.h
*/
#ifndef _NET_IF_GIF_H_
#define _NET_IF_GIF_H_
#ifdef _KERNEL
#include "opt_inet.h"
#include "opt_inet6.h"
#include <netinet/in.h>
/* XXX sigh, why does this have a struct route instead of a pointer? */
struct encaptab;
extern void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp,
int af);
extern void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m,
int af);
extern int (*ng_gif_output_p)(struct ifnet *ifp, struct mbuf **mp);
extern void (*ng_gif_attach_p)(struct ifnet *ifp);
extern void (*ng_gif_detach_p)(struct ifnet *ifp);
struct gif_softc {
struct ifnet *gif_ifp;
struct mtx gif_mtx;
struct sockaddr *gif_psrc; /* Physical src addr */
struct sockaddr *gif_pdst; /* Physical dst addr */
union {
struct route gifscr_ro; /* xxx */
#ifdef INET6
struct route_in6 gifscr_ro6; /* xxx */
#endif
} gifsc_gifscr;
int gif_flags;
u_int gif_fibnum;
const struct encaptab *encap_cookie4;
const struct encaptab *encap_cookie6;
void *gif_netgraph; /* ng_gif(4) netgraph node info */
LIST_ENTRY(gif_softc) gif_list; /* all gif's are linked */
};
#define GIF2IFP(sc) ((sc)->gif_ifp)
#define GIF_LOCK_INIT(sc) mtx_init(&(sc)->gif_mtx, "gif softc", \
NULL, MTX_DEF)
#define GIF_LOCK_DESTROY(sc) mtx_destroy(&(sc)->gif_mtx)
#define GIF_LOCK(sc) mtx_lock(&(sc)->gif_mtx)
#define GIF_UNLOCK(sc) mtx_unlock(&(sc)->gif_mtx)
#define GIF_LOCK_ASSERT(sc) mtx_assert(&(sc)->gif_mtx, MA_OWNED)
#define gif_ro gifsc_gifscr.gifscr_ro
#ifdef INET6
#define gif_ro6 gifsc_gifscr.gifscr_ro6
#endif
#define GIF_MTU (1280) /* Default MTU */
#define GIF_MTU_MIN (1280) /* Minimum MTU */
#define GIF_MTU_MAX (8192) /* Maximum MTU */
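/* 1280 bytes is the IPv6 minimum link MTU (RFC 2460), a safe tunnel default. */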
#define MTAG_GIF 1080679712
#define MTAG_GIF_CALLED 0
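/*
 * gif_output() prepends an (MTAG_GIF, MTAG_GIF_CALLED) tag carrying the
 * transmitting ifnet pointer to every packet it handles: finding our own
 * ifp in an existing tag detects a loop, and the number of accumulated
 * tags bounds the nesting depth.
 */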
struct etherip_header {
u_int8_t eip_ver; /* version/reserved */
u_int8_t eip_pad; /* required padding byte */
};
#define ETHERIP_VER_VERS_MASK 0x0f
#define ETHERIP_VER_RSVD_MASK 0xf0
#define ETHERIP_VERSION 0x03
/* Prototypes */
void gif_input(struct mbuf *, int, struct ifnet *);
int gif_output(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
int gif_ioctl(struct ifnet *, u_long, caddr_t);
int gif_set_tunnel(struct ifnet *, struct sockaddr *, struct sockaddr *);
void gif_delete_tunnel(struct ifnet *);
int gif_encapcheck(const struct mbuf *, int, int, void *);
+/*
+ * Virtualization support
+ */
+#ifdef VIMAGE
+struct vnet_gif {
+ LIST_HEAD(, gif_softc) _gif_softc_list;
+ int _max_gif_nesting;
+ int _parallel_tunnels;
+ int _ip_gif_ttl;
+ int _ip6_gif_hlim;
+};
+#endif
+
+#define INIT_VNET_GIF(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_GIF, struct vnet_gif, vnet_gif)
+
+#define VNET_GIF(sym) VSYM(vnet_gif, sym)
+
+#define V_gif_softc_list VNET_GIF(gif_softc_list)
+#define V_max_gif_nesting VNET_GIF(max_gif_nesting)
+#define V_parallel_tunnels VNET_GIF(parallel_tunnels)
+#define V_ip_gif_ttl VNET_GIF(ip_gif_ttl)
+#define V_ip6_gif_hlim VNET_GIF(ip6_gif_hlim)
+
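+/*
+ * Under VIMAGE the V_ macros above resolve through the per-vnet
+ * vnet_gif structure selected by INIT_VNET_GIF(); in a kernel built
+ * without VIMAGE they collapse to the plain global symbols.
+ */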
#endif /* _KERNEL */
#endif /* _NET_IF_GIF_H_ */
Index: head/sys/net/if_gre.c
===================================================================
--- head/sys/net/if_gre.c (revision 183549)
+++ head/sys/net/if_gre.c (revision 183550)
@@ -1,906 +1,909 @@
/* $NetBSD: if_gre.c,v 1.49 2003/12/11 00:22:29 itojun Exp $ */
/* $FreeBSD$ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Heiko W.Rupp <hwr@pilhuhn.de>
*
* IPv6-over-GRE contributed by Gert Doering <gert@greenie.muc.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Encapsulate L3 protocols into IP
* See RFC 2784 (successor of RFC 1701 and 1702) for more details.
* If_gre is compatible with Cisco GRE tunnels, so you can
* have a NetBSD box as the other end of a tunnel interface of a Cisco
* router. See gre(4) for more details.
* Also supported: IP-in-IP encapsulation (proto 55) per RFC 2004
*/
#include "opt_atalk.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_gre.h>
#include <netinet/ip_var.h>
#include <netinet/ip_encap.h>
#else
#error "Huh? if_gre without inet?"
#endif
#include <net/bpf.h>
#include <net/if_gre.h>
/*
* It is not easy to calculate the right value for a GRE MTU.
* We leave this task to the admin and use the same default that
* other vendors use.
*/
#define GREMTU 1476
#define GRENAME "gre"
/*
* gre_mtx protects all global variables in if_gre.c.
* XXX: gre_softc data not protected yet.
*/
struct mtx gre_mtx;
static MALLOC_DEFINE(M_GRE, GRENAME, "Generic Routing Encapsulation");
struct gre_softc_head gre_softc_list;
static int gre_clone_create(struct if_clone *, int, caddr_t);
static void gre_clone_destroy(struct ifnet *);
static int gre_ioctl(struct ifnet *, u_long, caddr_t);
static int gre_output(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *rt);
IFC_SIMPLE_DECLARE(gre, 0);
static int gre_compute_route(struct gre_softc *sc);
static void greattach(void);
#ifdef INET
extern struct domain inetdomain;
static const struct protosw in_gre_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_GRE,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = gre_input,
.pr_output = (pr_output_t *)rip_output,
.pr_ctlinput = rip_ctlinput,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
static const struct protosw in_mobile_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_MOBILE,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = gre_mobile_input,
.pr_output = (pr_output_t *)rip_output,
.pr_ctlinput = rip_ctlinput,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
#endif
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_TUNNEL, gre, CTLFLAG_RW, 0,
"Generic Routing Encapsulation");
#ifndef MAX_GRE_NEST
/*
* This macro controls the default upper limit on nesting of gre tunnels.
* Since setting a large value here with a careless configuration
* may cause a system crash, we don't allow any nesting by default.
* If you need to configure nested gre tunnels, you can define this macro
* in your kernel configuration file. However, if you do so, please be
* careful to configure the tunnels so that they won't form a loop.
*/
#define MAX_GRE_NEST 1
#endif
static int max_gre_nesting = MAX_GRE_NEST;
SYSCTL_INT(_net_link_gre, OID_AUTO, max_nesting, CTLFLAG_RW,
&max_gre_nesting, 0, "Max nested tunnels");
/* ARGSUSED */
static void
greattach(void)
{
mtx_init(&gre_mtx, "gre_mtx", NULL, MTX_DEF);
LIST_INIT(&gre_softc_list);
if_clone_attach(&gre_cloner);
}
static int
gre_clone_create(ifc, unit, params)
struct if_clone *ifc;
int unit;
caddr_t params;
{
struct gre_softc *sc;
sc = malloc(sizeof(struct gre_softc), M_GRE, M_WAITOK | M_ZERO);
GRE2IFP(sc) = if_alloc(IFT_TUNNEL);
if (GRE2IFP(sc) == NULL) {
free(sc, M_GRE);
return (ENOSPC);
}
GRE2IFP(sc)->if_softc = sc;
if_initname(GRE2IFP(sc), ifc->ifc_name, unit);
GRE2IFP(sc)->if_snd.ifq_maxlen = IFQ_MAXLEN;
GRE2IFP(sc)->if_addrlen = 0;
GRE2IFP(sc)->if_hdrlen = 24; /* IP + GRE */
GRE2IFP(sc)->if_mtu = GREMTU;
GRE2IFP(sc)->if_flags = IFF_POINTOPOINT|IFF_MULTICAST;
GRE2IFP(sc)->if_output = gre_output;
GRE2IFP(sc)->if_ioctl = gre_ioctl;
sc->g_dst.s_addr = sc->g_src.s_addr = INADDR_ANY;
sc->g_proto = IPPROTO_GRE;
GRE2IFP(sc)->if_flags |= IFF_LINK0;
sc->encap = NULL;
sc->called = 0;
sc->gre_fibnum = curthread->td_proc->p_fibnum;
sc->wccp_ver = WCCP_V1;
sc->key = 0;
if_attach(GRE2IFP(sc));
bpfattach(GRE2IFP(sc), DLT_NULL, sizeof(u_int32_t));
mtx_lock(&gre_mtx);
LIST_INSERT_HEAD(&gre_softc_list, sc, sc_list);
mtx_unlock(&gre_mtx);
return (0);
}
static void
gre_clone_destroy(ifp)
struct ifnet *ifp;
{
struct gre_softc *sc = ifp->if_softc;
mtx_lock(&gre_mtx);
LIST_REMOVE(sc, sc_list);
mtx_unlock(&gre_mtx);
#ifdef INET
if (sc->encap != NULL)
encap_detach(sc->encap);
#endif
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
free(sc, M_GRE);
}
/*
* The output routine. Takes a packet and encapsulates it in the protocol
* given by sc->g_proto. See also RFC 1701 and RFC 2004
*/
static int
gre_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt)
{
+#ifdef INET6
+ INIT_VNET_INET(ifp->if_vnet);
+#endif
int error = 0;
struct gre_softc *sc = ifp->if_softc;
struct greip *gh;
struct ip *ip;
u_short gre_ip_id = 0;
uint8_t gre_ip_tos = 0;
u_int16_t etype = 0;
struct mobile_h mob_h;
u_int32_t af;
int extra = 0;
/*
* gre may cause infinite recursive calls when misconfigured.
* We'll prevent this by introducing an upper limit.
*/
if (++(sc->called) > max_gre_nesting) {
printf("%s: gre_output: recursively called too many "
"times(%d)\n", if_name(GRE2IFP(sc)), sc->called);
m_freem(m);
error = EIO; /* is there a better errno? */
goto end;
}
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)) ||
sc->g_src.s_addr == INADDR_ANY || sc->g_dst.s_addr == INADDR_ANY) {
m_freem(m);
error = ENETDOWN;
goto end;
}
gh = NULL;
ip = NULL;
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
if (bpf_peers_present(ifp->if_bpf)) {
af = dst->sa_family;
bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m);
}
m->m_flags &= ~(M_BCAST|M_MCAST);
if (sc->g_proto == IPPROTO_MOBILE) {
if (dst->sa_family == AF_INET) {
struct mbuf *m0;
int msiz;
ip = mtod(m, struct ip *);
/*
* RFC 2004 specifies that fragmented datagrams shouldn't
* be encapsulated.
*/
if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
_IF_DROP(&ifp->if_snd);
m_freem(m);
error = EINVAL; /* is there a better errno? */
goto end;
}
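/*
 * Build the RFC 2004 minimal forwarding header. MOB_H_SIZ_S omits
 * the original source address field; MOB_H_SIZ_L includes it, with
 * its presence signalled by the S bit in the protocol field.
 */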
memset(&mob_h, 0, MOB_H_SIZ_L);
mob_h.proto = (ip->ip_p) << 8;
mob_h.odst = ip->ip_dst.s_addr;
ip->ip_dst.s_addr = sc->g_dst.s_addr;
/*
* If the packet comes from our host, we only change
* the destination address in the IP header.
* Else we also need to save and change the source address.
*/
if (in_hosteq(ip->ip_src, sc->g_src)) {
msiz = MOB_H_SIZ_S;
} else {
mob_h.proto |= MOB_H_SBIT;
mob_h.osrc = ip->ip_src.s_addr;
ip->ip_src.s_addr = sc->g_src.s_addr;
msiz = MOB_H_SIZ_L;
}
mob_h.proto = htons(mob_h.proto);
mob_h.hcrc = gre_in_cksum((u_int16_t *)&mob_h, msiz);
if ((m->m_data - msiz) < m->m_pktdat) {
/* need new mbuf */
MGETHDR(m0, M_DONTWAIT, MT_DATA);
if (m0 == NULL) {
_IF_DROP(&ifp->if_snd);
m_freem(m);
error = ENOBUFS;
goto end;
}
m0->m_next = m;
m->m_data += sizeof(struct ip);
m->m_len -= sizeof(struct ip);
m0->m_pkthdr.len = m->m_pkthdr.len + msiz;
m0->m_len = msiz + sizeof(struct ip);
m0->m_data += max_linkhdr;
memcpy(mtod(m0, caddr_t), (caddr_t)ip,
sizeof(struct ip));
m = m0;
} else { /* we have some space left in the old one */
m->m_data -= msiz;
m->m_len += msiz;
m->m_pkthdr.len += msiz;
bcopy(ip, mtod(m, caddr_t),
sizeof(struct ip));
}
ip = mtod(m, struct ip *);
memcpy((caddr_t)(ip + 1), &mob_h, (unsigned)msiz);
ip->ip_len = ntohs(ip->ip_len) + msiz;
} else { /* AF_INET */
_IF_DROP(&ifp->if_snd);
m_freem(m);
error = EINVAL;
goto end;
}
} else if (sc->g_proto == IPPROTO_GRE) {
switch (dst->sa_family) {
case AF_INET:
ip = mtod(m, struct ip *);
gre_ip_tos = ip->ip_tos;
gre_ip_id = ip->ip_id;
if (sc->wccp_ver == WCCP_V2) {
extra = sizeof(uint32_t);
etype = WCCP_PROTOCOL_TYPE;
} else {
etype = ETHERTYPE_IP;
}
break;
#ifdef INET6
case AF_INET6:
gre_ip_id = ip_newid();
etype = ETHERTYPE_IPV6;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
etype = ETHERTYPE_ATALK;
break;
#endif
default:
_IF_DROP(&ifp->if_snd);
m_freem(m);
error = EAFNOSUPPORT;
goto end;
}
/* Reserve space for GRE header + optional GRE key */
int hdrlen = sizeof(struct greip) + extra;
if (sc->key)
hdrlen += sizeof(uint32_t);
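/*
 * The resulting layout is: outer IP header, GRE flag/protocol-type
 * words, optional 4-byte key (GRE_KP set, see RFC 2890), then the
 * original payload.
 */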
M_PREPEND(m, hdrlen, M_DONTWAIT);
} else {
_IF_DROP(&ifp->if_snd);
m_freem(m);
error = EINVAL;
goto end;
}
if (m == NULL) { /* mbuf allocation failed */
_IF_DROP(&ifp->if_snd);
error = ENOBUFS;
goto end;
}
M_SETFIB(m, sc->gre_fibnum); /* The envelope may use a different FIB */
gh = mtod(m, struct greip *);
if (sc->g_proto == IPPROTO_GRE) {
uint32_t *options = gh->gi_options;
memset((void *)gh, 0, sizeof(struct greip) + extra);
gh->gi_ptype = htons(etype);
gh->gi_flags = 0;
/* Add key option */
if (sc->key)
{
gh->gi_flags |= htons(GRE_KP);
*(options++) = htonl(sc->key);
}
}
gh->gi_pr = sc->g_proto;
if (sc->g_proto != IPPROTO_MOBILE) {
gh->gi_src = sc->g_src;
gh->gi_dst = sc->g_dst;
((struct ip*)gh)->ip_v = IPPROTO_IPV4;
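/* XXX IPPROTO_IPV4 == 4 == IPVERSION, so the line above sets the IP version field. */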
((struct ip*)gh)->ip_hl = (sizeof(struct ip)) >> 2;
((struct ip*)gh)->ip_ttl = GRE_TTL;
((struct ip*)gh)->ip_tos = gre_ip_tos;
((struct ip*)gh)->ip_id = gre_ip_id;
gh->gi_len = m->m_pkthdr.len;
}
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
/*
* Send it off with the IP_FORWARDING flag to prevent ip_output()
* from overwriting the ip_id again; ip_id is already set to the
* ip_id of the encapsulated packet.
*/
error = ip_output(m, NULL, &sc->route, IP_FORWARDING,
(struct ip_moptions *)NULL, (struct inpcb *)NULL);
end:
sc->called = 0;
if (error)
ifp->if_oerrors++;
return (error);
}
static int
gre_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct ifreq *ifr = (struct ifreq *)data;
struct if_laddrreq *lifr = (struct if_laddrreq *)data;
struct in_aliasreq *aifr = (struct in_aliasreq *)data;
struct gre_softc *sc = ifp->if_softc;
int s;
struct sockaddr_in si;
struct sockaddr *sa = NULL;
int error, adj;
struct sockaddr_in sp, sm, dp, dm;
uint32_t key;
error = 0;
adj = 0;
s = splnet();
switch (cmd) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
break;
case SIOCSIFDSTADDR:
break;
case SIOCSIFFLAGS:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_SETIFFLAGS)) != 0)
break;
if ((ifr->ifr_flags & IFF_LINK0) != 0)
sc->g_proto = IPPROTO_GRE;
else
sc->g_proto = IPPROTO_MOBILE;
if ((ifr->ifr_flags & IFF_LINK2) != 0)
sc->wccp_ver = WCCP_V2;
else
sc->wccp_ver = WCCP_V1;
goto recompute;
case SIOCSIFMTU:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_SETIFMTU)) != 0)
break;
if (ifr->ifr_mtu < 576) {
error = EINVAL;
break;
}
ifp->if_mtu = ifr->ifr_mtu;
break;
case SIOCGIFMTU:
ifr->ifr_mtu = GRE2IFP(sc)->if_mtu;
break;
case SIOCADDMULTI:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_ADDMULTI)) != 0)
break;
if (ifr == 0) {
error = EAFNOSUPPORT;
break;
}
switch (ifr->ifr_addr.sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
case SIOCDELMULTI:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_DELIFGROUP)) != 0)
break;
if (ifr == 0) {
error = EAFNOSUPPORT;
break;
}
switch (ifr->ifr_addr.sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
case GRESPROTO:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0)
break;
sc->g_proto = ifr->ifr_flags;
switch (sc->g_proto) {
case IPPROTO_GRE:
ifp->if_flags |= IFF_LINK0;
break;
case IPPROTO_MOBILE:
ifp->if_flags &= ~IFF_LINK0;
break;
default:
error = EPROTONOSUPPORT;
break;
}
goto recompute;
case GREGPROTO:
ifr->ifr_flags = sc->g_proto;
break;
case GRESADDRS:
case GRESADDRD:
error = priv_check(curthread, PRIV_NET_GRE);
if (error)
return (error);
/*
* set tunnel endpoints, compute a less specific route
* to the remote end and mark it as up
*/
sa = &ifr->ifr_addr;
if (cmd == GRESADDRS)
sc->g_src = (satosin(sa))->sin_addr;
if (cmd == GRESADDRD)
sc->g_dst = (satosin(sa))->sin_addr;
recompute:
#ifdef INET
if (sc->encap != NULL) {
encap_detach(sc->encap);
sc->encap = NULL;
}
#endif
if ((sc->g_src.s_addr != INADDR_ANY) &&
(sc->g_dst.s_addr != INADDR_ANY)) {
bzero(&sp, sizeof(sp));
bzero(&sm, sizeof(sm));
bzero(&dp, sizeof(dp));
bzero(&dm, sizeof(dm));
sp.sin_len = sm.sin_len = dp.sin_len = dm.sin_len =
sizeof(struct sockaddr_in);
sp.sin_family = sm.sin_family = dp.sin_family =
dm.sin_family = AF_INET;
sp.sin_addr = sc->g_src;
dp.sin_addr = sc->g_dst;
sm.sin_addr.s_addr = dm.sin_addr.s_addr =
INADDR_BROADCAST;
#ifdef INET
sc->encap = encap_attach(AF_INET, sc->g_proto,
sintosa(&sp), sintosa(&sm), sintosa(&dp),
sintosa(&dm), (sc->g_proto == IPPROTO_GRE) ?
&in_gre_protosw : &in_mobile_protosw, sc);
if (sc->encap == NULL)
printf("%s: unable to attach encap\n",
if_name(GRE2IFP(sc)));
#endif
if (sc->route.ro_rt != 0) /* free old route */
RTFREE(sc->route.ro_rt);
if (gre_compute_route(sc) == 0)
ifp->if_drv_flags |= IFF_DRV_RUNNING;
else
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}
break;
case GREGADDRS:
memset(&si, 0, sizeof(si));
si.sin_family = AF_INET;
si.sin_len = sizeof(struct sockaddr_in);
si.sin_addr.s_addr = sc->g_src.s_addr;
sa = sintosa(&si);
ifr->ifr_addr = *sa;
break;
case GREGADDRD:
memset(&si, 0, sizeof(si));
si.sin_family = AF_INET;
si.sin_len = sizeof(struct sockaddr_in);
si.sin_addr.s_addr = sc->g_dst.s_addr;
sa = sintosa(&si);
ifr->ifr_addr = *sa;
break;
case SIOCSIFPHYADDR:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0)
break;
if (aifr->ifra_addr.sin_family != AF_INET ||
aifr->ifra_dstaddr.sin_family != AF_INET) {
error = EAFNOSUPPORT;
break;
}
if (aifr->ifra_addr.sin_len != sizeof(si) ||
aifr->ifra_dstaddr.sin_len != sizeof(si)) {
error = EINVAL;
break;
}
sc->g_src = aifr->ifra_addr.sin_addr;
sc->g_dst = aifr->ifra_dstaddr.sin_addr;
goto recompute;
case SIOCSLIFPHYADDR:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0)
break;
if (lifr->addr.ss_family != AF_INET ||
lifr->dstaddr.ss_family != AF_INET) {
error = EAFNOSUPPORT;
break;
}
if (lifr->addr.ss_len != sizeof(si) ||
lifr->dstaddr.ss_len != sizeof(si)) {
error = EINVAL;
break;
}
sc->g_src = (satosin(&lifr->addr))->sin_addr;
sc->g_dst =
(satosin(&lifr->dstaddr))->sin_addr;
goto recompute;
case SIOCDIFPHYADDR:
/*
* XXXRW: Isn't this priv_check() redundant to the ifnet
* layer check?
*/
if ((error = priv_check(curthread, PRIV_NET_SETIFPHYS)) != 0)
break;
sc->g_src.s_addr = INADDR_ANY;
sc->g_dst.s_addr = INADDR_ANY;
goto recompute;
case SIOCGLIFPHYADDR:
if (sc->g_src.s_addr == INADDR_ANY ||
sc->g_dst.s_addr == INADDR_ANY) {
error = EADDRNOTAVAIL;
break;
}
memset(&si, 0, sizeof(si));
si.sin_family = AF_INET;
si.sin_len = sizeof(struct sockaddr_in);
si.sin_addr.s_addr = sc->g_src.s_addr;
memcpy(&lifr->addr, &si, sizeof(si));
si.sin_addr.s_addr = sc->g_dst.s_addr;
memcpy(&lifr->dstaddr, &si, sizeof(si));
break;
case SIOCGIFPSRCADDR:
#ifdef INET6
case SIOCGIFPSRCADDR_IN6:
#endif
if (sc->g_src.s_addr == INADDR_ANY) {
error = EADDRNOTAVAIL;
break;
}
memset(&si, 0, sizeof(si));
si.sin_family = AF_INET;
si.sin_len = sizeof(struct sockaddr_in);
si.sin_addr.s_addr = sc->g_src.s_addr;
bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr));
break;
case SIOCGIFPDSTADDR:
#ifdef INET6
case SIOCGIFPDSTADDR_IN6:
#endif
if (sc->g_dst.s_addr == INADDR_ANY) {
error = EADDRNOTAVAIL;
break;
}
memset(&si, 0, sizeof(si));
si.sin_family = AF_INET;
si.sin_len = sizeof(struct sockaddr_in);
si.sin_addr.s_addr = sc->g_dst.s_addr;
bcopy(&si, &ifr->ifr_addr, sizeof(ifr->ifr_addr));
break;
case GRESKEY:
error = priv_check(curthread, PRIV_NET_GRE);
if (error)
break;
error = copyin(ifr->ifr_data, &key, sizeof(key));
if (error)
break;
/* adjust MTU for option header */
if (key == 0 && sc->key != 0) /* clear */
adj += sizeof(key);
else if (key != 0 && sc->key == 0) /* set */
adj -= sizeof(key);
if (ifp->if_mtu + adj < 576) {
error = EINVAL;
break;
}
ifp->if_mtu += adj;
sc->key = key;
break;
case GREGKEY:
error = copyout(&sc->key, ifr->ifr_data, sizeof(sc->key));
break;
default:
error = EINVAL;
break;
}
splx(s);
return (error);
}
/*
* Computes a route to our destination that is not the one
* which would be taken by ip_output(), as that one will loop back to
* us. If the interface is p2p as a--->b, then a routing entry a-->b
* exists. If we now send a packet to b (e.g. ping b), it will come down
* here, get src=a, dst=b tacked on, and be sent by ip_output() right
* back to if_gre.
* The goal here is to compute a route to b that is less specific than
* a-->b. We know that such a route exists, since in normal operation we
* have at least a default route which matches.
*/
static int
gre_compute_route(struct gre_softc *sc)
{
struct route *ro;
ro = &sc->route;
memset(ro, 0, sizeof(struct route));
((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst;
ro->ro_dst.sa_family = AF_INET;
ro->ro_dst.sa_len = sizeof(ro->ro_dst);
/*
* Toggle the last bit, so that our own interface route is not found,
* but a less specific one is. I'd rather specify a shorter mask,
* but this is not possible. Should work, though. XXX
* XXX MRT Use a different FIB for the tunnel to solve this problem.
*/
if ((GRE2IFP(sc)->if_flags & IFF_LINK1) == 0) {
((struct sockaddr_in *)&ro->ro_dst)->sin_addr.s_addr ^=
htonl(0x01);
}
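/*
 * E.g. with a destination of 192.0.2.1 the lookup is done for
 * 192.0.2.0, which cannot match our own host route but will match
 * any covering prefix such as 192.0.2.0/24 or the default route.
 */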
#ifdef DIAGNOSTIC
printf("%s: searching for a route to %s", if_name(GRE2IFP(sc)),
inet_ntoa(((struct sockaddr_in *)&ro->ro_dst)->sin_addr));
#endif
rtalloc_fib(ro, sc->gre_fibnum);
/*
* check whether this returned a route at all and that the route
* does not recurse back to ourselves
*/
if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp->if_softc == sc) {
#ifdef DIAGNOSTIC
if (ro->ro_rt == NULL)
printf(" - no route found!\n");
else
printf(" - route loops back to ourself!\n");
#endif
return EADDRNOTAVAIL;
}
/*
* now change it back - else ip_output() will just drop
* the route and search for one to this interface ...
*/
if ((GRE2IFP(sc)->if_flags & IFF_LINK1) == 0)
((struct sockaddr_in *)&ro->ro_dst)->sin_addr = sc->g_dst;
#ifdef DIAGNOSTIC
printf(", choosing %s with gateway %s", if_name(ro->ro_rt->rt_ifp),
inet_ntoa(((struct sockaddr_in *)(ro->ro_rt->rt_gateway))->sin_addr));
printf("\n");
#endif
return 0;
}
/*
* Do a checksum of a flat buffer - much like in_cksum(), which
* operates on mbufs.
*/
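/*
 * Folding example: summing the words 0xffff and 0x0001 gives 0x10000;
 * the end-around carry folds this to 0x0001, and the one's complement
 * yields 0xfffe.
 */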
u_int16_t
gre_in_cksum(u_int16_t *p, u_int len)
{
u_int32_t sum = 0;
int nwords = len >> 1;
while (nwords-- != 0)
sum += *p++;
if (len & 1) {
union {
u_short w;
u_char c[2];
} u;
u.c[0] = *(u_char *)p;
u.c[1] = 0;
sum += u.w;
}
/* end-around-carry */
sum = (sum >> 16) + (sum & 0xffff);
sum += (sum >> 16);
return (~sum);
}
static int
gremodevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
greattach();
break;
case MOD_UNLOAD:
if_clone_detach(&gre_cloner);
mtx_destroy(&gre_mtx);
break;
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t gre_mod = {
"if_gre",
gremodevent,
0
};
DECLARE_MODULE(if_gre, gre_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_gre, 1);
Index: head/sys/net/if_loop.c
===================================================================
--- head/sys/net/if_loop.c (revision 183549)
+++ head/sys/net/if_loop.c (revision 183550)
@@ -1,369 +1,371 @@
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_loop.c 8.2 (Berkeley) 1/9/95
* $FreeBSD$
*/
/*
* Loopback interface driver for protocol testing and timing.
*/
#include "opt_atalk.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipx.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/bpf.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_var.h>
#endif
#ifdef IPX
#include <netipx/ipx.h>
#include <netipx/ipx_if.h>
#endif
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#endif
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#endif
#ifdef TINY_LOMTU
#define LOMTU (1024+512)
#elif defined(LARGE_LOMTU)
#define LOMTU 131072
#else
#define LOMTU 16384
#endif
int loioctl(struct ifnet *, u_long, caddr_t);
static void lortrequest(int, struct rtentry *, struct rt_addrinfo *);
int looutput(struct ifnet *ifp, struct mbuf *m,
struct sockaddr *dst, struct rtentry *rt);
static int lo_clone_create(struct if_clone *, int, caddr_t);
static void lo_clone_destroy(struct ifnet *);
struct ifnet *loif = NULL; /* Used externally */
IFC_SIMPLE_DECLARE(lo, 1);
static void
lo_clone_destroy(struct ifnet *ifp)
{
/* XXX: destroying lo0 will lead to panics. */
KASSERT(V_loif != ifp, ("%s: destroying lo0", __func__));
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
}
static int
lo_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
ifp = if_alloc(IFT_LOOP);
if (ifp == NULL)
return (ENOSPC);
if_initname(ifp, ifc->ifc_name, unit);
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
ifp->if_ioctl = loioctl;
ifp->if_output = looutput;
ifp->if_snd.ifq_maxlen = ifqmaxlen;
if_attach(ifp);
bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
if (V_loif == NULL)
V_loif = ifp;
return (0);
}
static int
loop_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
if_clone_attach(&lo_cloner);
break;
case MOD_UNLOAD:
printf("loop module unload - not possible for this module type\n");
return (EINVAL);
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t loop_mod = {
"loop",
loop_modevent,
0
};
DECLARE_MODULE(loop, loop_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
int
looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt)
{
u_int32_t af;
M_ASSERTPKTHDR(m); /* check if we have the packet header */
if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
m_freem(m);
return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
}
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
#if 1 /* XXX */
switch (dst->sa_family) {
case AF_INET:
case AF_INET6:
case AF_IPX:
case AF_APPLETALK:
break;
default:
printf("looutput: af=%d unexpected\n", dst->sa_family);
m_freem(m);
return (EAFNOSUPPORT);
}
#endif
return (if_simloop(ifp, m, dst->sa_family, 0));
}
/*
* if_simloop()
*
* This function supports software emulation of hardware loopback,
* i.e., for interfaces with the IFF_SIMPLEX attribute. Since they can't
* hear their own broadcasts, we create a copy of the packet that we
* would normally receive via a hardware loopback.
*
* This function expects the packet to include the media header of length hlen.
*/
int
if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen)
{
+ INIT_VNET_NET(ifp->if_vnet);
int isr;
M_ASSERTPKTHDR(m);
m_tag_delete_nonpersistent(m);
m->m_pkthdr.rcvif = ifp;
/*
* Let BPF see incoming packet in the following manner:
* - Emulated packet loopback for a simplex interface
* (net/if_ethersubr.c)
* -> passes it to ifp's BPF
* - IPv4/v6 multicast packet loopback (netinet(6)/ip(6)_output.c)
* -> does not pass it to any BPF
* - Normal packet loopback from myself to myself (net/if_loop.c)
* -> passes to lo0's BPF (even in case of IPv6, where ifp!=lo0)
*/
if (hlen > 0) {
if (bpf_peers_present(ifp->if_bpf)) {
bpf_mtap(ifp->if_bpf, m);
}
} else {
if (bpf_peers_present(V_loif->if_bpf)) {
if ((m->m_flags & M_MCAST) == 0 || V_loif == ifp) {
/* XXX beware sizeof(af) != 4 */
u_int32_t af1 = af;
/*
* We need to prepend the address family.
*/
bpf_mtap2(V_loif->if_bpf, &af1, sizeof(af1), m);
}
}
}
/* Strip away media header */
if (hlen > 0) {
m_adj(m, hlen);
#ifndef __NO_STRICT_ALIGNMENT
/*
* Some archs do not like unaligned data, so
* we move data down in the first mbuf.
*/
if (mtod(m, vm_offset_t) & 3) {
KASSERT(hlen >= 3, ("if_simloop: hlen too small"));
bcopy(m->m_data,
(char *)(mtod(m, vm_offset_t)
- (mtod(m, vm_offset_t) & 3)),
m->m_len);
m->m_data -= (mtod(m,vm_offset_t) & 3);
}
#endif
}
/* Deliver to upper layer protocol */
switch (af) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
m->m_flags |= M_LOOP;
isr = NETISR_IPV6;
break;
#endif
#ifdef IPX
case AF_IPX:
isr = NETISR_IPX;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
isr = NETISR_ATALK2;
break;
#endif
default:
printf("if_simloop: can't handle af=%d\n", af);
m_freem(m);
return (EAFNOSUPPORT);
}
ifp->if_ipackets++;
ifp->if_ibytes += m->m_pkthdr.len;
netisr_queue(isr, m); /* mbuf is free'd on failure. */
return (0);
}
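/*
 * Editorial note: looutput() above calls this with hlen == 0, while
 * the simplex-loopback path in net/if_ethersubr.c (mentioned in the
 * BPF comment above) hands over the frame with the Ethernet header
 * still attached, which is then stripped here via m_adj().
 */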
/* ARGSUSED */
static void
lortrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
{
RT_LOCK_ASSERT(rt);
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*
* Process an ioctl request.
*/
/* ARGSUSED */
int
loioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct ifaddr *ifa;
struct ifreq *ifr = (struct ifreq *)data;
int error = 0;
switch (cmd) {
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifa = (struct ifaddr *)data;
ifa->ifa_rtrequest = lortrequest;
/*
* Everything else is done at a higher level.
*/
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == 0) {
error = EAFNOSUPPORT; /* XXX */
break;
}
switch (ifr->ifr_addr.sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
case SIOCSIFMTU:
ifp->if_mtu = ifr->ifr_mtu;
break;
case SIOCSIFFLAGS:
break;
default:
error = EINVAL;
}
return (error);
}
Index: head/sys/net/if_mib.c
===================================================================
--- head/sys/net/if_mib.c (revision 183549)
+++ head/sys/net/if_mib.c (revision 183550)
@@ -1,163 +1,166 @@
/*-
* Copyright 1996 Massachusetts Institute of Technology
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby
* granted, provided that both the above copyright notice and this
* permission notice appear in all copies, that both the above
* copyright notice and this permission notice appear in all
* supporting documentation, and that the name of M.I.T. not be used
* in advertising or publicity pertaining to distribution of the
* software without specific, written prior permission. M.I.T. makes
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied
* warranty.
*
* THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
* ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
* SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_mib.h>
/*
* A sysctl(3) MIB for generic interface information. This information
* is exported in the net.link.generic branch, which has the following
* structure:
*
* net.link.generic .system - system-wide control variables
* and statistics (node)
* .ifdata.<ifindex>.general
* - what's in `struct ifdata'
* plus some other info
* .ifdata.<ifindex>.linkspecific
* - a link-type-specific data
* structure (as might be used
* by an SNMP agent)
*
* Perhaps someday we will make addresses accessible via this interface
* as well (then there will be four such...). The reason that the
* index comes before the last element in the name is because it
* seems more orthogonal that way, particularly with the possibility
* of other per-interface data living down here as well (e.g., integrated
* services stuff).
*/
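/*
 * Editorial sketch (not part of this file): a minimal userland reader
 * for the ifdata branch described above, following the MIB name layout
 * documented in if_mib(4).
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if_mib.h>
#include <stdio.h>

static int
print_ifmib(int ifindex)
{
	int name[6] = { CTL_NET, PF_LINK, NETLINK_GENERIC,
	    IFMIB_IFDATA, ifindex, IFDATA_GENERAL };
	struct ifmibdata ifmd;
	size_t len = sizeof(ifmd);

	if (sysctl(name, 6, &ifmd, &len, NULL, 0) == -1)
		return (-1);
	printf("%s: %lu ipackets, %lu opackets\n", ifmd.ifmd_name,
	    (u_long)ifmd.ifmd_data.ifi_ipackets,
	    (u_long)ifmd.ifmd_data.ifi_opackets);
	return (0);
}
#endif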
SYSCTL_DECL(_net_link_generic);
SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0,
"Variables global to all interfaces");
-SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD,
- &if_index, 0, "Number of configured interfaces");
+SYSCTL_V_INT(V_NET, vnet_net, _net_link_generic_system, IFMIB_IFCOUNT,
+ ifcount, CTLFLAG_RD, if_index, 0,
+ "Number of configured interfaces");
+
static int
sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */
{
+ INIT_VNET_NET(curvnet);
int *name = (int *)arg1;
int error;
u_int namelen = arg2;
struct ifnet *ifp;
struct ifmibdata ifmd;
size_t dlen;
char *dbuf;
if (namelen != 2)
return EINVAL;
if (name[0] <= 0 || name[0] > V_if_index ||
ifnet_byindex(name[0]) == NULL)
return ENOENT;
ifp = ifnet_byindex(name[0]);
switch(name[1]) {
default:
return ENOENT;
case IFDATA_GENERAL:
bzero(&ifmd, sizeof(ifmd));
strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name));
#define COPY(fld) ifmd.ifmd_##fld = ifp->if_##fld
COPY(pcount);
COPY(data);
#undef COPY
ifmd.ifmd_flags = ifp->if_flags | ifp->if_drv_flags;
ifmd.ifmd_snd_len = ifp->if_snd.ifq_len;
ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen;
ifmd.ifmd_snd_drops = ifp->if_snd.ifq_drops;
error = SYSCTL_OUT(req, &ifmd, sizeof ifmd);
if (error || !req->newptr)
return error;
error = SYSCTL_IN(req, &ifmd, sizeof ifmd);
if (error)
return error;
#define DONTCOPY(fld) ifmd.ifmd_data.ifi_##fld = ifp->if_data.ifi_##fld
DONTCOPY(type);
DONTCOPY(physical);
DONTCOPY(addrlen);
DONTCOPY(hdrlen);
DONTCOPY(mtu);
DONTCOPY(metric);
DONTCOPY(baudrate);
#undef DONTCOPY
#define COPY(fld) ifp->if_##fld = ifmd.ifmd_##fld
COPY(data);
ifp->if_snd.ifq_maxlen = ifmd.ifmd_snd_maxlen;
ifp->if_snd.ifq_drops = ifmd.ifmd_snd_drops;
#undef COPY
break;
case IFDATA_LINKSPECIFIC:
error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen);
if (error || !req->newptr)
return error;
error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen);
if (error)
return error;
break;
case IFDATA_DRIVERNAME:
/* 20 is enough for 64bit ints */
dlen = strlen(ifp->if_dname) + 20 + 1;
if ((dbuf = malloc(dlen, M_TEMP, M_NOWAIT)) == NULL)
return (ENOMEM);
if (ifp->if_dunit == IF_DUNIT_NONE)
strcpy(dbuf, ifp->if_dname);
else
sprintf(dbuf, "%s%d", ifp->if_dname, ifp->if_dunit);
error = SYSCTL_OUT(req, dbuf, strlen(dbuf) + 1);
if (error == 0 && req->newptr != NULL)
error = EPERM;
free(dbuf, M_TEMP);
return (error);
}
return 0;
}
SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW,
sysctl_ifdata, "Interface table");
Index: head/sys/net/if_spppsubr.c
===================================================================
--- head/sys/net/if_spppsubr.c (revision 183549)
+++ head/sys/net/if_spppsubr.c (revision 183550)
@@ -1,5418 +1,5419 @@
/*
* Synchronous PPP/Cisco/Frame Relay link level subroutines.
* Keepalive protocol implemented in both Cisco and PPP modes.
*/
/*-
* Copyright (C) 1994-2000 Cronyx Engineering.
* Author: Serge Vakulenko, <vak@cronyx.ru>
*
* Heavily revamped to conform to RFC 1661.
* Copyright (C) 1997, 2001 Joerg Wunsch.
*
* This software is distributed with NO WARRANTIES, not even the implied
* warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* Authors grant any other persons or organisations permission to use
* or modify this software as long as this message is kept with the software,
* all derivative works or modified versions.
*
* From: Version 2.4, Thu Apr 30 17:17:21 MSD 1997
*
* $FreeBSD$
*/
#include <sys/param.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipx.h"
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/random.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/vimage.h>
#include <sys/md5.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <net/slcompress.h>
#include <machine/stdarg.h>
#include <netinet/in_var.h>
#ifdef INET
#include <netinet/ip.h>
#include <netinet/tcp.h>
#endif
#ifdef INET6
#include <netinet6/scope6_var.h>
#endif
#include <netinet/if_ether.h>
#ifdef IPX
#include <netipx/ipx.h>
#include <netipx/ipx_if.h>
#endif
#include <net/if_sppp.h>
#define IOCTL_CMD_T u_long
#define MAXALIVECNT 3 /* max. alive packets */
/*
* Interface flags that can be set in an ifconfig command.
*
* Setting link0 will make the link passive, i.e. it will be marked
* as being administratively openable, but won't be opened to begin
* with. Incoming calls will be answered, or subsequent calls with
* -link1 will cause the administrative open of the LCP layer.
*
* Setting link1 will cause the link to auto-dial only as packets
* arrive to be sent.
*
* Setting IFF_DEBUG will syslog the option negotiation and state
* transitions at level kern.debug. Note: all logs consistently look
* like
*
* <if-name><unit>: <proto-name> <additional info...>
*
* with <if-name><unit> being something like "bppp0", and <proto-name>
* being one of "lcp", "ipcp", "cisco", "chap", "pap", etc.
*/
#define IFF_PASSIVE IFF_LINK0 /* wait passively for connection */
#define IFF_AUTO IFF_LINK1 /* auto-dial on output */
#define IFF_CISCO IFF_LINK2 /* use Cisco framing */
#define PPP_ALLSTATIONS 0xff /* All-Stations broadcast address */
#define PPP_UI 0x03 /* Unnumbered Information */
#define PPP_IP 0x0021 /* Internet Protocol */
#define PPP_ISO 0x0023 /* ISO OSI Protocol */
#define PPP_XNS 0x0025 /* Xerox NS Protocol */
#define PPP_IPX 0x002b /* Novell IPX Protocol */
#define PPP_VJ_COMP 0x002d /* VJ compressed TCP/IP */
#define PPP_VJ_UCOMP 0x002f /* VJ uncompressed TCP/IP */
#define PPP_IPV6 0x0057 /* Internet Protocol Version 6 */
#define PPP_LCP 0xc021 /* Link Control Protocol */
#define PPP_PAP 0xc023 /* Password Authentication Protocol */
#define PPP_CHAP 0xc223 /* Challenge-Handshake Auth Protocol */
#define PPP_IPCP 0x8021 /* Internet Protocol Control Protocol */
#define PPP_IPV6CP 0x8057 /* IPv6 Control Protocol */
#define CONF_REQ 1 /* PPP configure request */
#define CONF_ACK 2 /* PPP configure acknowledge */
#define CONF_NAK 3 /* PPP configure negative ack */
#define CONF_REJ 4 /* PPP configure reject */
#define TERM_REQ 5 /* PPP terminate request */
#define TERM_ACK 6 /* PPP terminate acknowledge */
#define CODE_REJ 7 /* PPP code reject */
#define PROTO_REJ 8 /* PPP protocol reject */
#define ECHO_REQ 9 /* PPP echo request */
#define ECHO_REPLY 10 /* PPP echo reply */
#define DISC_REQ 11 /* PPP discard request */
#define LCP_OPT_MRU 1 /* maximum receive unit */
#define LCP_OPT_ASYNC_MAP 2 /* async control character map */
#define LCP_OPT_AUTH_PROTO 3 /* authentication protocol */
#define LCP_OPT_QUAL_PROTO 4 /* quality protocol */
#define LCP_OPT_MAGIC 5 /* magic number */
#define LCP_OPT_RESERVED 6 /* reserved */
#define LCP_OPT_PROTO_COMP 7 /* protocol field compression */
#define LCP_OPT_ADDR_COMP 8 /* address/control field compression */
#define IPCP_OPT_ADDRESSES 1 /* both IP addresses; deprecated */
#define IPCP_OPT_COMPRESSION 2 /* IP compression protocol (VJ) */
#define IPCP_OPT_ADDRESS 3 /* local IP address */
#define IPV6CP_OPT_IFID 1 /* interface identifier */
#define IPV6CP_OPT_COMPRESSION 2 /* IPv6 compression protocol */
#define IPCP_COMP_VJ 0x2d /* Code for VJ compression */
#define PAP_REQ 1 /* PAP name/password request */
#define PAP_ACK 2 /* PAP acknowledge */
#define PAP_NAK 3 /* PAP fail */
#define CHAP_CHALLENGE 1 /* CHAP challenge request */
#define CHAP_RESPONSE 2 /* CHAP challenge response */
#define CHAP_SUCCESS 3 /* CHAP response ok */
#define CHAP_FAILURE 4 /* CHAP response failed */
#define CHAP_MD5 5 /* hash algorithm - MD5 */
#define CISCO_MULTICAST 0x8f /* Cisco multicast address */
#define CISCO_UNICAST 0x0f /* Cisco unicast address */
#define CISCO_KEEPALIVE 0x8035 /* Cisco keepalive protocol */
#define CISCO_ADDR_REQ 0 /* Cisco address request */
#define CISCO_ADDR_REPLY 1 /* Cisco address reply */
#define CISCO_KEEPALIVE_REQ 2 /* Cisco keepalive request */
/* states are named and numbered according to RFC 1661 */
#define STATE_INITIAL 0
#define STATE_STARTING 1
#define STATE_CLOSED 2
#define STATE_STOPPED 3
#define STATE_CLOSING 4
#define STATE_STOPPING 5
#define STATE_REQ_SENT 6
#define STATE_ACK_RCVD 7
#define STATE_ACK_SENT 8
#define STATE_OPENED 9
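/*
 * Editorial note: in the common successful case the automaton walks
 * INITIAL -(Up)-> CLOSED -(Open)-> REQ_SENT -(RCA)-> ACK_RCVD
 * -(RCR+)-> OPENED; see the RFC 1661 state transition table.
 */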
MALLOC_DEFINE(M_SPPP, "sppp", "synchronous PPP interface internals");
struct ppp_header {
u_char address;
u_char control;
u_short protocol;
} __packed;
#define PPP_HEADER_LEN sizeof (struct ppp_header)
struct lcp_header {
u_char type;
u_char ident;
u_short len;
} __packed;
#define LCP_HEADER_LEN sizeof (struct lcp_header)
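/*
 * Editorial note: on the wire the two headers sit back to back, so an
 * LCP Echo-Request begins ff 03 c0 21 09 <id> <len16> (PPP_ALLSTATIONS,
 * PPP_UI, PPP_LCP, ECHO_REQ).
 */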
struct cisco_packet {
u_long type;
u_long par1;
u_long par2;
u_short rel;
u_short time0;
u_short time1;
} __packed;
#define CISCO_PACKET_LEN sizeof (struct cisco_packet)
/*
* We follow the spelling and capitalization of RFC 1661 here, to make
* it easier to compare with the standard.  Please refer to this RFC in
* case you can't make sense of these abbreviations; it will also
* explain the semantics related to the various events and actions.
*/
struct cp {
u_short proto; /* PPP control protocol number */
u_char protoidx; /* index into state table in struct sppp */
u_char flags;
#define CP_LCP 0x01 /* this is the LCP */
#define CP_AUTH 0x02 /* this is an authentication protocol */
#define CP_NCP 0x04 /* this is a NCP */
#define CP_QUAL 0x08 /* this is a quality reporting protocol */
const char *name; /* name of this control protocol */
/* event handlers */
void (*Up)(struct sppp *sp);
void (*Down)(struct sppp *sp);
void (*Open)(struct sppp *sp);
void (*Close)(struct sppp *sp);
void (*TO)(void *sp);
int (*RCR)(struct sppp *sp, struct lcp_header *h, int len);
void (*RCN_rej)(struct sppp *sp, struct lcp_header *h, int len);
void (*RCN_nak)(struct sppp *sp, struct lcp_header *h, int len);
/* actions */
void (*tlu)(struct sppp *sp);
void (*tld)(struct sppp *sp);
void (*tls)(struct sppp *sp);
void (*tlf)(struct sppp *sp);
void (*scr)(struct sppp *sp);
};
#define SPP_FMT "%s: "
#define SPP_ARGS(ifp) (ifp)->if_xname
#define SPPP_LOCK(sp) \
do { \
if (!(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT)) \
mtx_lock (&(sp)->mtx); \
} while (0)
#define SPPP_UNLOCK(sp) \
do { \
if (!(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT)) \
mtx_unlock (&(sp)->mtx); \
} while (0)
#define SPPP_LOCK_ASSERT(sp) \
do { \
if (!(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT)) \
mtx_assert (&(sp)->mtx, MA_OWNED); \
} while (0)
#define SPPP_LOCK_OWNED(sp) \
(!(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) && \
mtx_owned (&sp->mtx))
#ifdef INET
/*
* The following disgusting hack gets around the problem that IP TOS
* can't be set yet. We want to put "interactive" traffic on a high
* priority queue. To decide if traffic is interactive, we check that
* a) it is TCP and b) one of its ports is telnet, rlogin or ftp control.
*
* XXX is this really still necessary? - joerg -
*/
static const u_short interactive_ports[8] = {
0, 513, 0, 0,
0, 21, 0, 23,
};
#define INTERACTIVE(p) (interactive_ports[(p) & 7] == (p))
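/*
 * Example: 23 & 7 == 7 and interactive_ports[7] == 23, so telnet
 * qualifies; 1037 & 7 == 5 but interactive_ports[5] == 21 != 1037,
 * so ports that merely alias into the table are rejected.
 */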
#endif
/* almost every function needs these */
#define STDDCL \
struct ifnet *ifp = SP2IFP(sp); \
int debug = ifp->if_flags & IFF_DEBUG
static int sppp_output(struct ifnet *ifp, struct mbuf *m,
struct sockaddr *dst, struct rtentry *rt);
static void sppp_cisco_send(struct sppp *sp, int type, long par1, long par2);
static void sppp_cisco_input(struct sppp *sp, struct mbuf *m);
static void sppp_cp_input(const struct cp *cp, struct sppp *sp,
struct mbuf *m);
static void sppp_cp_send(struct sppp *sp, u_short proto, u_char type,
u_char ident, u_short len, void *data);
/* static void sppp_cp_timeout(void *arg); */
static void sppp_cp_change_state(const struct cp *cp, struct sppp *sp,
int newstate);
static void sppp_auth_send(const struct cp *cp,
struct sppp *sp, unsigned int type, unsigned int id,
...);
static void sppp_up_event(const struct cp *cp, struct sppp *sp);
static void sppp_down_event(const struct cp *cp, struct sppp *sp);
static void sppp_open_event(const struct cp *cp, struct sppp *sp);
static void sppp_close_event(const struct cp *cp, struct sppp *sp);
static void sppp_to_event(const struct cp *cp, struct sppp *sp);
static void sppp_null(struct sppp *sp);
static void sppp_pp_up(struct sppp *sp);
static void sppp_pp_down(struct sppp *sp);
static void sppp_lcp_init(struct sppp *sp);
static void sppp_lcp_up(struct sppp *sp);
static void sppp_lcp_down(struct sppp *sp);
static void sppp_lcp_open(struct sppp *sp);
static void sppp_lcp_close(struct sppp *sp);
static void sppp_lcp_TO(void *sp);
static int sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_lcp_tlu(struct sppp *sp);
static void sppp_lcp_tld(struct sppp *sp);
static void sppp_lcp_tls(struct sppp *sp);
static void sppp_lcp_tlf(struct sppp *sp);
static void sppp_lcp_scr(struct sppp *sp);
static void sppp_lcp_check_and_close(struct sppp *sp);
static int sppp_ncp_check(struct sppp *sp);
static void sppp_ipcp_init(struct sppp *sp);
static void sppp_ipcp_up(struct sppp *sp);
static void sppp_ipcp_down(struct sppp *sp);
static void sppp_ipcp_open(struct sppp *sp);
static void sppp_ipcp_close(struct sppp *sp);
static void sppp_ipcp_TO(void *sp);
static int sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_ipcp_tlu(struct sppp *sp);
static void sppp_ipcp_tld(struct sppp *sp);
static void sppp_ipcp_tls(struct sppp *sp);
static void sppp_ipcp_tlf(struct sppp *sp);
static void sppp_ipcp_scr(struct sppp *sp);
static void sppp_ipv6cp_init(struct sppp *sp);
static void sppp_ipv6cp_up(struct sppp *sp);
static void sppp_ipv6cp_down(struct sppp *sp);
static void sppp_ipv6cp_open(struct sppp *sp);
static void sppp_ipv6cp_close(struct sppp *sp);
static void sppp_ipv6cp_TO(void *sp);
static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len);
static void sppp_ipv6cp_tlu(struct sppp *sp);
static void sppp_ipv6cp_tld(struct sppp *sp);
static void sppp_ipv6cp_tls(struct sppp *sp);
static void sppp_ipv6cp_tlf(struct sppp *sp);
static void sppp_ipv6cp_scr(struct sppp *sp);
static void sppp_pap_input(struct sppp *sp, struct mbuf *m);
static void sppp_pap_init(struct sppp *sp);
static void sppp_pap_open(struct sppp *sp);
static void sppp_pap_close(struct sppp *sp);
static void sppp_pap_TO(void *sp);
static void sppp_pap_my_TO(void *sp);
static void sppp_pap_tlu(struct sppp *sp);
static void sppp_pap_tld(struct sppp *sp);
static void sppp_pap_scr(struct sppp *sp);
static void sppp_chap_input(struct sppp *sp, struct mbuf *m);
static void sppp_chap_init(struct sppp *sp);
static void sppp_chap_open(struct sppp *sp);
static void sppp_chap_close(struct sppp *sp);
static void sppp_chap_TO(void *sp);
static void sppp_chap_tlu(struct sppp *sp);
static void sppp_chap_tld(struct sppp *sp);
static void sppp_chap_scr(struct sppp *sp);
static const char *sppp_auth_type_name(u_short proto, u_char type);
static const char *sppp_cp_type_name(u_char type);
static const char *sppp_dotted_quad(u_long addr);
static const char *sppp_ipcp_opt_name(u_char opt);
#ifdef INET6
static const char *sppp_ipv6cp_opt_name(u_char opt);
#endif
static const char *sppp_lcp_opt_name(u_char opt);
static const char *sppp_phase_name(enum ppp_phase phase);
static const char *sppp_proto_name(u_short proto);
static const char *sppp_state_name(int state);
static int sppp_params(struct sppp *sp, u_long cmd, void *data);
static int sppp_strnlen(u_char *p, int max);
static void sppp_keepalive(void *dummy);
static void sppp_phase_network(struct sppp *sp);
static void sppp_print_bytes(const u_char *p, u_short len);
static void sppp_print_string(const char *p, u_short len);
static void sppp_qflush(struct ifqueue *ifq);
static void sppp_set_ip_addr(struct sppp *sp, u_long src);
#ifdef INET6
static void sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src,
struct in6_addr *dst, struct in6_addr *srcmask);
#ifdef IPV6CP_MYIFID_DYN
static void sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src);
static void sppp_gen_ip6_addr(struct sppp *sp, const struct in6_addr *src);
#endif
static void sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *src);
#endif
/* if_start () wrapper */
static void sppp_ifstart (struct ifnet *ifp);
/* our control protocol descriptors */
static const struct cp lcp = {
PPP_LCP, IDX_LCP, CP_LCP, "lcp",
sppp_lcp_up, sppp_lcp_down, sppp_lcp_open, sppp_lcp_close,
sppp_lcp_TO, sppp_lcp_RCR, sppp_lcp_RCN_rej, sppp_lcp_RCN_nak,
sppp_lcp_tlu, sppp_lcp_tld, sppp_lcp_tls, sppp_lcp_tlf,
sppp_lcp_scr
};
static const struct cp ipcp = {
PPP_IPCP, IDX_IPCP,
#ifdef INET /* don't run IPCP if there's no IPv4 support */
CP_NCP,
#else
0,
#endif
"ipcp",
sppp_ipcp_up, sppp_ipcp_down, sppp_ipcp_open, sppp_ipcp_close,
sppp_ipcp_TO, sppp_ipcp_RCR, sppp_ipcp_RCN_rej, sppp_ipcp_RCN_nak,
sppp_ipcp_tlu, sppp_ipcp_tld, sppp_ipcp_tls, sppp_ipcp_tlf,
sppp_ipcp_scr
};
static const struct cp ipv6cp = {
PPP_IPV6CP, IDX_IPV6CP,
#ifdef INET6 /* don't run IPv6CP if there's no IPv6 support */
CP_NCP,
#else
0,
#endif
"ipv6cp",
sppp_ipv6cp_up, sppp_ipv6cp_down, sppp_ipv6cp_open, sppp_ipv6cp_close,
sppp_ipv6cp_TO, sppp_ipv6cp_RCR, sppp_ipv6cp_RCN_rej, sppp_ipv6cp_RCN_nak,
sppp_ipv6cp_tlu, sppp_ipv6cp_tld, sppp_ipv6cp_tls, sppp_ipv6cp_tlf,
sppp_ipv6cp_scr
};
static const struct cp pap = {
PPP_PAP, IDX_PAP, CP_AUTH, "pap",
sppp_null, sppp_null, sppp_pap_open, sppp_pap_close,
sppp_pap_TO, 0, 0, 0,
sppp_pap_tlu, sppp_pap_tld, sppp_null, sppp_null,
sppp_pap_scr
};
static const struct cp chap = {
PPP_CHAP, IDX_CHAP, CP_AUTH, "chap",
sppp_null, sppp_null, sppp_chap_open, sppp_chap_close,
sppp_chap_TO, 0, 0, 0,
sppp_chap_tlu, sppp_chap_tld, sppp_null, sppp_null,
sppp_chap_scr
};
static const struct cp *cps[IDX_COUNT] = {
&lcp, /* IDX_LCP */
&ipcp, /* IDX_IPCP */
&ipv6cp, /* IDX_IPV6CP */
&pap, /* IDX_PAP */
&chap, /* IDX_CHAP */
};
static void*
sppp_alloc(u_char type, struct ifnet *ifp)
{
struct sppp *sp;
sp = malloc(sizeof(struct sppp), M_SPPP, M_WAITOK | M_ZERO);
sp->pp_ifp = ifp;
return (sp);
}
static void
sppp_free(void *com, u_char type)
{
free(com, M_SPPP);
}
static int
sppp_modevent(module_t mod, int type, void *unused)
{
switch (type) {
case MOD_LOAD:
/*
* XXX: should probably be IFT_SPPP, but it's fairly
* harmless to allocate struct sppp's for non-sppp
* interfaces.
*/
if_register_com_alloc(IFT_PPP, sppp_alloc, sppp_free);
break;
case MOD_UNLOAD:
/* if_deregister_com_alloc(IFT_PPP); */
return EACCES;
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t spppmod = {
"sppp",
sppp_modevent,
0
};
MODULE_VERSION(sppp, 1);
DECLARE_MODULE(sppp, spppmod, SI_SUB_DRIVERS, SI_ORDER_ANY);
/*
* Exported functions, comprising our interface to the lower layer.
*/
/*
* Process the received packet.
*/
void
sppp_input(struct ifnet *ifp, struct mbuf *m)
{
struct ppp_header *h;
int isr = -1;
struct sppp *sp = IFP2SP(ifp);
u_char *iphdr;
int hlen, vjlen, do_account = 0;
int debug;
SPPP_LOCK(sp);
debug = ifp->if_flags & IFF_DEBUG;
if (ifp->if_flags & IFF_UP)
/* Count received bytes, add FCS and one flag */
ifp->if_ibytes += m->m_pkthdr.len + 3;
if (m->m_pkthdr.len <= PPP_HEADER_LEN) {
/* Too small packet, drop it. */
if (debug)
log(LOG_DEBUG,
SPP_FMT "input packet is too small, %d bytes\n",
SPP_ARGS(ifp), m->m_pkthdr.len);
drop:
m_freem (m);
SPPP_UNLOCK(sp);
drop2:
++ifp->if_ierrors;
++ifp->if_iqdrops;
return;
}
if (sp->pp_mode == PP_FR) {
sppp_fr_input (sp, m);
SPPP_UNLOCK(sp);
return;
}
/* Get PPP header. */
h = mtod (m, struct ppp_header*);
m_adj (m, PPP_HEADER_LEN);
switch (h->address) {
case PPP_ALLSTATIONS:
if (h->control != PPP_UI)
goto invalid;
if (sp->pp_mode == IFF_CISCO) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "PPP packet in Cisco mode "
"<addr=0x%x ctrl=0x%x proto=0x%x>\n",
SPP_ARGS(ifp),
h->address, h->control, ntohs(h->protocol));
goto drop;
}
switch (ntohs (h->protocol)) {
default:
if (debug)
log(LOG_DEBUG,
SPP_FMT "rejecting protocol "
"<addr=0x%x ctrl=0x%x proto=0x%x>\n",
SPP_ARGS(ifp),
h->address, h->control, ntohs(h->protocol));
if (sp->state[IDX_LCP] == STATE_OPENED)
sppp_cp_send (sp, PPP_LCP, PROTO_REJ,
++sp->pp_seq[IDX_LCP], m->m_pkthdr.len + 2,
&h->protocol);
++ifp->if_noproto;
goto drop;
case PPP_LCP:
sppp_cp_input(&lcp, sp, m);
m_freem (m);
SPPP_UNLOCK(sp);
return;
case PPP_PAP:
if (sp->pp_phase >= PHASE_AUTHENTICATE)
sppp_pap_input(sp, m);
m_freem (m);
SPPP_UNLOCK(sp);
return;
case PPP_CHAP:
if (sp->pp_phase >= PHASE_AUTHENTICATE)
sppp_chap_input(sp, m);
m_freem (m);
SPPP_UNLOCK(sp);
return;
#ifdef INET
case PPP_IPCP:
if (sp->pp_phase == PHASE_NETWORK)
sppp_cp_input(&ipcp, sp, m);
m_freem (m);
SPPP_UNLOCK(sp);
return;
case PPP_IP:
if (sp->state[IDX_IPCP] == STATE_OPENED) {
isr = NETISR_IP;
}
do_account++;
break;
case PPP_VJ_COMP:
if (sp->state[IDX_IPCP] == STATE_OPENED) {
if ((vjlen =
sl_uncompress_tcp_core(mtod(m, u_char *),
m->m_len, m->m_len,
TYPE_COMPRESSED_TCP,
sp->pp_comp,
&iphdr, &hlen)) <= 0) {
if (debug)
log(LOG_INFO,
SPP_FMT "VJ uncompress failed on compressed packet\n",
SPP_ARGS(ifp));
goto drop;
}
/*
* Trim the VJ header off the packet, and prepend
* the uncompressed IP header (which will usually
* end up in two chained mbufs since there's not
* enough leading space in the existing mbuf).
*/
m_adj(m, vjlen);
M_PREPEND(m, hlen, M_DONTWAIT);
if (m == NULL) {
SPPP_UNLOCK(sp);
goto drop2;
}
bcopy(iphdr, mtod(m, u_char *), hlen);
isr = NETISR_IP;
}
do_account++;
break;
case PPP_VJ_UCOMP:
if (sp->state[IDX_IPCP] == STATE_OPENED) {
if (sl_uncompress_tcp_core(mtod(m, u_char *),
m->m_len, m->m_len,
TYPE_UNCOMPRESSED_TCP,
sp->pp_comp,
&iphdr, &hlen) != 0) {
if (debug)
log(LOG_INFO,
SPP_FMT "VJ uncompress failed on uncompressed packet\n",
SPP_ARGS(ifp));
goto drop;
}
isr = NETISR_IP;
}
do_account++;
break;
#endif
#ifdef INET6
case PPP_IPV6CP:
if (sp->pp_phase == PHASE_NETWORK)
sppp_cp_input(&ipv6cp, sp, m);
m_freem (m);
SPPP_UNLOCK(sp);
return;
case PPP_IPV6:
if (sp->state[IDX_IPV6CP] == STATE_OPENED)
isr = NETISR_IPV6;
do_account++;
break;
#endif
#ifdef IPX
case PPP_IPX:
/* IPX IPXCP not implemented yet */
if (sp->pp_phase == PHASE_NETWORK)
isr = NETISR_IPX;
do_account++;
break;
#endif
}
break;
case CISCO_MULTICAST:
case CISCO_UNICAST:
/* Don't check the control field here (RFC 1547). */
if (sp->pp_mode != IFF_CISCO) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "Cisco packet in PPP mode "
"<addr=0x%x ctrl=0x%x proto=0x%x>\n",
SPP_ARGS(ifp),
h->address, h->control, ntohs(h->protocol));
goto drop;
}
switch (ntohs (h->protocol)) {
default:
++ifp->if_noproto;
goto invalid;
case CISCO_KEEPALIVE:
sppp_cisco_input (sp, m);
m_freem (m);
SPPP_UNLOCK(sp);
return;
#ifdef INET
case ETHERTYPE_IP:
isr = NETISR_IP;
do_account++;
break;
#endif
#ifdef INET6
case ETHERTYPE_IPV6:
isr = NETISR_IPV6;
do_account++;
break;
#endif
#ifdef IPX
case ETHERTYPE_IPX:
isr = NETISR_IPX;
do_account++;
break;
#endif
}
break;
default: /* Invalid PPP packet. */
invalid:
if (debug)
log(LOG_DEBUG,
SPP_FMT "invalid input packet "
"<addr=0x%x ctrl=0x%x proto=0x%x>\n",
SPP_ARGS(ifp),
h->address, h->control, ntohs(h->protocol));
goto drop;
}
if (! (ifp->if_flags & IFF_UP) || isr == -1)
goto drop;
SPPP_UNLOCK(sp);
/* Check queue. */
if (netisr_queue(isr, m)) { /* (0) on success. */
if (debug)
log(LOG_DEBUG, SPP_FMT "protocol queue overflow\n",
SPP_ARGS(ifp));
goto drop2;
}
if (do_account)
/*
 * Only account for network packets, not for control
 * packets.  This is used by some subsystems to detect
 * idle lines.
 */
sp->pp_last_recv = time_uptime;
}
static void
sppp_ifstart_sched(void *dummy)
{
struct sppp *sp = dummy;
sp->if_start(SP2IFP(sp));
}
/* if_start() wrapper function.  We use it to schedule the real
* if_start() for execution via a callout, since we cannot call it
* directly while the sppp lock is held.
*/
static void
sppp_ifstart(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
if (SPPP_LOCK_OWNED(sp)) {
if (callout_pending(&sp->ifstart_callout))
return;
callout_reset(&sp->ifstart_callout, 1, sppp_ifstart_sched,
(void *)sp);
} else {
sp->if_start(ifp);
}
}
/*
* Enqueue transmit packet.
*/
static int
sppp_output(struct ifnet *ifp, struct mbuf *m,
struct sockaddr *dst, struct rtentry *rt)
{
struct sppp *sp = IFP2SP(ifp);
struct ppp_header *h;
struct ifqueue *ifq = NULL;
int s, error, rv = 0;
int ipproto = PPP_IP;
int debug = ifp->if_flags & IFF_DEBUG;
s = splimp();
SPPP_LOCK(sp);
if (!(ifp->if_flags & IFF_UP) ||
(!(ifp->if_flags & IFF_AUTO) &&
!(ifp->if_drv_flags & IFF_DRV_RUNNING))) {
#ifdef INET6
drop:
#endif
m_freem (m);
SPPP_UNLOCK(sp);
splx (s);
return (ENETDOWN);
}
if ((ifp->if_flags & IFF_AUTO) &&
!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
#ifdef INET6
/*
* XXX
*
* Hack to prevent the initialization-time generated
* IPv6 multicast packet from erroneously causing a
* dialout event in case IPv6 has been
* administratively disabled on that interface.
*/
if (dst->sa_family == AF_INET6 &&
!(sp->confflags & CONF_ENABLE_IPV6))
goto drop;
#endif
/*
* Interface is not yet running, but auto-dial. Need
* to start LCP for it.
*/
ifp->if_drv_flags |= IFF_DRV_RUNNING;
splx(s);
lcp.Open(sp);
s = splimp();
}
#ifdef INET
if (dst->sa_family == AF_INET) {
/* XXX Check mbuf length here? */
struct ip *ip = mtod (m, struct ip*);
struct tcphdr *tcp = (struct tcphdr*) ((long*)ip + ip->ip_hl);
/*
 * When using dynamic local IP address assignment by using
 * 0.0.0.0 as the local address, the first TCP session will
 * not connect: the local TCP checksum is computed using
 * 0.0.0.0, which will later become our real IP address, so
 * the TCP checksum verified at the remote end will turn out
 * invalid.  Therefore we
 * - don't let packets with src ip addr 0 through, and
 * - flag TCP packets with src ip 0 as an error.
 */
if(ip->ip_src.s_addr == INADDR_ANY) /* -hm */
{
m_freem(m);
SPPP_UNLOCK(sp);
splx(s);
if(ip->ip_p == IPPROTO_TCP)
return(EADDRNOTAVAIL);
else
return(0);
}
/*
* Put low delay, telnet, rlogin and ftp control packets
* in front of the queue, or let ALTQ take care of them.
*/
if (ALTQ_IS_ENABLED(&ifp->if_snd))
;
else if (_IF_QFULL(&sp->pp_fastq))
;
else if (ip->ip_tos & IPTOS_LOWDELAY)
ifq = &sp->pp_fastq;
else if (m->m_len < sizeof *ip + sizeof *tcp)
;
else if (ip->ip_p != IPPROTO_TCP)
;
else if (INTERACTIVE (ntohs (tcp->th_sport)))
ifq = &sp->pp_fastq;
else if (INTERACTIVE (ntohs (tcp->th_dport)))
ifq = &sp->pp_fastq;
/*
* Do IP Header compression
*/
if (sp->pp_mode != IFF_CISCO && sp->pp_mode != PP_FR &&
(sp->ipcp.flags & IPCP_VJ) && ip->ip_p == IPPROTO_TCP)
switch (sl_compress_tcp(m, ip, sp->pp_comp,
sp->ipcp.compress_cid)) {
case TYPE_COMPRESSED_TCP:
ipproto = PPP_VJ_COMP;
break;
case TYPE_UNCOMPRESSED_TCP:
ipproto = PPP_VJ_UCOMP;
break;
case TYPE_IP:
ipproto = PPP_IP;
break;
default:
m_freem(m);
SPPP_UNLOCK(sp);
splx(s);
return (EINVAL);
}
}
#endif
#ifdef INET6
if (dst->sa_family == AF_INET6) {
/* XXX do something tricky here? */
}
#endif
if (sp->pp_mode == PP_FR) {
/* Add frame relay header. */
m = sppp_fr_header (sp, m, dst->sa_family);
if (! m)
goto nobufs;
goto out;
}
/*
* Prepend general data packet PPP header. For now, IP only.
*/
M_PREPEND (m, PPP_HEADER_LEN, M_DONTWAIT);
if (! m) {
nobufs: if (debug)
log(LOG_DEBUG, SPP_FMT "no memory for transmit header\n",
SPP_ARGS(ifp));
++ifp->if_oerrors;
SPPP_UNLOCK(sp);
splx (s);
return (ENOBUFS);
}
/*
* May want to check the size of the packet here
* (though with the current implementation it is always enough).
*/
h = mtod (m, struct ppp_header*);
if (sp->pp_mode == IFF_CISCO) {
h->address = CISCO_UNICAST; /* unicast address */
h->control = 0;
} else {
h->address = PPP_ALLSTATIONS; /* broadcast address */
h->control = PPP_UI; /* Unnumbered Info */
}
switch (dst->sa_family) {
#ifdef INET
case AF_INET: /* Internet Protocol */
if (sp->pp_mode == IFF_CISCO)
h->protocol = htons (ETHERTYPE_IP);
else {
/*
* Don't choke with an ENETDOWN early. It's
* possible that we just started dialing out,
* so don't drop the packet immediately. If
* we notice that we run out of buffer space
* below, we will however remember that we are
* not ready to carry IP packets, and return
* ENETDOWN, as opposed to ENOBUFS.
*/
h->protocol = htons(ipproto);
if (sp->state[IDX_IPCP] != STATE_OPENED)
rv = ENETDOWN;
}
break;
#endif
#ifdef INET6
case AF_INET6: /* Internet Protocol */
if (sp->pp_mode == IFF_CISCO)
h->protocol = htons (ETHERTYPE_IPV6);
else {
/*
* Don't choke with an ENETDOWN early. It's
* possible that we just started dialing out,
* so don't drop the packet immediately. If
* we notice that we run out of buffer space
* below, we will however remember that we are
* not ready to carry IP packets, and return
* ENETDOWN, as opposed to ENOBUFS.
*/
h->protocol = htons(PPP_IPV6);
if (sp->state[IDX_IPV6CP] != STATE_OPENED)
rv = ENETDOWN;
}
break;
#endif
#ifdef IPX
case AF_IPX: /* Novell IPX Protocol */
h->protocol = htons (sp->pp_mode == IFF_CISCO ?
ETHERTYPE_IPX : PPP_IPX);
break;
#endif
default:
m_freem (m);
++ifp->if_oerrors;
SPPP_UNLOCK(sp);
splx (s);
return (EAFNOSUPPORT);
}
/*
* Queue message on interface, and start output if interface
* not yet active.
*/
out:
if (ifq != NULL)
error = !(IF_HANDOFF_ADJ(ifq, m, ifp, 3));
else
IFQ_HANDOFF_ADJ(ifp, m, 3, error);
if (error) {
++ifp->if_oerrors;
SPPP_UNLOCK(sp);
splx (s);
return (rv? rv: ENOBUFS);
}
SPPP_UNLOCK(sp);
splx (s);
/*
* Unlike in sppp_input(), we can always bump the timestamp
* here since sppp_output() is only called on behalf of
* network-layer traffic; control-layer traffic is handled
* by sppp_cp_send().
*/
sp->pp_last_sent = time_uptime;
return (0);
}
void
sppp_attach(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
/* Initialize mtx lock */
mtx_init(&sp->mtx, "sppp", MTX_NETWORK_LOCK, MTX_DEF | MTX_RECURSE);
/* Initialize keepalive handler. */
callout_init(&sp->keepalive_callout,
(ifp->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive,
(void *)sp);
ifp->if_mtu = PP_MTU;
ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
ifp->if_output = sppp_output;
#if 0
sp->pp_flags = PP_KEEPALIVE;
#endif
ifp->if_snd.ifq_maxlen = 32;
sp->pp_fastq.ifq_maxlen = 32;
sp->pp_cpq.ifq_maxlen = 20;
sp->pp_loopcnt = 0;
sp->pp_alivecnt = 0;
bzero(&sp->pp_seq[0], sizeof(sp->pp_seq));
bzero(&sp->pp_rseq[0], sizeof(sp->pp_rseq));
sp->pp_phase = PHASE_DEAD;
sp->pp_up = sppp_pp_up;
sp->pp_down = sppp_pp_down;
if(!mtx_initialized(&sp->pp_cpq.ifq_mtx))
mtx_init(&sp->pp_cpq.ifq_mtx, "sppp_cpq", NULL, MTX_DEF);
if(!mtx_initialized(&sp->pp_fastq.ifq_mtx))
mtx_init(&sp->pp_fastq.ifq_mtx, "sppp_fastq", NULL, MTX_DEF);
sp->pp_last_recv = sp->pp_last_sent = time_uptime;
sp->confflags = 0;
#ifdef INET
sp->confflags |= CONF_ENABLE_VJ;
#endif
#ifdef INET6
sp->confflags |= CONF_ENABLE_IPV6;
#endif
callout_init(&sp->ifstart_callout,
(ifp->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
sp->if_start = ifp->if_start;
ifp->if_start = sppp_ifstart;
sp->pp_comp = malloc(sizeof(struct slcompress), M_TEMP, M_WAITOK);
sl_compress_init(sp->pp_comp, -1);
sppp_lcp_init(sp);
sppp_ipcp_init(sp);
sppp_ipv6cp_init(sp);
sppp_pap_init(sp);
sppp_chap_init(sp);
}
void
sppp_detach(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
int i;
KASSERT(mtx_initialized(&sp->mtx), ("sppp mutex is not initialized"));
/* Stop keepalive handler. */
if (!callout_drain(&sp->keepalive_callout))
callout_stop(&sp->keepalive_callout);
for (i = 0; i < IDX_COUNT; i++) {
if (!callout_drain(&sp->ch[i]))
callout_stop(&sp->ch[i]);
}
if (!callout_drain(&sp->pap_my_to_ch))
callout_stop(&sp->pap_my_to_ch);
mtx_destroy(&sp->pp_cpq.ifq_mtx);
mtx_destroy(&sp->pp_fastq.ifq_mtx);
mtx_destroy(&sp->mtx);
}
/*
* Flush the interface output queue.
*/
static void
sppp_flush_unlocked(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
sppp_qflush ((struct ifqueue *)&SP2IFP(sp)->if_snd);
sppp_qflush (&sp->pp_fastq);
sppp_qflush (&sp->pp_cpq);
}
void
sppp_flush(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
SPPP_LOCK(sp);
sppp_flush_unlocked (ifp);
SPPP_UNLOCK(sp);
}
/*
* Check if the output queue is empty.
*/
int
sppp_isempty(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
int empty, s;
s = splimp();
SPPP_LOCK(sp);
empty = !sp->pp_fastq.ifq_head && !sp->pp_cpq.ifq_head &&
!SP2IFP(sp)->if_snd.ifq_head;
SPPP_UNLOCK(sp);
splx(s);
return (empty);
}
/*
* Get next packet to send.
*/
struct mbuf *
sppp_dequeue(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
struct mbuf *m;
int s;
s = splimp();
SPPP_LOCK(sp);
/*
 * Process only the control protocol queue until we have at
 * least one NCP open.
 *
 * Always serve all three queues in Cisco and Frame Relay modes.
 */
IF_DEQUEUE(&sp->pp_cpq, m);
if (m == NULL &&
(sppp_ncp_check(sp) || sp->pp_mode == IFF_CISCO ||
sp->pp_mode == PP_FR)) {
IF_DEQUEUE(&sp->pp_fastq, m);
if (m == NULL)
IF_DEQUEUE (&SP2IFP(sp)->if_snd, m);
}
SPPP_UNLOCK(sp);
splx(s);
return m;
}
/*
* Pick the next packet, do not remove it from the queue.
*/
struct mbuf *
sppp_pick(struct ifnet *ifp)
{
struct sppp *sp = IFP2SP(ifp);
struct mbuf *m;
int s;
s = splimp ();
SPPP_LOCK(sp);
m = sp->pp_cpq.ifq_head;
if (m == NULL &&
(sp->pp_phase == PHASE_NETWORK ||
sp->pp_mode == IFF_CISCO ||
sp->pp_mode == PP_FR))
if ((m = sp->pp_fastq.ifq_head) == NULL)
m = SP2IFP(sp)->if_snd.ifq_head;
SPPP_UNLOCK(sp);
splx (s);
return (m);
}
/*
* Process an ioctl request. Called on low priority level.
*/
int
sppp_ioctl(struct ifnet *ifp, IOCTL_CMD_T cmd, void *data)
{
struct ifreq *ifr = (struct ifreq*) data;
struct sppp *sp = IFP2SP(ifp);
int s, rv, going_up, going_down, newmode;
s = splimp();
SPPP_LOCK(sp);
rv = 0;
switch (cmd) {
case SIOCAIFADDR:
case SIOCSIFDSTADDR:
break;
case SIOCSIFADDR:
/* set the interface "up" when assigning an IP address */
ifp->if_flags |= IFF_UP;
/* FALLTHROUGH */
case SIOCSIFFLAGS:
going_up = ifp->if_flags & IFF_UP &&
(ifp->if_drv_flags & IFF_DRV_RUNNING) == 0;
going_down = (ifp->if_flags & IFF_UP) == 0 &&
ifp->if_drv_flags & IFF_DRV_RUNNING;
newmode = ifp->if_flags & IFF_PASSIVE;
if (!newmode)
newmode = ifp->if_flags & IFF_AUTO;
if (!newmode)
newmode = ifp->if_flags & IFF_CISCO;
ifp->if_flags &= ~(IFF_PASSIVE | IFF_AUTO | IFF_CISCO);
ifp->if_flags |= newmode;
if (!newmode)
newmode = sp->pp_flags & PP_FR;
if (newmode != sp->pp_mode) {
going_down = 1;
if (!going_up)
going_up = ifp->if_drv_flags & IFF_DRV_RUNNING;
}
if (going_down) {
if (sp->pp_mode != IFF_CISCO &&
sp->pp_mode != PP_FR)
lcp.Close(sp);
else if (sp->pp_tlf)
(sp->pp_tlf)(sp);
sppp_flush_unlocked(ifp);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
sp->pp_mode = newmode;
}
if (going_up) {
if (sp->pp_mode != IFF_CISCO &&
sp->pp_mode != PP_FR)
lcp.Close(sp);
sp->pp_mode = newmode;
if (sp->pp_mode == 0) {
ifp->if_drv_flags |= IFF_DRV_RUNNING;
lcp.Open(sp);
}
if ((sp->pp_mode == IFF_CISCO) ||
(sp->pp_mode == PP_FR)) {
if (sp->pp_tls)
(sp->pp_tls)(sp);
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
}
break;
#ifdef SIOCSIFMTU
#ifndef ifr_mtu
#define ifr_mtu ifr_metric
#endif
case SIOCSIFMTU:
if (ifr->ifr_mtu < 128 || ifr->ifr_mtu > sp->lcp.their_mru) {
rv = EINVAL; /* don't return with the lock held */
break;
}
ifp->if_mtu = ifr->ifr_mtu;
break;
#endif
#ifdef SLIOCSETMTU
case SLIOCSETMTU:
if (*(short*)data < 128 || *(short*)data > sp->lcp.their_mru) {
rv = EINVAL; /* don't return with the lock held */
break;
}
ifp->if_mtu = *(short*)data;
break;
#endif
#ifdef SIOCGIFMTU
case SIOCGIFMTU:
ifr->ifr_mtu = ifp->if_mtu;
break;
#endif
#ifdef SLIOCGETMTU
case SLIOCGETMTU:
*(short*)data = ifp->if_mtu;
break;
#endif
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
case SIOCGIFGENERIC:
case SIOCSIFGENERIC:
rv = sppp_params(sp, cmd, data);
break;
default:
rv = ENOTTY;
}
SPPP_UNLOCK(sp);
splx(s);
return rv;
}
/*
* Cisco framing implementation.
*/
/*
* Handle incoming Cisco keepalive protocol packets.
*/
static void
sppp_cisco_input(struct sppp *sp, struct mbuf *m)
{
STDDCL;
struct cisco_packet *h;
u_long me, mymask;
if (m->m_pkthdr.len < CISCO_PACKET_LEN) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "cisco invalid packet length: %d bytes\n",
SPP_ARGS(ifp), m->m_pkthdr.len);
return;
}
h = mtod (m, struct cisco_packet*);
if (debug)
log(LOG_DEBUG,
SPP_FMT "cisco input: %d bytes "
"<0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n",
SPP_ARGS(ifp), m->m_pkthdr.len,
(u_long)ntohl (h->type), (u_long)h->par1, (u_long)h->par2, (u_int)h->rel,
(u_int)h->time0, (u_int)h->time1);
switch (ntohl (h->type)) {
default:
if (debug)
log(-1, SPP_FMT "cisco unknown packet type: 0x%lx\n",
SPP_ARGS(ifp), (u_long)ntohl (h->type));
break;
case CISCO_ADDR_REPLY:
/* Reply on address request, ignore */
break;
case CISCO_KEEPALIVE_REQ:
sp->pp_alivecnt = 0;
sp->pp_rseq[IDX_LCP] = ntohl (h->par1);
if (sp->pp_seq[IDX_LCP] == sp->pp_rseq[IDX_LCP]) {
/* Local and remote sequence numbers are equal.
* Probably, the line is in loopback mode. */
if (sp->pp_loopcnt >= MAXALIVECNT) {
printf (SPP_FMT "loopback\n",
SPP_ARGS(ifp));
sp->pp_loopcnt = 0;
if (ifp->if_flags & IFF_UP) {
if_down (ifp);
sppp_qflush (&sp->pp_cpq);
}
}
++sp->pp_loopcnt;
/* Generate new local sequence number */
sp->pp_seq[IDX_LCP] = random();
break;
}
sp->pp_loopcnt = 0;
if (! (ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
if_up(ifp);
printf (SPP_FMT "up\n", SPP_ARGS(ifp));
}
break;
case CISCO_ADDR_REQ:
sppp_get_ip_addrs(sp, &me, 0, &mymask);
if (me != 0L)
sppp_cisco_send(sp, CISCO_ADDR_REPLY, me, mymask);
break;
}
}
/*
* Send Cisco keepalive packet.
*/
static void
sppp_cisco_send(struct sppp *sp, int type, long par1, long par2)
{
STDDCL;
struct ppp_header *h;
struct cisco_packet *ch;
struct mbuf *m;
struct timeval tv;
getmicrouptime(&tv);
MGETHDR (m, M_DONTWAIT, MT_DATA);
if (! m)
return;
m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + CISCO_PACKET_LEN;
m->m_pkthdr.rcvif = 0;
h = mtod (m, struct ppp_header*);
h->address = CISCO_MULTICAST;
h->control = 0;
h->protocol = htons (CISCO_KEEPALIVE);
ch = (struct cisco_packet*) (h + 1);
ch->type = htonl (type);
ch->par1 = htonl (par1);
ch->par2 = htonl (par2);
ch->rel = -1;
ch->time0 = htons ((u_short) (tv.tv_sec >> 16));
ch->time1 = htons ((u_short) tv.tv_sec);
if (debug)
log(LOG_DEBUG,
SPP_FMT "cisco output: <0x%lx 0x%lx 0x%lx 0x%x 0x%x-0x%x>\n",
SPP_ARGS(ifp), (u_long)ntohl (ch->type), (u_long)ch->par1,
(u_long)ch->par2, (u_int)ch->rel, (u_int)ch->time0, (u_int)ch->time1);
if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3))
ifp->if_oerrors++;
}
/*
* PPP protocol implementation.
*/
/*
* Send PPP control protocol packet.
*/
static void
sppp_cp_send(struct sppp *sp, u_short proto, u_char type,
u_char ident, u_short len, void *data)
{
STDDCL;
struct ppp_header *h;
struct lcp_header *lh;
struct mbuf *m;
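/*
 * Clamp the payload so that the PPP and LCP headers plus data
 * still fit into the single mbuf allocated below.
 */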
if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN)
len = MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN;
MGETHDR (m, M_DONTWAIT, MT_DATA);
if (! m)
return;
m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len;
m->m_pkthdr.rcvif = 0;
h = mtod (m, struct ppp_header*);
h->address = PPP_ALLSTATIONS; /* broadcast address */
h->control = PPP_UI; /* Unnumbered Info */
h->protocol = htons (proto); /* Link Control Protocol */
lh = (struct lcp_header*) (h + 1);
lh->type = type;
lh->ident = ident;
lh->len = htons (LCP_HEADER_LEN + len);
if (len)
bcopy (data, lh+1, len);
if (debug) {
log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d",
SPP_ARGS(ifp),
sppp_proto_name(proto),
sppp_cp_type_name (lh->type), lh->ident,
ntohs (lh->len));
sppp_print_bytes ((u_char*) (lh+1), len);
log(-1, ">\n");
}
if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3))
ifp->if_oerrors++;
}
/*
* Handle incoming PPP control protocol packets.
*/
static void
sppp_cp_input(const struct cp *cp, struct sppp *sp, struct mbuf *m)
{
STDDCL;
struct lcp_header *h;
int len = m->m_pkthdr.len;
int rv;
u_char *p;
if (len < 4) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "%s invalid packet length: %d bytes\n",
SPP_ARGS(ifp), cp->name, len);
return;
}
h = mtod (m, struct lcp_header*);
if (debug) {
log(LOG_DEBUG,
SPP_FMT "%s input(%s): <%s id=0x%x len=%d",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]),
sppp_cp_type_name (h->type), h->ident, ntohs (h->len));
sppp_print_bytes ((u_char*) (h+1), len-4);
log(-1, ">\n");
}
if (len > ntohs (h->len))
len = ntohs (h->len);
p = (u_char *)(h + 1);
switch (h->type) {
case CONF_REQ:
if (len < 4) {
if (debug)
log(-1, SPP_FMT "%s invalid conf-req length %d\n",
SPP_ARGS(ifp), cp->name,
len);
++ifp->if_ierrors;
break;
}
/* handle states where RCR doesn't get a SCA/SCN */
switch (sp->state[cp->protoidx]) {
case STATE_CLOSING:
case STATE_STOPPING:
return;
case STATE_CLOSED:
sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident,
0, 0);
return;
}
rv = (cp->RCR)(sp, h, len);
switch (sp->state[cp->protoidx]) {
case STATE_OPENED:
(cp->tld)(sp);
(cp->scr)(sp);
/* FALLTHROUGH */
case STATE_ACK_SENT:
case STATE_REQ_SENT:
/*
 * sppp_cp_change_state() has the side effect of
 * restarting the timeouts.  We want to avoid that
 * if the state doesn't change; otherwise we won't
 * ever time out and resend a configuration request
 * that got lost.
 */
if (sp->state[cp->protoidx] == (rv ? STATE_ACK_SENT:
STATE_REQ_SENT))
break;
sppp_cp_change_state(cp, sp, rv?
STATE_ACK_SENT: STATE_REQ_SENT);
break;
case STATE_STOPPED:
sp->rst_counter[cp->protoidx] = sp->lcp.max_configure;
(cp->scr)(sp);
sppp_cp_change_state(cp, sp, rv?
STATE_ACK_SENT: STATE_REQ_SENT);
break;
case STATE_ACK_RCVD:
if (rv) {
sppp_cp_change_state(cp, sp, STATE_OPENED);
if (debug)
log(LOG_DEBUG, SPP_FMT "%s tlu\n",
SPP_ARGS(ifp),
cp->name);
(cp->tlu)(sp);
} else
sppp_cp_change_state(cp, sp, STATE_ACK_RCVD);
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
case CONF_ACK:
if (h->ident != sp->confid[cp->protoidx]) {
if (debug)
log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n",
SPP_ARGS(ifp), cp->name,
h->ident, sp->confid[cp->protoidx]);
++ifp->if_ierrors;
break;
}
switch (sp->state[cp->protoidx]) {
case STATE_CLOSED:
case STATE_STOPPED:
sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0);
break;
case STATE_CLOSING:
case STATE_STOPPING:
break;
case STATE_REQ_SENT:
sp->rst_counter[cp->protoidx] = sp->lcp.max_configure;
sppp_cp_change_state(cp, sp, STATE_ACK_RCVD);
break;
case STATE_OPENED:
(cp->tld)(sp);
/* FALLTHROUGH */
case STATE_ACK_RCVD:
(cp->scr)(sp);
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
case STATE_ACK_SENT:
sp->rst_counter[cp->protoidx] = sp->lcp.max_configure;
sppp_cp_change_state(cp, sp, STATE_OPENED);
if (debug)
log(LOG_DEBUG, SPP_FMT "%s tlu\n",
SPP_ARGS(ifp), cp->name);
(cp->tlu)(sp);
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
case CONF_NAK:
case CONF_REJ:
if (h->ident != sp->confid[cp->protoidx]) {
if (debug)
log(-1, SPP_FMT "%s id mismatch 0x%x != 0x%x\n",
SPP_ARGS(ifp), cp->name,
h->ident, sp->confid[cp->protoidx]);
++ifp->if_ierrors;
break;
}
if (h->type == CONF_NAK)
(cp->RCN_nak)(sp, h, len);
else /* CONF_REJ */
(cp->RCN_rej)(sp, h, len);
switch (sp->state[cp->protoidx]) {
case STATE_CLOSED:
case STATE_STOPPED:
sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0);
break;
case STATE_REQ_SENT:
case STATE_ACK_SENT:
sp->rst_counter[cp->protoidx] = sp->lcp.max_configure;
/*
* Slow things down a bit if we think we might be
* in loopback. Depend on the timeout to send the
* next configuration request.
*/
if (sp->pp_loopcnt)
break;
(cp->scr)(sp);
break;
case STATE_OPENED:
(cp->tld)(sp);
/* FALLTHROUGH */
case STATE_ACK_RCVD:
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
(cp->scr)(sp);
break;
case STATE_CLOSING:
case STATE_STOPPING:
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
case TERM_REQ:
switch (sp->state[cp->protoidx]) {
case STATE_ACK_RCVD:
case STATE_ACK_SENT:
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
/* FALLTHROUGH */
case STATE_CLOSED:
case STATE_STOPPED:
case STATE_CLOSING:
case STATE_STOPPING:
case STATE_REQ_SENT:
sta:
/* Send Terminate-Ack packet. */
if (debug)
log(LOG_DEBUG, SPP_FMT "%s send terminate-ack\n",
SPP_ARGS(ifp), cp->name);
sppp_cp_send(sp, cp->proto, TERM_ACK, h->ident, 0, 0);
break;
case STATE_OPENED:
(cp->tld)(sp);
sp->rst_counter[cp->protoidx] = 0;
sppp_cp_change_state(cp, sp, STATE_STOPPING);
goto sta;
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
case TERM_ACK:
switch (sp->state[cp->protoidx]) {
case STATE_CLOSED:
case STATE_STOPPED:
case STATE_REQ_SENT:
case STATE_ACK_SENT:
break;
case STATE_CLOSING:
sppp_cp_change_state(cp, sp, STATE_CLOSED);
(cp->tlf)(sp);
break;
case STATE_STOPPING:
sppp_cp_change_state(cp, sp, STATE_STOPPED);
(cp->tlf)(sp);
break;
case STATE_ACK_RCVD:
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
case STATE_OPENED:
(cp->tld)(sp);
(cp->scr)(sp);
sppp_cp_change_state(cp, sp, STATE_ACK_RCVD);
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
case CODE_REJ:
/* XXX catastrophic rejects (RXJ-) aren't handled yet. */
log(LOG_INFO,
SPP_FMT "%s: ignoring RXJ (%s) for proto 0x%x, "
"danger will robinson\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type), ntohs(*((u_short *)p)));
switch (sp->state[cp->protoidx]) {
case STATE_CLOSED:
case STATE_STOPPED:
case STATE_REQ_SENT:
case STATE_ACK_SENT:
case STATE_CLOSING:
case STATE_STOPPING:
case STATE_OPENED:
break;
case STATE_ACK_RCVD:
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
case PROTO_REJ:
{
int catastrophic;
const struct cp *upper;
int i;
u_int16_t proto;
catastrophic = 0;
upper = NULL;
proto = ntohs(*((u_int16_t *)p));
for (i = 0; i < IDX_COUNT; i++) {
if (cps[i]->proto == proto) {
upper = cps[i];
break;
}
}
if (upper == NULL)
catastrophic++;
if (catastrophic || debug)
log(catastrophic? LOG_INFO: LOG_DEBUG,
SPP_FMT "%s: RXJ%c (%s) for proto 0x%x (%s/%s)\n",
SPP_ARGS(ifp), cp->name, catastrophic ? '-' : '+',
sppp_cp_type_name(h->type), proto,
upper ? upper->name : "unknown",
upper ? sppp_state_name(sp->state[upper->protoidx]) : "?");
/*
* If we got RXJ+ against conf-req, the peer does not implement
* this particular protocol type.  Terminate the protocol.
*/
if (upper && !catastrophic) {
if (sp->state[upper->protoidx] == STATE_REQ_SENT) {
upper->Close(sp);
break;
}
}
/* XXX catastrophic rejects (RXJ-) aren't handled yet. */
switch (sp->state[cp->protoidx]) {
case STATE_CLOSED:
case STATE_STOPPED:
case STATE_REQ_SENT:
case STATE_ACK_SENT:
case STATE_CLOSING:
case STATE_STOPPING:
case STATE_OPENED:
break;
case STATE_ACK_RCVD:
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
default:
printf(SPP_FMT "%s illegal %s in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_cp_type_name(h->type),
sppp_state_name(sp->state[cp->protoidx]));
++ifp->if_ierrors;
}
break;
}
case DISC_REQ:
if (cp->proto != PPP_LCP)
goto illegal;
/* Discard the packet. */
break;
case ECHO_REQ:
if (cp->proto != PPP_LCP)
goto illegal;
if (sp->state[cp->protoidx] != STATE_OPENED) {
if (debug)
log(-1, SPP_FMT "lcp echo req but lcp closed\n",
SPP_ARGS(ifp));
++ifp->if_ierrors;
break;
}
if (len < 8) {
if (debug)
log(-1, SPP_FMT "invalid lcp echo request "
"packet length: %d bytes\n",
SPP_ARGS(ifp), len);
break;
}
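/*
* The four octets following the header carry the sender's
* magic number (RFC 1661 echo-request format).  Receiving
* our own magic back means the line is looped.
*/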
if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) &&
ntohl (*(u_int32_t *)(h+1)) == sp->lcp.magic) {
/* Line loopback mode detected. */
printf(SPP_FMT "loopback\n", SPP_ARGS(ifp));
sp->pp_loopcnt = MAXALIVECNT * 5;
if_down (ifp);
sppp_qflush (&sp->pp_cpq);
/* Shut down the PPP link. */
/* XXX */
lcp.Down(sp);
lcp.Up(sp);
break;
}
*(u_int32_t *)(h+1) = htonl (sp->lcp.magic);
if (debug)
log(-1, SPP_FMT "got lcp echo req, sending echo rep\n",
SPP_ARGS(ifp));
sppp_cp_send (sp, PPP_LCP, ECHO_REPLY, h->ident, len-4, h+1);
break;
case ECHO_REPLY:
if (cp->proto != PPP_LCP)
goto illegal;
if (h->ident != sp->lcp.echoid) {
++ifp->if_ierrors;
break;
}
if (len < 8) {
if (debug)
log(-1, SPP_FMT "lcp invalid echo reply "
"packet length: %d bytes\n",
SPP_ARGS(ifp), len);
break;
}
if (debug)
log(-1, SPP_FMT "lcp got echo rep\n",
SPP_ARGS(ifp));
if (!(sp->lcp.opts & (1 << LCP_OPT_MAGIC)) ||
ntohl (*(u_int32_t *)(h+1)) != sp->lcp.magic)
sp->pp_alivecnt = 0;
break;
default:
/* Unknown packet type -- send Code-Reject packet. */
illegal:
if (debug)
log(-1, SPP_FMT "%s send code-rej for 0x%x\n",
SPP_ARGS(ifp), cp->name, h->type);
sppp_cp_send(sp, cp->proto, CODE_REJ,
++sp->pp_seq[cp->protoidx], m->m_pkthdr.len, h);
++ifp->if_ierrors;
}
}
/*
* The generic part of all Up/Down/Open/Close/TO event handlers.
* Basically, the state transition handling in the automaton.
*/
static void
sppp_up_event(const struct cp *cp, struct sppp *sp)
{
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "%s up(%s)\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]));
switch (sp->state[cp->protoidx]) {
case STATE_INITIAL:
sppp_cp_change_state(cp, sp, STATE_CLOSED);
break;
case STATE_STARTING:
sp->rst_counter[cp->protoidx] = sp->lcp.max_configure;
(cp->scr)(sp);
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
default:
printf(SPP_FMT "%s illegal up in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]));
}
}
static void
sppp_down_event(const struct cp *cp, struct sppp *sp)
{
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "%s down(%s)\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]));
switch (sp->state[cp->protoidx]) {
case STATE_CLOSED:
case STATE_CLOSING:
sppp_cp_change_state(cp, sp, STATE_INITIAL);
break;
case STATE_STOPPED:
sppp_cp_change_state(cp, sp, STATE_STARTING);
(cp->tls)(sp);
break;
case STATE_STOPPING:
case STATE_REQ_SENT:
case STATE_ACK_RCVD:
case STATE_ACK_SENT:
sppp_cp_change_state(cp, sp, STATE_STARTING);
break;
case STATE_OPENED:
(cp->tld)(sp);
sppp_cp_change_state(cp, sp, STATE_STARTING);
break;
default:
printf(SPP_FMT "%s illegal down in state %s\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]));
}
}
static void
sppp_open_event(const struct cp *cp, struct sppp *sp)
{
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "%s open(%s)\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]));
switch (sp->state[cp->protoidx]) {
case STATE_INITIAL:
sppp_cp_change_state(cp, sp, STATE_STARTING);
(cp->tls)(sp);
break;
case STATE_STARTING:
break;
case STATE_CLOSED:
sp->rst_counter[cp->protoidx] = sp->lcp.max_configure;
(cp->scr)(sp);
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
case STATE_STOPPED:
/*
* Try escaping stopped state. This seems to bite
* people occasionally, in particular for IPCP,
* presumably following previous IPCP negotiation
* aborts. Somehow, we must have missed a Down event
* which would have caused a transition into starting
* state, so as a band-aid we force the Down event now.
* This effectively implements (something like the)
* `restart' option mentioned in the state transition
* table of RFC 1661.
*/
sppp_cp_change_state(cp, sp, STATE_STARTING);
(cp->tls)(sp);
break;
case STATE_STOPPING:
case STATE_REQ_SENT:
case STATE_ACK_RCVD:
case STATE_ACK_SENT:
case STATE_OPENED:
break;
case STATE_CLOSING:
sppp_cp_change_state(cp, sp, STATE_STOPPING);
break;
}
}
static void
sppp_close_event(const struct cp *cp, struct sppp *sp)
{
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "%s close(%s)\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]));
switch (sp->state[cp->protoidx]) {
case STATE_INITIAL:
case STATE_CLOSED:
case STATE_CLOSING:
break;
case STATE_STARTING:
sppp_cp_change_state(cp, sp, STATE_INITIAL);
(cp->tlf)(sp);
break;
case STATE_STOPPED:
sppp_cp_change_state(cp, sp, STATE_CLOSED);
break;
case STATE_STOPPING:
sppp_cp_change_state(cp, sp, STATE_CLOSING);
break;
case STATE_OPENED:
(cp->tld)(sp);
/* FALLTHROUGH */
case STATE_REQ_SENT:
case STATE_ACK_RCVD:
case STATE_ACK_SENT:
sp->rst_counter[cp->protoidx] = sp->lcp.max_terminate;
sppp_cp_send(sp, cp->proto, TERM_REQ,
++sp->pp_seq[cp->protoidx], 0, 0);
sppp_cp_change_state(cp, sp, STATE_CLOSING);
break;
}
}
static void
sppp_to_event(const struct cp *cp, struct sppp *sp)
{
STDDCL;
int s;
s = splimp();
SPPP_LOCK(sp);
if (debug)
log(LOG_DEBUG, SPP_FMT "%s TO(%s) rst_counter = %d\n",
SPP_ARGS(ifp), cp->name,
sppp_state_name(sp->state[cp->protoidx]),
sp->rst_counter[cp->protoidx]);
if (--sp->rst_counter[cp->protoidx] < 0)
/* TO- event */
switch (sp->state[cp->protoidx]) {
case STATE_CLOSING:
sppp_cp_change_state(cp, sp, STATE_CLOSED);
(cp->tlf)(sp);
break;
case STATE_STOPPING:
sppp_cp_change_state(cp, sp, STATE_STOPPED);
(cp->tlf)(sp);
break;
case STATE_REQ_SENT:
case STATE_ACK_RCVD:
case STATE_ACK_SENT:
sppp_cp_change_state(cp, sp, STATE_STOPPED);
(cp->tlf)(sp);
break;
}
else
/* TO+ event */
switch (sp->state[cp->protoidx]) {
case STATE_CLOSING:
case STATE_STOPPING:
sppp_cp_send(sp, cp->proto, TERM_REQ,
++sp->pp_seq[cp->protoidx], 0, 0);
callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout,
cp->TO, (void *)sp);
break;
case STATE_REQ_SENT:
case STATE_ACK_RCVD:
(cp->scr)(sp);
/* sppp_cp_change_state() will restart the timer */
sppp_cp_change_state(cp, sp, STATE_REQ_SENT);
break;
case STATE_ACK_SENT:
(cp->scr)(sp);
callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout,
cp->TO, (void *)sp);
break;
}
SPPP_UNLOCK(sp);
splx(s);
}
/*
* Change the state of a control protocol in the state automaton.
* Takes care of starting/stopping the restart timer.
*/
static void
sppp_cp_change_state(const struct cp *cp, struct sppp *sp, int newstate)
{
sp->state[cp->protoidx] = newstate;
callout_stop (&sp->ch[cp->protoidx]);
switch (newstate) {
case STATE_INITIAL:
case STATE_STARTING:
case STATE_CLOSED:
case STATE_STOPPED:
case STATE_OPENED:
break;
case STATE_CLOSING:
case STATE_STOPPING:
case STATE_REQ_SENT:
case STATE_ACK_RCVD:
case STATE_ACK_SENT:
callout_reset(&sp->ch[cp->protoidx], sp->lcp.timeout,
cp->TO, (void *)sp);
break;
}
}
/*
*--------------------------------------------------------------------------*
* *
* The LCP implementation. *
* *
*--------------------------------------------------------------------------*
*/
static void
sppp_pp_up(struct sppp *sp)
{
SPPP_LOCK(sp);
lcp.Up(sp);
SPPP_UNLOCK(sp);
}
static void
sppp_pp_down(struct sppp *sp)
{
SPPP_LOCK(sp);
lcp.Down(sp);
SPPP_UNLOCK(sp);
}
static void
sppp_lcp_init(struct sppp *sp)
{
sp->lcp.opts = (1 << LCP_OPT_MAGIC);
sp->lcp.magic = 0;
sp->state[IDX_LCP] = STATE_INITIAL;
sp->fail_counter[IDX_LCP] = 0;
sp->pp_seq[IDX_LCP] = 0;
sp->pp_rseq[IDX_LCP] = 0;
sp->lcp.protos = 0;
sp->lcp.mru = sp->lcp.their_mru = PP_MTU;
/* Note that these values are relevant for all control protocols */
sp->lcp.timeout = 3 * hz;
sp->lcp.max_terminate = 2;
sp->lcp.max_configure = 10;
sp->lcp.max_failure = 10;
callout_init(&sp->ch[IDX_LCP],
(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
}
static void
sppp_lcp_up(struct sppp *sp)
{
STDDCL;
sp->pp_alivecnt = 0;
sp->lcp.opts = (1 << LCP_OPT_MAGIC);
sp->lcp.magic = 0;
sp->lcp.protos = 0;
sp->lcp.mru = sp->lcp.their_mru = PP_MTU;
/*
* If we are authenticator, negotiate LCP_AUTH
*/
if (sp->hisauth.proto != 0)
sp->lcp.opts |= (1 << LCP_OPT_AUTH_PROTO);
else
sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO);
sp->pp_flags &= ~PP_NEEDAUTH;
/*
* If this interface is passive or dial-on-demand, and we are
* still in Initial state, it means we've got an incoming
* call. Activate the interface.
*/
if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) != 0) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "Up event", SPP_ARGS(ifp));
ifp->if_drv_flags |= IFF_DRV_RUNNING;
if (sp->state[IDX_LCP] == STATE_INITIAL) {
if (debug)
log(-1, "(incoming call)\n");
sp->pp_flags |= PP_CALLIN;
lcp.Open(sp);
} else if (debug)
log(-1, "\n");
} else if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0 &&
(sp->state[IDX_LCP] == STATE_INITIAL)) {
ifp->if_drv_flags |= IFF_DRV_RUNNING;
lcp.Open(sp);
}
sppp_up_event(&lcp, sp);
}
static void
sppp_lcp_down(struct sppp *sp)
{
STDDCL;
sppp_down_event(&lcp, sp);
/*
* If this is neither a dial-on-demand nor a passive
* interface, simulate an ``ifconfig down'' action, so the
* administrator can force a redial by another ``ifconfig
* up''. XXX For leased line operation, should we immediately
* try to reopen the connection here?
*/
if ((ifp->if_flags & (IFF_AUTO | IFF_PASSIVE)) == 0) {
log(LOG_INFO,
SPP_FMT "Down event, taking interface down.\n",
SPP_ARGS(ifp));
if_down(ifp);
} else {
if (debug)
log(LOG_DEBUG,
SPP_FMT "Down event (carrier loss)\n",
SPP_ARGS(ifp));
sp->pp_flags &= ~PP_CALLIN;
if (sp->state[IDX_LCP] != STATE_INITIAL)
lcp.Close(sp);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
}
}
static void
sppp_lcp_open(struct sppp *sp)
{
sppp_open_event(&lcp, sp);
}
static void
sppp_lcp_close(struct sppp *sp)
{
sppp_close_event(&lcp, sp);
}
static void
sppp_lcp_TO(void *cookie)
{
sppp_to_event(&lcp, (struct sppp *)cookie);
}
/*
* Analyze a configure request. Return true if it was agreeable, and
* caused action sca, false if it has been rejected or nak'ed, and
* caused action scn. (The return value is used to make the state
* transition decision in the state automaton.)
*/
static int
sppp_lcp_RCR(struct sppp *sp, struct lcp_header *h, int len)
{
STDDCL;
u_char *buf, *r, *p;
int origlen, rlen;
u_long nmagic;
u_short authproto;
len -= 4;
origlen = len;
buf = r = malloc (len, M_TEMP, M_NOWAIT);
if (! buf)
return (0);
if (debug)
log(LOG_DEBUG, SPP_FMT "lcp parse opts: ",
SPP_ARGS(ifp));
/* pass 1: check for things that need to be rejected */
p = (void*) (h+1);
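/*
* Each option is a TLV (RFC 1661): p[0] is the type, p[1] the
* total option length including the two header octets, so the
* loop advances by p[1] and stops on a truncated option.
*/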
for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1];
len-=p[1], p+=p[1]) {
if (debug)
log(-1, " %s ", sppp_lcp_opt_name(*p));
switch (*p) {
case LCP_OPT_MAGIC:
/* Magic number. */
if (len >= 6 && p[1] == 6)
continue;
if (debug)
log(-1, "[invalid] ");
break;
case LCP_OPT_ASYNC_MAP:
/* Async control character map. */
if (len >= 6 && p[1] == 6)
continue;
if (debug)
log(-1, "[invalid] ");
break;
case LCP_OPT_MRU:
/* Maximum receive unit. */
if (len >= 4 && p[1] == 4)
continue;
if (debug)
log(-1, "[invalid] ");
break;
case LCP_OPT_AUTH_PROTO:
if (len < 4) {
if (debug)
log(-1, "[invalid] ");
break;
}
authproto = (p[2] << 8) + p[3];
if (authproto == PPP_CHAP && p[1] != 5) {
if (debug)
log(-1, "[invalid chap len] ");
break;
}
if (sp->myauth.proto == 0) {
/* we are not configured to do auth */
if (debug)
log(-1, "[not configured] ");
break;
}
/*
* The remote wants us to authenticate; remember
* this, so we stay in PHASE_AUTHENTICATE after
* LCP has come up.
*/
sp->pp_flags |= PP_NEEDAUTH;
continue;
default:
/* Others not supported. */
if (debug)
log(-1, "[rej] ");
break;
}
/* Add the option to rejected list. */
bcopy (p, r, p[1]);
r += p[1];
rlen += p[1];
}
if (rlen) {
if (debug)
log(-1, " send conf-rej\n");
sppp_cp_send (sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf);
return 0;
} else if (debug)
log(-1, "\n");
/*
* pass 2: check for option values that are unacceptable and
* thus require to be nak'ed.
*/
if (debug)
log(LOG_DEBUG, SPP_FMT "lcp parse opt values: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
len = origlen;
for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1];
len-=p[1], p+=p[1]) {
if (debug)
log(-1, " %s ", sppp_lcp_opt_name(*p));
switch (*p) {
case LCP_OPT_MAGIC:
/* Magic number -- extract. */
nmagic = (u_long)p[2] << 24 |
(u_long)p[3] << 16 | p[4] << 8 | p[5];
if (nmagic != sp->lcp.magic) {
sp->pp_loopcnt = 0;
if (debug)
log(-1, "0x%lx ", nmagic);
continue;
}
if (debug && sp->pp_loopcnt < MAXALIVECNT*5)
log(-1, "[glitch] ");
++sp->pp_loopcnt;
/*
* We negate our magic here and NAK it.  If we
* see it later in a NAK packet, we suggest a
* new one.
*/
nmagic = ~sp->lcp.magic;
/* We will NAK it. */
p[2] = nmagic >> 24;
p[3] = nmagic >> 16;
p[4] = nmagic >> 8;
p[5] = nmagic;
break;
case LCP_OPT_ASYNC_MAP:
/*
* Async control character map -- just ignore it.
*
* Quote from RFC 1662, chapter 6:
* To enable this functionality, synchronous PPP
* implementations MUST always respond to the
* Async-Control-Character-Map Configuration
* Option with the LCP Configure-Ack. However,
* acceptance of the Configuration Option does
* not imply that the synchronous implementation
* will do any ACCM mapping. Instead, all such
* octet mapping will be performed by the
* asynchronous-to-synchronous converter.
*/
continue;
case LCP_OPT_MRU:
/*
* Maximum receive unit.  Always agreeable,
* but ignored for now.
*/
sp->lcp.their_mru = p[2] * 256 + p[3];
if (debug)
log(-1, "%lu ", sp->lcp.their_mru);
continue;
case LCP_OPT_AUTH_PROTO:
authproto = (p[2] << 8) + p[3];
if (sp->myauth.proto != authproto) {
/* not agreed, nak */
if (debug)
log(-1, "[mine %s != his %s] ",
sppp_proto_name(sp->myauth.proto),
sppp_proto_name(authproto));
p[2] = sp->myauth.proto >> 8;
p[3] = sp->myauth.proto;
break;
}
if (authproto == PPP_CHAP && p[4] != CHAP_MD5) {
if (debug)
log(-1, "[chap not MD5] ");
p[4] = CHAP_MD5;
break;
}
continue;
}
/* Add the option to nak'ed list. */
bcopy (p, r, p[1]);
r += p[1];
rlen += p[1];
}
if (rlen) {
/*
* Local and remote magics equal -- loopback?
*/
if (sp->pp_loopcnt >= MAXALIVECNT*5) {
if (sp->pp_loopcnt == MAXALIVECNT*5)
printf (SPP_FMT "loopback\n",
SPP_ARGS(ifp));
if (ifp->if_flags & IFF_UP) {
if_down(ifp);
sppp_qflush(&sp->pp_cpq);
/* XXX ? */
lcp.Down(sp);
lcp.Up(sp);
}
} else if (!sp->pp_loopcnt &&
++sp->fail_counter[IDX_LCP] >= sp->lcp.max_failure) {
if (debug)
log(-1, " max_failure (%d) exceeded, "
"send conf-rej\n",
sp->lcp.max_failure);
sppp_cp_send(sp, PPP_LCP, CONF_REJ, h->ident, rlen, buf);
} else {
if (debug)
log(-1, " send conf-nak\n");
sppp_cp_send (sp, PPP_LCP, CONF_NAK, h->ident, rlen, buf);
}
} else {
if (debug)
log(-1, " send conf-ack\n");
sp->fail_counter[IDX_LCP] = 0;
sp->pp_loopcnt = 0;
sppp_cp_send (sp, PPP_LCP, CONF_ACK,
h->ident, origlen, h+1);
}
free (buf, M_TEMP);
return (rlen == 0);
}
/*
* Analyze the LCP Configure-Reject option list, and adjust our
* negotiation.
*/
static void
sppp_lcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len)
{
STDDCL;
u_char *buf, *p;
len -= 4;
buf = malloc (len, M_TEMP, M_NOWAIT);
if (!buf)
return;
if (debug)
log(LOG_DEBUG, SPP_FMT "lcp rej opts: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (; len >= 2 && p[1] >= 2 && len >= p[1];
len -= p[1], p += p[1]) {
if (debug)
log(-1, " %s ", sppp_lcp_opt_name(*p));
switch (*p) {
case LCP_OPT_MAGIC:
/* Magic number -- can't use it, use 0 */
sp->lcp.opts &= ~(1 << LCP_OPT_MAGIC);
sp->lcp.magic = 0;
break;
case LCP_OPT_MRU:
/*
* Should not be rejected anyway, since we only
* negotiate an MRU if explicitly requested by
* the peer.
*/
sp->lcp.opts &= ~(1 << LCP_OPT_MRU);
break;
case LCP_OPT_AUTH_PROTO:
/*
* The peer doesn't want to authenticate itself;
* deny, unless this is a dialout call and
* AUTHFLAG_NOCALLOUT is set.
*/
if ((sp->pp_flags & PP_CALLIN) == 0 &&
(sp->hisauth.flags & AUTHFLAG_NOCALLOUT) != 0) {
if (debug)
log(-1, "[don't insist on auth "
"for callout]");
sp->lcp.opts &= ~(1 << LCP_OPT_AUTH_PROTO);
break;
}
if (debug)
log(-1, "[access denied]\n");
lcp.Close(sp);
break;
}
}
if (debug)
log(-1, "\n");
free (buf, M_TEMP);
return;
}
/*
* Analyze the LCP Configure-NAK option list, and adjust our
* negotiation.
*/
static void
sppp_lcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len)
{
STDDCL;
u_char *buf, *p;
u_long magic;
len -= 4;
buf = malloc (len, M_TEMP, M_NOWAIT);
if (!buf)
return;
if (debug)
log(LOG_DEBUG, SPP_FMT "lcp nak opts: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (; len >= 2 && p[1] >= 2 && len >= p[1];
len -= p[1], p += p[1]) {
if (debug)
log(-1, " %s ", sppp_lcp_opt_name(*p));
switch (*p) {
case LCP_OPT_MAGIC:
/* Magic number -- renegotiate */
if ((sp->lcp.opts & (1 << LCP_OPT_MAGIC)) &&
len >= 6 && p[1] == 6) {
magic = (u_long)p[2] << 24 |
(u_long)p[3] << 16 | p[4] << 8 | p[5];
/*
* If the remote magic is our negated one,
* this looks like a loopback problem.
* Suggest a new magic to make sure.
*/
if (magic == ~sp->lcp.magic) {
if (debug)
log(-1, "magic glitch ");
sp->lcp.magic = random();
} else {
sp->lcp.magic = magic;
if (debug)
log(-1, "%lu ", magic);
}
}
break;
case LCP_OPT_MRU:
/*
* Peer wants to advise us to negotiate an MRU.
* Agree on it if it's reasonable, or use the
* default otherwise.
*/
if (len >= 4 && p[1] == 4) {
u_int mru = p[2] * 256 + p[3];
if (debug)
log(-1, "%d ", mru);
if (mru < PP_MTU || mru > PP_MAX_MRU)
mru = PP_MTU;
sp->lcp.mru = mru;
sp->lcp.opts |= (1 << LCP_OPT_MRU);
}
break;
case LCP_OPT_AUTH_PROTO:
/*
* The peer doesn't like our authentication
* method; deny.
*/
if (debug)
log(-1, "[access denied]\n");
lcp.Close(sp);
break;
}
}
if (debug)
log(-1, "\n");
free (buf, M_TEMP);
return;
}
static void
sppp_lcp_tlu(struct sppp *sp)
{
STDDCL;
int i;
u_long mask;
/* XXX ? */
if (! (ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
/* Coming out of loopback mode. */
if_up(ifp);
printf (SPP_FMT "up\n", SPP_ARGS(ifp));
}
for (i = 0; i < IDX_COUNT; i++)
if ((cps[i])->flags & CP_QUAL)
(cps[i])->Open(sp);
if ((sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0 ||
(sp->pp_flags & PP_NEEDAUTH) != 0)
sp->pp_phase = PHASE_AUTHENTICATE;
else
sp->pp_phase = PHASE_NETWORK;
if (debug)
log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp),
sppp_phase_name(sp->pp_phase));
/*
* Open all authentication protocols.  This is required even
* if we have already proceeded to network phase, since the
* remote may still want us to authenticate, in which case we
* might have to send a PAP request.  Undesired authentication
* protocols simply do nothing when they get an Open event.
*/
for (i = 0; i < IDX_COUNT; i++)
if ((cps[i])->flags & CP_AUTH)
(cps[i])->Open(sp);
if (sp->pp_phase == PHASE_NETWORK) {
/* Notify all NCPs. */
for (i = 0; i < IDX_COUNT; i++)
if (((cps[i])->flags & CP_NCP) &&
/*
* XXX
* Hack to administratively disable IPv6 if
* not desired. Perhaps we should have another
* flag for this, but right now, we can make
* all struct cp's read/only.
*/
(cps[i] != &ipv6cp ||
(sp->confflags & CONF_ENABLE_IPV6)))
(cps[i])->Open(sp);
}
/* Send Up events to all started protos. */
for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1)
if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0)
(cps[i])->Up(sp);
/* notify low-level driver of state change */
if (sp->pp_chg)
sp->pp_chg(sp, (int)sp->pp_phase);
if (sp->pp_phase == PHASE_NETWORK)
/* if no NCP is starting, close down */
sppp_lcp_check_and_close(sp);
}
static void
sppp_lcp_tld(struct sppp *sp)
{
STDDCL;
int i;
u_long mask;
sp->pp_phase = PHASE_TERMINATE;
if (debug)
log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp),
sppp_phase_name(sp->pp_phase));
/*
* Take upper layers down. We send the Down event first and
* the Close second to prevent the upper layers from sending
* ``a flurry of terminate-request packets'', as the RFC
* describes it.
*/
for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1)
if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_LCP) == 0) {
(cps[i])->Down(sp);
(cps[i])->Close(sp);
}
}
static void
sppp_lcp_tls(struct sppp *sp)
{
STDDCL;
sp->pp_phase = PHASE_ESTABLISH;
if (debug)
log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp),
sppp_phase_name(sp->pp_phase));
/* Notify lower layer if desired. */
if (sp->pp_tls)
(sp->pp_tls)(sp);
else
(sp->pp_up)(sp);
}
static void
sppp_lcp_tlf(struct sppp *sp)
{
STDDCL;
sp->pp_phase = PHASE_DEAD;
if (debug)
log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp),
sppp_phase_name(sp->pp_phase));
/* Notify lower layer if desired. */
if (sp->pp_tlf)
(sp->pp_tlf)(sp);
else
(sp->pp_down)(sp);
}
static void
sppp_lcp_scr(struct sppp *sp)
{
char opt[6 /* magicnum */ + 4 /* mru */ + 5 /* chap */];
int i = 0;
u_short authproto;
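/*
* Build the Configure-Request option list.  Each option is
* type, length, data: the magic number takes 6 octets, the
* MRU 4, and the auth-proto option 4 (PAP) or 5 (CHAP, one
* extra octet for the MD5 algorithm identifier).
*/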
if (sp->lcp.opts & (1 << LCP_OPT_MAGIC)) {
if (! sp->lcp.magic)
sp->lcp.magic = random();
opt[i++] = LCP_OPT_MAGIC;
opt[i++] = 6;
opt[i++] = sp->lcp.magic >> 24;
opt[i++] = sp->lcp.magic >> 16;
opt[i++] = sp->lcp.magic >> 8;
opt[i++] = sp->lcp.magic;
}
if (sp->lcp.opts & (1 << LCP_OPT_MRU)) {
opt[i++] = LCP_OPT_MRU;
opt[i++] = 4;
opt[i++] = sp->lcp.mru >> 8;
opt[i++] = sp->lcp.mru;
}
if (sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) {
authproto = sp->hisauth.proto;
opt[i++] = LCP_OPT_AUTH_PROTO;
opt[i++] = authproto == PPP_CHAP? 5: 4;
opt[i++] = authproto >> 8;
opt[i++] = authproto;
if (authproto == PPP_CHAP)
opt[i++] = CHAP_MD5;
}
sp->confid[IDX_LCP] = ++sp->pp_seq[IDX_LCP];
sppp_cp_send (sp, PPP_LCP, CONF_REQ, sp->confid[IDX_LCP], i, &opt);
}
/*
* Check the open NCPs, return true if at least one NCP is open.
*/
static int
sppp_ncp_check(struct sppp *sp)
{
int i, mask;
for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1)
if ((sp->lcp.protos & mask) && (cps[i])->flags & CP_NCP)
return 1;
return 0;
}
/*
* Re-check the open NCPs and see if we should terminate the link.
* Called by the NCPs during their tlf action handling.
*/
static void
sppp_lcp_check_and_close(struct sppp *sp)
{
if (sp->pp_phase < PHASE_NETWORK)
/* don't bother, we are already going down */
return;
if (sppp_ncp_check(sp))
return;
lcp.Close(sp);
}
/*
*--------------------------------------------------------------------------*
* *
* The IPCP implementation. *
* *
*--------------------------------------------------------------------------*
*/
static void
sppp_ipcp_init(struct sppp *sp)
{
sp->ipcp.opts = 0;
sp->ipcp.flags = 0;
sp->state[IDX_IPCP] = STATE_INITIAL;
sp->fail_counter[IDX_IPCP] = 0;
sp->pp_seq[IDX_IPCP] = 0;
sp->pp_rseq[IDX_IPCP] = 0;
callout_init(&sp->ch[IDX_IPCP],
(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
}
static void
sppp_ipcp_up(struct sppp *sp)
{
sppp_up_event(&ipcp, sp);
}
static void
sppp_ipcp_down(struct sppp *sp)
{
sppp_down_event(&ipcp, sp);
}
static void
sppp_ipcp_open(struct sppp *sp)
{
STDDCL;
u_long myaddr, hisaddr;
sp->ipcp.flags &= ~(IPCP_HISADDR_SEEN | IPCP_MYADDR_SEEN |
IPCP_MYADDR_DYN | IPCP_VJ);
sp->ipcp.opts = 0;
sppp_get_ip_addrs(sp, &myaddr, &hisaddr, 0);
/*
* If we don't have his address, this probably means our
* interface doesn't want to talk IP at all. (This could
* be the case if somebody wants to speak only IPX, for
* example.) Don't open IPCP in this case.
*/
if (hisaddr == 0L) {
/* XXX this message should go away */
if (debug)
log(LOG_DEBUG, SPP_FMT "ipcp_open(): no IP interface\n",
SPP_ARGS(ifp));
return;
}
if (myaddr == 0L) {
/*
* We don't have an assigned address, so we need
* to negotiate one.
*/
sp->ipcp.flags |= IPCP_MYADDR_DYN;
sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS);
} else
sp->ipcp.flags |= IPCP_MYADDR_SEEN;
if (sp->confflags & CONF_ENABLE_VJ) {
sp->ipcp.opts |= (1 << IPCP_OPT_COMPRESSION);
sp->ipcp.max_state = MAX_STATES - 1;
sp->ipcp.compress_cid = 1;
}
sppp_open_event(&ipcp, sp);
}
static void
sppp_ipcp_close(struct sppp *sp)
{
sppp_close_event(&ipcp, sp);
if (sp->ipcp.flags & IPCP_MYADDR_DYN)
/*
* My address was dynamic, clear it again.
*/
sppp_set_ip_addr(sp, 0L);
}
static void
sppp_ipcp_TO(void *cookie)
{
sppp_to_event(&ipcp, (struct sppp *)cookie);
}
/*
* Analyze a configure request. Return true if it was agreeable, and
* caused action sca, false if it has been rejected or nak'ed, and
* caused action scn. (The return value is used to make the state
* transition decision in the state automaton.)
*/
static int
sppp_ipcp_RCR(struct sppp *sp, struct lcp_header *h, int len)
{
u_char *buf, *r, *p;
struct ifnet *ifp = SP2IFP(sp);
int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG;
u_long hisaddr, desiredaddr;
int gotmyaddr = 0;
int desiredcomp;
len -= 4;
origlen = len;
/*
* Make sure to allocate a buf that can at least hold a
* conf-nak with an `address' option. We might need it below.
*/
buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT);
if (! buf)
return (0);
/* pass 1: see if we can recognize them */
if (debug)
log(LOG_DEBUG, SPP_FMT "ipcp parse opts: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1];
len-=p[1], p+=p[1]) {
if (debug)
log(-1, " %s ", sppp_ipcp_opt_name(*p));
switch (*p) {
case IPCP_OPT_COMPRESSION:
if (!(sp->confflags & CONF_ENABLE_VJ)) {
/* VJ compression administratively disabled */
if (debug)
log(-1, "[locally disabled] ");
break;
}
/*
* In theory, we should only conf-rej an
* option that is shorter than RFC 1618
* requires (i.e. < 4), and should conf-nak
* anything else that is not VJ. However,
* since our algorithm always uses the
* original option to NAK it with new values,
* things would become more complicated.  In
* practice, the only commonly implemented IP
* compression option is VJ anyway, so the
* difference is negligible.
*/
if (len >= 6 && p[1] == 6) {
/*
* correctly formed compression option
* that could be VJ compression
*/
continue;
}
if (debug)
log(-1,
"optlen %d [invalid/unsupported] ",
p[1]);
break;
case IPCP_OPT_ADDRESS:
if (len >= 6 && p[1] == 6) {
/* correctly formed address option */
continue;
}
if (debug)
log(-1, "[invalid] ");
break;
default:
/* Others not supported. */
if (debug)
log(-1, "[rej] ");
break;
}
/* Add the option to rejected list. */
bcopy (p, r, p[1]);
r += p[1];
rlen += p[1];
}
if (rlen) {
if (debug)
log(-1, " send conf-rej\n");
sppp_cp_send (sp, PPP_IPCP, CONF_REJ, h->ident, rlen, buf);
return 0;
} else if (debug)
log(-1, "\n");
/* pass 2: parse option values */
sppp_get_ip_addrs(sp, 0, &hisaddr, 0);
if (debug)
log(LOG_DEBUG, SPP_FMT "ipcp parse opt values: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
len = origlen;
for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1];
len-=p[1], p+=p[1]) {
if (debug)
log(-1, " %s ", sppp_ipcp_opt_name(*p));
switch (*p) {
case IPCP_OPT_COMPRESSION:
desiredcomp = p[2] << 8 | p[3];
/* We only support VJ */
if (desiredcomp == IPCP_COMP_VJ) {
if (debug)
log(-1, "VJ [ack] ");
sp->ipcp.flags |= IPCP_VJ;
sl_compress_init(sp->pp_comp, p[4]);
sp->ipcp.max_state = p[4];
sp->ipcp.compress_cid = p[5];
continue;
}
if (debug)
log(-1,
"compproto %#04x [not supported] ",
desiredcomp);
p[2] = IPCP_COMP_VJ >> 8;
p[3] = IPCP_COMP_VJ;
p[4] = sp->ipcp.max_state;
p[5] = sp->ipcp.compress_cid;
break;
case IPCP_OPT_ADDRESS:
/* This is the address he wants on his end. */
desiredaddr = p[2] << 24 | p[3] << 16 |
p[4] << 8 | p[5];
if (desiredaddr == hisaddr ||
(hisaddr >= 1 && hisaddr <= 254 && desiredaddr != 0)) {
/*
* The peer's address is the same as our value,
* or we have set ours to 0.0.0.* to indicate
* that we do not really care; this is
* agreeable, so conf-ack it.
*/
if (debug)
log(-1, "%s [ack] ",
sppp_dotted_quad(hisaddr));
/* record that we've seen it already */
sp->ipcp.flags |= IPCP_HISADDR_SEEN;
continue;
}
/*
* The address wasn't agreeable.  Either he
* sent us 0.0.0.0, asking us to assign him an
* address, or he sent us another address that
* does not match our value.  In either case,
* we conf-nak it with our value.
* XXX: we should "rej" if hisaddr == 0
*/
if (debug) {
if (desiredaddr == 0)
log(-1, "[addr requested] ");
else
log(-1, "%s [not agreed] ",
sppp_dotted_quad(desiredaddr));
}
p[2] = hisaddr >> 24;
p[3] = hisaddr >> 16;
p[4] = hisaddr >> 8;
p[5] = hisaddr;
break;
}
/* Add the option to nak'ed list. */
bcopy (p, r, p[1]);
r += p[1];
rlen += p[1];
}
/*
* If we are about to conf-ack the request but haven't seen
* his address so far, conf-nak it instead, with the
* `address' option present and our idea of his address
* filled in, to request negotiation of both addresses.
*
* XXX This can result in an endless req - nak loop if the
* peer doesn't want to send us his address.  Q: What should
* we do about it?  A: implement the max-failure counter.
*/
if (rlen == 0 && !(sp->ipcp.flags & IPCP_HISADDR_SEEN) && !gotmyaddr) {
buf[0] = IPCP_OPT_ADDRESS;
buf[1] = 6;
buf[2] = hisaddr >> 24;
buf[3] = hisaddr >> 16;
buf[4] = hisaddr >> 8;
buf[5] = hisaddr;
rlen = 6;
if (debug)
log(-1, "still need hisaddr ");
}
if (rlen) {
if (debug)
log(-1, " send conf-nak\n");
sppp_cp_send (sp, PPP_IPCP, CONF_NAK, h->ident, rlen, buf);
} else {
if (debug)
log(-1, " send conf-ack\n");
sppp_cp_send (sp, PPP_IPCP, CONF_ACK,
h->ident, origlen, h+1);
}
free (buf, M_TEMP);
return (rlen == 0);
}
/*
* Analyze the IPCP Configure-Reject option list, and adjust our
* negotiation.
*/
static void
sppp_ipcp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len)
{
u_char *buf, *p;
struct ifnet *ifp = SP2IFP(sp);
int debug = ifp->if_flags & IFF_DEBUG;
len -= 4;
buf = malloc (len, M_TEMP, M_NOWAIT);
if (!buf)
return;
if (debug)
log(LOG_DEBUG, SPP_FMT "ipcp rej opts: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (; len >= 2 && p[1] >= 2 && len >= p[1];
len -= p[1], p += p[1]) {
if (debug)
log(-1, " %s ", sppp_ipcp_opt_name(*p));
switch (*p) {
case IPCP_OPT_COMPRESSION:
sp->ipcp.opts &= ~(1 << IPCP_OPT_COMPRESSION);
break;
case IPCP_OPT_ADDRESS:
/*
* The peer doesn't grok the address option.  This
* is bad.  XXX Should we just give up here?
* XXX We could try the old "addresses" option...
*/
sp->ipcp.opts &= ~(1 << IPCP_OPT_ADDRESS);
break;
}
}
if (debug)
log(-1, "\n");
free (buf, M_TEMP);
return;
}
/*
* Analyze the IPCP Configure-NAK option list, and adjust our
* negotiation.
*/
static void
sppp_ipcp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len)
{
u_char *buf, *p;
struct ifnet *ifp = SP2IFP(sp);
int debug = ifp->if_flags & IFF_DEBUG;
int desiredcomp;
u_long wantaddr;
len -= 4;
buf = malloc (len, M_TEMP, M_NOWAIT);
if (!buf)
return;
if (debug)
log(LOG_DEBUG, SPP_FMT "ipcp nak opts: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (; len >= 2 && p[1] >= 2 && len >= p[1];
len -= p[1], p += p[1]) {
if (debug)
log(-1, " %s ", sppp_ipcp_opt_name(*p));
switch (*p) {
case IPCP_OPT_COMPRESSION:
if (len >= 6 && p[1] == 6) {
desiredcomp = p[2] << 8 | p[3];
if (debug)
log(-1, "[wantcomp %#04x] ",
desiredcomp);
if (desiredcomp == IPCP_COMP_VJ) {
sl_compress_init(sp->pp_comp, p[4]);
sp->ipcp.max_state = p[4];
sp->ipcp.compress_cid = p[5];
if (debug)
log(-1, "[agree] ");
} else
sp->ipcp.opts &=
~(1 << IPCP_OPT_COMPRESSION);
}
break;
case IPCP_OPT_ADDRESS:
/*
* The peer doesn't like our local IP address.  See
* if we can do something for him: he may have
* suggested an address for us to use.
*/
if (len >= 6 && p[1] == 6) {
wantaddr = p[2] << 24 | p[3] << 16 |
p[4] << 8 | p[5];
sp->ipcp.opts |= (1 << IPCP_OPT_ADDRESS);
if (debug)
log(-1, "[wantaddr %s] ",
sppp_dotted_quad(wantaddr));
/*
* When doing dynamic address assignment,
* we accept his offer. Otherwise, we
* ignore it and thus continue to negotiate
* our already existing value.
* XXX: Bogus; if he said no once, he'll
* just say no again, so we might as well give up.
*/
if (sp->ipcp.flags & IPCP_MYADDR_DYN) {
sppp_set_ip_addr(sp, wantaddr);
if (debug)
log(-1, "[agree] ");
sp->ipcp.flags |= IPCP_MYADDR_SEEN;
}
}
break;
}
}
if (debug)
log(-1, "\n");
free (buf, M_TEMP);
return;
}
static void
sppp_ipcp_tlu(struct sppp *sp)
{
/* we are up - notify isdn daemon */
if (sp->pp_con)
sp->pp_con(sp);
}
static void
sppp_ipcp_tld(struct sppp *sp)
{
}
static void
sppp_ipcp_tls(struct sppp *sp)
{
/* indicate to LCP that it must stay alive */
sp->lcp.protos |= (1 << IDX_IPCP);
}
static void
sppp_ipcp_tlf(struct sppp *sp)
{
/* we no longer need LCP */
sp->lcp.protos &= ~(1 << IDX_IPCP);
sppp_lcp_check_and_close(sp);
}
static void
sppp_ipcp_scr(struct sppp *sp)
{
char opt[6 /* compression */ + 6 /* address */];
u_long ouraddr;
int i = 0;
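/*
* Build the IPCP Configure-Request options (RFC 1332): the
* VJ compression option is 6 octets (type, length, 2-octet
* protocol, max-slot, comp-slot), and the address option is
* 6 octets (type, length, 4-octet IPv4 address).
*/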
if (sp->ipcp.opts & (1 << IPCP_OPT_COMPRESSION)) {
opt[i++] = IPCP_OPT_COMPRESSION;
opt[i++] = 6;
opt[i++] = IPCP_COMP_VJ >> 8;
opt[i++] = IPCP_COMP_VJ;
opt[i++] = sp->ipcp.max_state;
opt[i++] = sp->ipcp.compress_cid;
}
if (sp->ipcp.opts & (1 << IPCP_OPT_ADDRESS)) {
sppp_get_ip_addrs(sp, &ouraddr, 0, 0);
opt[i++] = IPCP_OPT_ADDRESS;
opt[i++] = 6;
opt[i++] = ouraddr >> 24;
opt[i++] = ouraddr >> 16;
opt[i++] = ouraddr >> 8;
opt[i++] = ouraddr;
}
sp->confid[IDX_IPCP] = ++sp->pp_seq[IDX_IPCP];
sppp_cp_send(sp, PPP_IPCP, CONF_REQ, sp->confid[IDX_IPCP], i, &opt);
}
/*
*--------------------------------------------------------------------------*
* *
* The IPv6CP implementation. *
* *
*--------------------------------------------------------------------------*
*/
#ifdef INET6
static void
sppp_ipv6cp_init(struct sppp *sp)
{
sp->ipv6cp.opts = 0;
sp->ipv6cp.flags = 0;
sp->state[IDX_IPV6CP] = STATE_INITIAL;
sp->fail_counter[IDX_IPV6CP] = 0;
sp->pp_seq[IDX_IPV6CP] = 0;
sp->pp_rseq[IDX_IPV6CP] = 0;
callout_init(&sp->ch[IDX_IPV6CP],
(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
}
static void
sppp_ipv6cp_up(struct sppp *sp)
{
sppp_up_event(&ipv6cp, sp);
}
static void
sppp_ipv6cp_down(struct sppp *sp)
{
sppp_down_event(&ipv6cp, sp);
}
static void
sppp_ipv6cp_open(struct sppp *sp)
{
STDDCL;
struct in6_addr myaddr, hisaddr;
#ifdef IPV6CP_MYIFID_DYN
sp->ipv6cp.flags &= ~(IPV6CP_MYIFID_SEEN|IPV6CP_MYIFID_DYN);
#else
sp->ipv6cp.flags &= ~IPV6CP_MYIFID_SEEN;
#endif
sppp_get_ip6_addrs(sp, &myaddr, &hisaddr, 0);
/*
* If we don't have our address, this probably means our
* interface doesn't want to talk IPv6 at all. (This could
* be the case if somebody wants to speak only IPX, for
* example.) Don't open IPv6CP in this case.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&myaddr)) {
/* XXX this message should go away */
if (debug)
log(LOG_DEBUG, SPP_FMT "ipv6cp_open(): no IPv6 interface\n",
SPP_ARGS(ifp));
return;
}
sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN;
sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID);
sppp_open_event(&ipv6cp, sp);
}
static void
sppp_ipv6cp_close(struct sppp *sp)
{
sppp_close_event(&ipv6cp, sp);
}
static void
sppp_ipv6cp_TO(void *cookie)
{
sppp_to_event(&ipv6cp, (struct sppp *)cookie);
}
/*
* Analyze a configure request. Return true if it was agreeable, and
* caused action sca, false if it has been rejected or nak'ed, and
* caused action scn. (The return value is used to make the state
* transition decision in the state automaton.)
*/
static int
sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len)
{
u_char *buf, *r, *p;
struct ifnet *ifp = SP2IFP(sp);
int rlen, origlen, debug = ifp->if_flags & IFF_DEBUG;
struct in6_addr myaddr, desiredaddr, suggestaddr;
int ifidcount;
int type;
int collision, nohisaddr;
char ip6buf[INET6_ADDRSTRLEN];
len -= 4;
origlen = len;
/*
* Make sure to allocate a buf that can at least hold a
* conf-nak with an `address' option. We might need it below.
*/
buf = r = malloc ((len < 6? 6: len), M_TEMP, M_NOWAIT);
if (! buf)
return (0);
/* pass 1: see if we can recognize them */
if (debug)
log(LOG_DEBUG, SPP_FMT "ipv6cp parse opts:",
SPP_ARGS(ifp));
p = (void*) (h+1);
ifidcount = 0;
for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1];
len-=p[1], p+=p[1]) {
if (debug)
log(-1, " %s", sppp_ipv6cp_opt_name(*p));
switch (*p) {
case IPV6CP_OPT_IFID:
if (len >= 10 && p[1] == 10 && ifidcount == 0) {
/* correctly formed address option */
ifidcount++;
continue;
}
if (debug)
log(-1, " [invalid]");
break;
#ifdef notyet
case IPV6CP_OPT_COMPRESSION:
if (len >= 4 && p[1] >= 4) {
/* correctly formed compress option */
continue;
}
if (debug)
log(-1, " [invalid]");
break;
#endif
default:
/* Others not supported. */
if (debug)
log(-1, " [rej]");
break;
}
/* Add the option to rejected list. */
bcopy (p, r, p[1]);
r += p[1];
rlen += p[1];
}
if (rlen) {
if (debug)
log(-1, " send conf-rej\n");
sppp_cp_send (sp, PPP_IPV6CP, CONF_REJ, h->ident, rlen, buf);
goto end;
} else if (debug)
log(-1, "\n");
/* pass 2: parse option values */
sppp_get_ip6_addrs(sp, &myaddr, 0, 0);
if (debug)
log(LOG_DEBUG, SPP_FMT "ipv6cp parse opt values: ",
SPP_ARGS(ifp));
p = (void*) (h+1);
len = origlen;
type = CONF_ACK;
for (rlen=0; len >= 2 && p[1] >= 2 && len >= p[1];
len-=p[1], p+=p[1]) {
if (debug)
log(-1, " %s", sppp_ipv6cp_opt_name(*p));
switch (*p) {
#ifdef notyet
case IPV6CP_OPT_COMPRESSION:
continue;
#endif
case IPV6CP_OPT_IFID:
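/*
* The option carries a 64-bit interface identifier
* (RFC 2472); splice it into the low half of a
* link-local (fe80::/64) address for comparison
* and logging.
*/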
bzero(&desiredaddr, sizeof(desiredaddr));
bcopy(&p[2], &desiredaddr.s6_addr[8], 8);
collision = (bcmp(&desiredaddr.s6_addr[8],
&myaddr.s6_addr[8], 8) == 0);
nohisaddr = IN6_IS_ADDR_UNSPECIFIED(&desiredaddr);
desiredaddr.s6_addr16[0] = htons(0xfe80);
(void)in6_setscope(&desiredaddr, SP2IFP(sp), NULL);
if (!collision && !nohisaddr) {
/* no collision, hisaddr known - Conf-Ack */
type = CONF_ACK;
if (debug) {
log(-1, " %s [%s]",
ip6_sprintf(ip6buf, &desiredaddr),
sppp_cp_type_name(type));
}
continue;
}
bzero(&suggestaddr, sizeof(suggestaddr));
if (collision && nohisaddr) {
/* collision, hisaddr unknown - Conf-Rej */
type = CONF_REJ;
bzero(&p[2], 8);
} else {
/*
* - no collision, hisaddr unknown, or
* - collision, hisaddr known
* Conf-Nak, suggest hisaddr
*/
type = CONF_NAK;
sppp_suggest_ip6_addr(sp, &suggestaddr);
bcopy(&suggestaddr.s6_addr[8], &p[2], 8);
}
if (debug)
log(-1, " %s [%s]",
ip6_sprintf(ip6buf, &desiredaddr),
sppp_cp_type_name(type));
break;
}
/* Add the option to nak'ed list. */
bcopy (p, r, p[1]);
r += p[1];
rlen += p[1];
}
if (rlen == 0 && type == CONF_ACK) {
if (debug)
log(-1, " send %s\n", sppp_cp_type_name(type));
sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, origlen, h+1);
} else {
#ifdef DIAGNOSTIC
if (type == CONF_ACK)
panic("IPv6CP RCR: CONF_ACK with non-zero rlen");
#endif
if (debug) {
log(-1, " send %s suggest %s\n",
sppp_cp_type_name(type),
ip6_sprintf(ip6buf, &suggestaddr));
}
sppp_cp_send (sp, PPP_IPV6CP, type, h->ident, rlen, buf);
}
end:
free (buf, M_TEMP);
return (rlen == 0);
}
/*
* Analyze the IPv6CP Configure-Reject option list, and adjust our
* negotiation.
*/
static void
sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len)
{
u_char *buf, *p;
struct ifnet *ifp = SP2IFP(sp);
int debug = ifp->if_flags & IFF_DEBUG;
len -= 4;
buf = malloc (len, M_TEMP, M_NOWAIT);
if (!buf)
return;
if (debug)
log(LOG_DEBUG, SPP_FMT "ipv6cp rej opts:",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (; len >= 2 && p[1] >= 2 && len >= p[1];
len -= p[1], p += p[1]) {
if (debug)
log(-1, " %s", sppp_ipv6cp_opt_name(*p));
switch (*p) {
case IPV6CP_OPT_IFID:
/*
* The peer doesn't grok the interface-id option.
* This is bad.  XXX Should we just give up here?
*/
sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_IFID);
break;
#ifdef notyet
case IPV6CP_OPT_COMPRESS:
sp->ipv6cp.opts &= ~(1 << IPV6CP_OPT_COMPRESS);
break;
#endif
}
}
if (debug)
log(-1, "\n");
free (buf, M_TEMP);
return;
}
/*
* Analyze the IPv6CP Configure-NAK option list, and adjust our
* negotiation.
*/
static void
sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len)
{
u_char *buf, *p;
struct ifnet *ifp = SP2IFP(sp);
int debug = ifp->if_flags & IFF_DEBUG;
struct in6_addr suggestaddr;
char ip6buf[INET6_ADDRSTRLEN];
len -= 4;
buf = malloc (len, M_TEMP, M_NOWAIT);
if (!buf)
return;
if (debug)
log(LOG_DEBUG, SPP_FMT "ipv6cp nak opts:",
SPP_ARGS(ifp));
p = (void*) (h+1);
for (; len >= 2 && p[1] >= 2 && len >= p[1];
len -= p[1], p += p[1]) {
if (debug)
log(-1, " %s", sppp_ipv6cp_opt_name(*p));
switch (*p) {
case IPV6CP_OPT_IFID:
/*
* The peer doesn't like our local ifid.  See
* if we can do something for him: he may have
* suggested one for us to use.
*/
if (len < 10 || p[1] != 10)
break;
bzero(&suggestaddr, sizeof(suggestaddr));
suggestaddr.s6_addr16[0] = htons(0xfe80);
(void)in6_setscope(&suggestaddr, SP2IFP(sp), NULL);
bcopy(&p[2], &suggestaddr.s6_addr[8], 8);
sp->ipv6cp.opts |= (1 << IPV6CP_OPT_IFID);
if (debug)
log(-1, " [suggestaddr %s]",
ip6_sprintf(ip6buf, &suggestaddr));
#ifdef IPV6CP_MYIFID_DYN
/*
* When doing dynamic address assignment,
* we accept his offer.
*/
if (sp->ipv6cp.flags & IPV6CP_MYIFID_DYN) {
struct in6_addr lastsuggest;
/*
* If <suggested myaddr from peer> equals
* <hisaddr we suggested last time>, we have a
* collision; generate a new random ifid.
*/
sppp_suggest_ip6_addr(sp, &lastsuggest);
if (IN6_ARE_ADDR_EQUAL(&suggestaddr,
&lastsuggest)) {
if (debug)
log(-1, " [random]");
sppp_gen_ip6_addr(sp, &suggestaddr);
}
sppp_set_ip6_addr(sp, &suggestaddr, 0);
if (debug)
log(-1, " [agree]");
sp->ipv6cp.flags |= IPV6CP_MYIFID_SEEN;
}
#else
/*
* Since we do not do dynamic address assignment,
* we ignore it and thus continue to negotiate
* our already existing value.  This can possibly
* lead to an infinite request-reject loop.
*
* This is not likely, because we normally use an
* ifid based on the MAC address.
* If you have no ethernet card on the node, too bad.
* XXX should we use fail_counter?
*/
#endif
break;
#ifdef notyet
case IPV6CP_OPT_COMPRESS:
/*
* Peer wants different compression parameters.
*/
break;
#endif
}
}
if (debug)
log(-1, "\n");
free (buf, M_TEMP);
return;
}
static void
sppp_ipv6cp_tlu(struct sppp *sp)
{
/* we are up - notify isdn daemon */
if (sp->pp_con)
sp->pp_con(sp);
}
static void
sppp_ipv6cp_tld(struct sppp *sp)
{
}
static void
sppp_ipv6cp_tls(struct sppp *sp)
{
/* indicate to LCP that it must stay alive */
sp->lcp.protos |= (1 << IDX_IPV6CP);
}
static void
sppp_ipv6cp_tlf(struct sppp *sp)
{
#if 0 /* need #if 0 to close IPv6CP properly */
/* we no longer need LCP */
sp->lcp.protos &= ~(1 << IDX_IPV6CP);
sppp_lcp_check_and_close(sp);
#endif
}
static void
sppp_ipv6cp_scr(struct sppp *sp)
{
char opt[10 /* ifid */ + 4 /* compression, minimum */];
struct in6_addr ouraddr;
int i = 0;
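/*
* Build the IPv6CP Configure-Request options: the
* interface-identifier option is 10 octets (type, length,
* 8-octet ifid taken from the low half of our link-local
* address).
*/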
if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_IFID)) {
sppp_get_ip6_addrs(sp, &ouraddr, 0, 0);
opt[i++] = IPV6CP_OPT_IFID;
opt[i++] = 10;
bcopy(&ouraddr.s6_addr[8], &opt[i], 8);
i += 8;
}
#ifdef notyet
if (sp->ipv6cp.opts & (1 << IPV6CP_OPT_COMPRESSION)) {
opt[i++] = IPV6CP_OPT_COMPRESSION;
opt[i++] = 4;
opt[i++] = 0; /* TBD */
opt[i++] = 0; /* TBD */
/* variable length data may follow */
}
#endif
sp->confid[IDX_IPV6CP] = ++sp->pp_seq[IDX_IPV6CP];
sppp_cp_send(sp, PPP_IPV6CP, CONF_REQ, sp->confid[IDX_IPV6CP], i, &opt);
}
#else /*INET6*/
static void sppp_ipv6cp_init(struct sppp *sp)
{
}
static void sppp_ipv6cp_up(struct sppp *sp)
{
}
static void sppp_ipv6cp_down(struct sppp *sp)
{
}
static void sppp_ipv6cp_open(struct sppp *sp)
{
}
static void sppp_ipv6cp_close(struct sppp *sp)
{
}
static void sppp_ipv6cp_TO(void *sp)
{
}
static int sppp_ipv6cp_RCR(struct sppp *sp, struct lcp_header *h, int len)
{
return 0;
}
static void sppp_ipv6cp_RCN_rej(struct sppp *sp, struct lcp_header *h, int len)
{
}
static void sppp_ipv6cp_RCN_nak(struct sppp *sp, struct lcp_header *h, int len)
{
}
static void sppp_ipv6cp_tlu(struct sppp *sp)
{
}
static void sppp_ipv6cp_tld(struct sppp *sp)
{
}
static void sppp_ipv6cp_tls(struct sppp *sp)
{
}
static void sppp_ipv6cp_tlf(struct sppp *sp)
{
}
static void sppp_ipv6cp_scr(struct sppp *sp)
{
}
#endif /*INET6*/
/*
*--------------------------------------------------------------------------*
* *
* The CHAP implementation. *
* *
*--------------------------------------------------------------------------*
*/
/*
* The authentication protocols don't employ a full-fledged state machine as
* the control protocols do: they have Open and Close events, but not
* Up and Down, nor are they explicitly terminated.  Also, use of the
* authentication protocols may differ in the two directions.  (This
* makes sense: think of a machine that never accepts incoming calls
* but only calls out; it doesn't require the called party to
* authenticate itself.)
*
* Our state machine for the local authentication protocol (we are requesting
* the peer to authenticate) looks like:
*
* RCA-
* +--------------------------------------------+
* V scn,tld|
* +--------+ Close +---------+ RCA+
* | |<----------------------------------| |------+
* +--->| Closed | TO* | Opened | sca |
* | | |-----+ +-------| |<-----+
* | +--------+ irc | | +---------+
* | ^ | | ^
* | | | | |
* | | | | |
* | TO-| | | |
* | |tld TO+ V | |
* | | +------->+ | |
* | | | | | |
* | +--------+ V | |
* | | |<----+<--------------------+ |
* | | Req- | scr |
* | | Sent | |
* | | | |
* | +--------+ |
* | RCA- | | RCA+ |
* +------+ +------------------------------------------+
* scn,tld sca,irc,ict,tlu
*
*
* with:
*
* Open: LCP reached authentication phase
* Close: LCP reached terminate phase
*
* RCA+: received reply (pap-req, chap-response), acceptable
* RCA-: received reply (pap-req, chap-response), not acceptable
* TO+: timeout with restart counter >= 0
* TO-: timeout with restart counter < 0
* TO*: reschedule timeout for CHAP
*
* scr: send request packet (none for PAP, chap-challenge)
* sca: send ack packet (pap-ack, chap-success)
* scn: send nak packet (pap-nak, chap-failure)
* ict: initialize re-challenge timer (CHAP only)
*
* tlu: this-layer-up, LCP reaches network phase
* tld: this-layer-down, LCP enters terminate phase
*
* Note that in CHAP mode, after sending a new challenge, while the state
* automaton falls back into Req-Sent state, it doesn't signal a tld
* event to LCP, so LCP remains in network phase.  Only after getting
* no response (or an unacceptable response) does CHAP close, causing
* LCP to enter terminate phase.
*
* With PAP, there is no initial request that can be sent. The peer is
* expected to send one based on the successful negotiation of PAP as
* the authentication protocol during the LCP option negotiation.
*
* Incoming authentication protocol requests (remote requests
* authentication, we are peer) don't employ a state machine at all,
* they are simply answered.  Some peers [Ascend P50 firmware rev
* 4.50] react allergically to receiving IPCP requests while they are
* still in authentication phase (thereby violating the standard, which
* demands that such NCP packets be discarded), so we keep track of
* the peer demanding that we authenticate, and only proceed to network
* phase once we've seen a positive acknowledgement of the
* authentication.
*/
/*
* Handle incoming CHAP packets.
*/
static void
sppp_chap_input(struct sppp *sp, struct mbuf *m)
{
STDDCL;
struct lcp_header *h;
int len, x;
u_char *value, *name, digest[AUTHKEYLEN], dsize;
int value_len, name_len;
MD5_CTX ctx;
len = m->m_pkthdr.len;
if (len < 4) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "chap invalid packet length: %d bytes\n",
SPP_ARGS(ifp), len);
return;
}
h = mtod (m, struct lcp_header*);
if (len > ntohs (h->len))
len = ntohs (h->len);
switch (h->type) {
/* challenge, failure and success are his authproto */
case CHAP_CHALLENGE:
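/*
* Challenge layout after the header (RFC 1994): one
* octet of value-size, the challenge value itself,
* then the peer name filling the rest of the packet.
*/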
value = 1 + (u_char*)(h+1);
value_len = value[-1];
name = value + value_len;
name_len = len - value_len - 5;
if (name_len < 0) {
if (debug) {
log(LOG_DEBUG,
SPP_FMT "chap corrupted challenge "
"<%s id=0x%x len=%d",
SPP_ARGS(ifp),
sppp_auth_type_name(PPP_CHAP, h->type),
h->ident, ntohs(h->len));
sppp_print_bytes((u_char*) (h+1), len-4);
log(-1, ">\n");
}
break;
}
if (debug) {
log(LOG_DEBUG,
SPP_FMT "chap input <%s id=0x%x len=%d name=",
SPP_ARGS(ifp),
sppp_auth_type_name(PPP_CHAP, h->type), h->ident,
ntohs(h->len));
sppp_print_string((char*) name, name_len);
log(-1, " value-size=%d value=", value_len);
sppp_print_bytes(value, value_len);
log(-1, ">\n");
}
/* Compute reply value. */
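/*
* Per RFC 1994, the response value is
* MD5(ident || our secret || peer's challenge); the peer
* runs the same computation over its copy of the secret
* to verify us.
*/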
MD5Init(&ctx);
MD5Update(&ctx, &h->ident, 1);
MD5Update(&ctx, sp->myauth.secret,
sppp_strnlen(sp->myauth.secret, AUTHKEYLEN));
MD5Update(&ctx, value, value_len);
MD5Final(digest, &ctx);
dsize = sizeof digest;
sppp_auth_send(&chap, sp, CHAP_RESPONSE, h->ident,
sizeof dsize, (const char *)&dsize,
sizeof digest, digest,
(size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN),
sp->myauth.name,
0);
break;
case CHAP_SUCCESS:
if (debug) {
log(LOG_DEBUG, SPP_FMT "chap success",
SPP_ARGS(ifp));
if (len > 4) {
log(-1, ": ");
sppp_print_string((char*)(h + 1), len - 4);
}
log(-1, "\n");
}
x = splimp();
SPPP_LOCK(sp);
sp->pp_flags &= ~PP_NEEDAUTH;
if (sp->myauth.proto == PPP_CHAP &&
(sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) &&
(sp->lcp.protos & (1 << IDX_CHAP)) == 0) {
/*
* We are authenticator for CHAP but didn't
* complete yet. Leave it to tlu to proceed
* to network phase.
*/
SPPP_UNLOCK(sp);
splx(x);
break;
}
SPPP_UNLOCK(sp);
splx(x);
sppp_phase_network(sp);
break;
case CHAP_FAILURE:
if (debug) {
log(LOG_INFO, SPP_FMT "chap failure",
SPP_ARGS(ifp));
if (len > 4) {
log(-1, ": ");
sppp_print_string((char*)(h + 1), len - 4);
}
log(-1, "\n");
} else
log(LOG_INFO, SPP_FMT "chap failure\n",
SPP_ARGS(ifp));
/* await LCP shutdown by authenticator */
break;
/* response is my authproto */
case CHAP_RESPONSE:
value = 1 + (u_char*)(h+1);
value_len = value[-1];
name = value + value_len;
name_len = len - value_len - 5;
if (name_len < 0) {
if (debug) {
log(LOG_DEBUG,
SPP_FMT "chap corrupted response "
"<%s id=0x%x len=%d",
SPP_ARGS(ifp),
sppp_auth_type_name(PPP_CHAP, h->type),
h->ident, ntohs(h->len));
sppp_print_bytes((u_char*)(h+1), len-4);
log(-1, ">\n");
}
break;
}
if (h->ident != sp->confid[IDX_CHAP]) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "chap dropping response for old ID "
"(got %d, expected %d)\n",
SPP_ARGS(ifp),
h->ident, sp->confid[IDX_CHAP]);
break;
}
if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN)
|| bcmp(name, sp->hisauth.name, name_len) != 0) {
log(LOG_INFO, SPP_FMT "chap response, his name ",
SPP_ARGS(ifp));
sppp_print_string(name, name_len);
log(-1, " != expected ");
sppp_print_string(sp->hisauth.name,
sppp_strnlen(sp->hisauth.name, AUTHNAMELEN));
log(-1, "\n");
}
if (debug) {
log(LOG_DEBUG, SPP_FMT "chap input(%s) "
"<%s id=0x%x len=%d name=",
SPP_ARGS(ifp),
sppp_state_name(sp->state[IDX_CHAP]),
sppp_auth_type_name(PPP_CHAP, h->type),
h->ident, ntohs (h->len));
sppp_print_string((char*)name, name_len);
log(-1, " value-size=%d value=", value_len);
sppp_print_bytes(value, value_len);
log(-1, ">\n");
}
if (value_len != AUTHKEYLEN) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "chap bad hash value length: "
"%d bytes, should be %d\n",
SPP_ARGS(ifp), value_len,
AUTHKEYLEN);
break;
}
MD5Init(&ctx);
MD5Update(&ctx, &h->ident, 1);
MD5Update(&ctx, sp->hisauth.secret,
sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN));
MD5Update(&ctx, sp->myauth.challenge, AUTHKEYLEN);
MD5Final(digest, &ctx);
#define FAILMSG "Failed..."
#define SUCCMSG "Welcome!"
if (value_len != sizeof digest ||
bcmp(digest, value, value_len) != 0) {
/* action scn, tld */
sppp_auth_send(&chap, sp, CHAP_FAILURE, h->ident,
sizeof(FAILMSG) - 1, (u_char *)FAILMSG,
0);
chap.tld(sp);
break;
}
/* action sca, perhaps tlu */
if (sp->state[IDX_CHAP] == STATE_REQ_SENT ||
sp->state[IDX_CHAP] == STATE_OPENED)
sppp_auth_send(&chap, sp, CHAP_SUCCESS, h->ident,
sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG,
0);
if (sp->state[IDX_CHAP] == STATE_REQ_SENT) {
sppp_cp_change_state(&chap, sp, STATE_OPENED);
chap.tlu(sp);
}
break;
default:
/* Unknown CHAP packet type -- ignore. */
if (debug) {
log(LOG_DEBUG, SPP_FMT "chap unknown input(%s) "
"<0x%x id=0x%xh len=%d",
SPP_ARGS(ifp),
sppp_state_name(sp->state[IDX_CHAP]),
h->type, h->ident, ntohs(h->len));
sppp_print_bytes((u_char*)(h+1), len-4);
log(-1, ">\n");
}
break;
}
}
static void
sppp_chap_init(struct sppp *sp)
{
/* CHAP doesn't have STATE_INITIAL at all. */
sp->state[IDX_CHAP] = STATE_CLOSED;
sp->fail_counter[IDX_CHAP] = 0;
sp->pp_seq[IDX_CHAP] = 0;
sp->pp_rseq[IDX_CHAP] = 0;
callout_init(&sp->ch[IDX_CHAP],
(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
}
static void
sppp_chap_open(struct sppp *sp)
{
if (sp->myauth.proto == PPP_CHAP &&
(sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) {
/* we are authenticator for CHAP, start it */
chap.scr(sp);
sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure;
sppp_cp_change_state(&chap, sp, STATE_REQ_SENT);
}
/* nothing to be done if we are peer, await a challenge */
}
static void
sppp_chap_close(struct sppp *sp)
{
if (sp->state[IDX_CHAP] != STATE_CLOSED)
sppp_cp_change_state(&chap, sp, STATE_CLOSED);
}
static void
sppp_chap_TO(void *cookie)
{
struct sppp *sp = (struct sppp *)cookie;
STDDCL;
int s;
s = splimp();
SPPP_LOCK(sp);
if (debug)
log(LOG_DEBUG, SPP_FMT "chap TO(%s) rst_counter = %d\n",
SPP_ARGS(ifp),
sppp_state_name(sp->state[IDX_CHAP]),
sp->rst_counter[IDX_CHAP]);
if (--sp->rst_counter[IDX_CHAP] < 0)
/* TO- event */
switch (sp->state[IDX_CHAP]) {
case STATE_REQ_SENT:
chap.tld(sp);
sppp_cp_change_state(&chap, sp, STATE_CLOSED);
break;
}
else
/* TO+ (or TO*) event */
switch (sp->state[IDX_CHAP]) {
case STATE_OPENED:
/* TO* event */
sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure;
/* FALLTHROUGH */
case STATE_REQ_SENT:
chap.scr(sp);
/* sppp_cp_change_state() will restart the timer */
sppp_cp_change_state(&chap, sp, STATE_REQ_SENT);
break;
}
SPPP_UNLOCK(sp);
splx(s);
}
static void
sppp_chap_tlu(struct sppp *sp)
{
STDDCL;
int i, x;
i = 0;
sp->rst_counter[IDX_CHAP] = sp->lcp.max_configure;
/*
* Some broken CHAP implementations (Conware CoNet, firmware
* 4.0.?) don't want to re-authenticate their CHAP once the
* initial challenge-response exchange has taken place.
* Provide for an option to avoid rechallenges.
*/
if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0) {
/*
* Compute the re-challenge timeout. This will yield
* a number between 300 and 810 seconds.
*/
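/* (random() & 0xff00) >> 7 ranges over 0..510 in steps of 2. */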
i = 300 + ((unsigned)(random() & 0xff00) >> 7);
callout_reset(&sp->ch[IDX_CHAP], i * hz, chap.TO, (void *)sp);
}
if (debug) {
log(LOG_DEBUG,
SPP_FMT "chap %s, ",
SPP_ARGS(ifp),
sp->pp_phase == PHASE_NETWORK? "reconfirmed": "tlu");
if ((sp->hisauth.flags & AUTHFLAG_NORECHALLENGE) == 0)
log(-1, "next re-challenge in %d seconds\n", i);
else
log(-1, "re-challenging supressed\n");
}
x = splimp();
SPPP_LOCK(sp);
/* indicate to LCP that we need to be closed down */
sp->lcp.protos |= (1 << IDX_CHAP);
if (sp->pp_flags & PP_NEEDAUTH) {
/*
* Remote is authenticator, but his auth proto didn't
* complete yet. Defer the transition to network
* phase.
*/
SPPP_UNLOCK(sp);
splx(x);
return;
}
SPPP_UNLOCK(sp);
splx(x);
/*
* If we are already in phase network, we are done here. This
* is the case if this is a dummy tlu event after a re-challenge.
*/
if (sp->pp_phase != PHASE_NETWORK)
sppp_phase_network(sp);
}
static void
sppp_chap_tld(struct sppp *sp)
{
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "chap tld\n", SPP_ARGS(ifp));
callout_stop(&sp->ch[IDX_CHAP]);
sp->lcp.protos &= ~(1 << IDX_CHAP);
lcp.Close(sp);
}
static void
sppp_chap_scr(struct sppp *sp)
{
u_int32_t *ch, seed;
u_char clen;
/*
* Compute a random challenge.  Use 32-bit words so the four
* stores below fill exactly AUTHKEYLEN (16) octets, even on
* LP64 platforms where u_long is 8 octets wide.
*/
ch = (u_int32_t *)sp->myauth.challenge;
read_random(&seed, sizeof seed);
ch[0] = seed ^ random();
ch[1] = seed ^ random();
ch[2] = seed ^ random();
ch[3] = seed ^ random();
clen = AUTHKEYLEN;
sp->confid[IDX_CHAP] = ++sp->pp_seq[IDX_CHAP];
sppp_auth_send(&chap, sp, CHAP_CHALLENGE, sp->confid[IDX_CHAP],
sizeof clen, (const char *)&clen,
(size_t)AUTHKEYLEN, sp->myauth.challenge,
(size_t)sppp_strnlen(sp->myauth.name, AUTHNAMELEN),
sp->myauth.name,
0);
}
/*
*--------------------------------------------------------------------------*
* *
* The PAP implementation. *
* *
*--------------------------------------------------------------------------*
*/
/*
* For PAP, we need to keep a little state also if we are the peer, not the
* authenticator.  This is because we don't get a request to authenticate, but
* have to repeatedly authenticate ourselves until we get a response (or the
* retry counter expires).
*/
/*
* Handle incoming PAP packets.
*/
static void
sppp_pap_input(struct sppp *sp, struct mbuf *m)
{
STDDCL;
struct lcp_header *h;
int len, x;
u_char *name, *passwd, mlen;
int name_len, passwd_len;
len = m->m_pkthdr.len;
if (len < 5) {
if (debug)
log(LOG_DEBUG,
SPP_FMT "pap invalid packet length: %d bytes\n",
SPP_ARGS(ifp), len);
return;
}
h = mtod (m, struct lcp_header*);
if (len > ntohs (h->len))
len = ntohs (h->len);
switch (h->type) {
/* PAP request is my authproto */
case PAP_REQ:
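/*
* Request layout after the header (RFC 1334): one octet
* of peer-id length, the peer-id, one octet of password
* length, then the password.
*/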
name = 1 + (u_char*)(h+1);
name_len = name[-1];
passwd = name + name_len + 1;
if (name_len > len - 6 ||
(passwd_len = passwd[-1]) > len - 6 - name_len) {
if (debug) {
log(LOG_DEBUG, SPP_FMT "pap corrupted input "
"<%s id=0x%x len=%d",
SPP_ARGS(ifp),
sppp_auth_type_name(PPP_PAP, h->type),
h->ident, ntohs(h->len));
sppp_print_bytes((u_char*)(h+1), len-4);
log(-1, ">\n");
}
break;
}
if (debug) {
log(LOG_DEBUG, SPP_FMT "pap input(%s) "
"<%s id=0x%x len=%d name=",
SPP_ARGS(ifp),
sppp_state_name(sp->state[IDX_PAP]),
sppp_auth_type_name(PPP_PAP, h->type),
h->ident, ntohs(h->len));
sppp_print_string((char*)name, name_len);
log(-1, " passwd=");
sppp_print_string((char*)passwd, passwd_len);
log(-1, ">\n");
}
if (name_len != sppp_strnlen(sp->hisauth.name, AUTHNAMELEN) ||
passwd_len != sppp_strnlen(sp->hisauth.secret, AUTHKEYLEN) ||
bcmp(name, sp->hisauth.name, name_len) != 0 ||
bcmp(passwd, sp->hisauth.secret, passwd_len) != 0) {
/* action scn, tld */
mlen = sizeof(FAILMSG) - 1;
sppp_auth_send(&pap, sp, PAP_NAK, h->ident,
sizeof mlen, (const char *)&mlen,
sizeof(FAILMSG) - 1, (u_char *)FAILMSG,
0);
pap.tld(sp);
break;
}
/* action sca, perhaps tlu */
if (sp->state[IDX_PAP] == STATE_REQ_SENT ||
sp->state[IDX_PAP] == STATE_OPENED) {
mlen = sizeof(SUCCMSG) - 1;
sppp_auth_send(&pap, sp, PAP_ACK, h->ident,
sizeof mlen, (const char *)&mlen,
sizeof(SUCCMSG) - 1, (u_char *)SUCCMSG,
0);
}
if (sp->state[IDX_PAP] == STATE_REQ_SENT) {
sppp_cp_change_state(&pap, sp, STATE_OPENED);
pap.tlu(sp);
}
break;
/* ack and nak are his authproto */
case PAP_ACK:
callout_stop(&sp->pap_my_to_ch);
if (debug) {
log(LOG_DEBUG, SPP_FMT "pap success",
SPP_ARGS(ifp));
name_len = *((char *)h);
if (len > 5 && name_len) {
log(-1, ": ");
sppp_print_string((char*)(h+1), name_len);
}
log(-1, "\n");
}
x = splimp();
SPPP_LOCK(sp);
sp->pp_flags &= ~PP_NEEDAUTH;
if (sp->myauth.proto == PPP_PAP &&
(sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) &&
(sp->lcp.protos & (1 << IDX_PAP)) == 0) {
/*
* We are authenticator for PAP but didn't
* complete yet. Leave it to tlu to proceed
* to network phase.
*/
SPPP_UNLOCK(sp);
splx(x);
break;
}
SPPP_UNLOCK(sp);
splx(x);
sppp_phase_network(sp);
break;
case PAP_NAK:
callout_stop (&sp->pap_my_to_ch);
if (debug) {
log(LOG_INFO, SPP_FMT "pap failure",
SPP_ARGS(ifp));
name_len = *((char *)h);
if (len > 5 && name_len) {
log(-1, ": ");
sppp_print_string((char*)(h+1), name_len);
}
log(-1, "\n");
} else
log(LOG_INFO, SPP_FMT "pap failure\n",
SPP_ARGS(ifp));
/* await LCP shutdown by authenticator */
break;
default:
/* Unknown PAP packet type -- ignore. */
if (debug) {
log(LOG_DEBUG, SPP_FMT "pap corrupted input "
"<0x%x id=0x%x len=%d",
SPP_ARGS(ifp),
h->type, h->ident, ntohs(h->len));
sppp_print_bytes((u_char*)(h+1), len-4);
log(-1, ">\n");
}
break;
}
}
static void
sppp_pap_init(struct sppp *sp)
{
/* PAP doesn't have STATE_INITIAL at all. */
sp->state[IDX_PAP] = STATE_CLOSED;
sp->fail_counter[IDX_PAP] = 0;
sp->pp_seq[IDX_PAP] = 0;
sp->pp_rseq[IDX_PAP] = 0;
callout_init(&sp->ch[IDX_PAP],
(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
callout_init(&sp->pap_my_to_ch,
(SP2IFP(sp)->if_flags & IFF_NEEDSGIANT) ? 0 : CALLOUT_MPSAFE);
}
static void
sppp_pap_open(struct sppp *sp)
{
if (sp->hisauth.proto == PPP_PAP &&
(sp->lcp.opts & (1 << LCP_OPT_AUTH_PROTO)) != 0) {
/* we are authenticator for PAP, start our timer */
sp->rst_counter[IDX_PAP] = sp->lcp.max_configure;
sppp_cp_change_state(&pap, sp, STATE_REQ_SENT);
}
if (sp->myauth.proto == PPP_PAP) {
/* we are peer, send a request, and start a timer */
pap.scr(sp);
callout_reset(&sp->pap_my_to_ch, sp->lcp.timeout,
sppp_pap_my_TO, (void *)sp);
}
}
static void
sppp_pap_close(struct sppp *sp)
{
if (sp->state[IDX_PAP] != STATE_CLOSED)
sppp_cp_change_state(&pap, sp, STATE_CLOSED);
}
/*
* This is the timeout routine if we are the authenticator. Since the
* authenticator is basically passive in PAP, we can't do much here.
*/
static void
sppp_pap_TO(void *cookie)
{
struct sppp *sp = (struct sppp *)cookie;
STDDCL;
int s;
s = splimp();
SPPP_LOCK(sp);
if (debug)
log(LOG_DEBUG, SPP_FMT "pap TO(%s) rst_counter = %d\n",
SPP_ARGS(ifp),
sppp_state_name(sp->state[IDX_PAP]),
sp->rst_counter[IDX_PAP]);
if (--sp->rst_counter[IDX_PAP] < 0)
/* TO- event */
switch (sp->state[IDX_PAP]) {
case STATE_REQ_SENT:
pap.tld(sp);
sppp_cp_change_state(&pap, sp, STATE_CLOSED);
break;
}
else
/* TO+ event, not very much we could do */
switch (sp->state[IDX_PAP]) {
case STATE_REQ_SENT:
/* sppp_cp_change_state() will restart the timer */
sppp_cp_change_state(&pap, sp, STATE_REQ_SENT);
break;
}
SPPP_UNLOCK(sp);
splx(s);
}
/*
* This is the timeout handler if we are the peer. Since the peer is
* active, we need to retransmit our PAP request, which has apparently
* been lost.
* XXX We should impose a max counter.
*/
static void
sppp_pap_my_TO(void *cookie)
{
struct sppp *sp = (struct sppp *)cookie;
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "pap peer TO\n",
SPP_ARGS(ifp));
SPPP_LOCK(sp);
pap.scr(sp);
SPPP_UNLOCK(sp);
}
static void
sppp_pap_tlu(struct sppp *sp)
{
STDDCL;
int x;
sp->rst_counter[IDX_PAP] = sp->lcp.max_configure;
if (debug)
log(LOG_DEBUG, SPP_FMT "%s tlu\n",
SPP_ARGS(ifp), pap.name);
x = splimp();
SPPP_LOCK(sp);
/* indicate to LCP that we need to be closed down */
sp->lcp.protos |= (1 << IDX_PAP);
if (sp->pp_flags & PP_NEEDAUTH) {
/*
* Remote is authenticator, but his auth proto didn't
* complete yet. Defer the transition to network
* phase.
*/
SPPP_UNLOCK(sp);
splx(x);
return;
}
SPPP_UNLOCK(sp);
splx(x);
sppp_phase_network(sp);
}
static void
sppp_pap_tld(struct sppp *sp)
{
STDDCL;
if (debug)
log(LOG_DEBUG, SPP_FMT "pap tld\n", SPP_ARGS(ifp));
callout_stop (&sp->ch[IDX_PAP]);
callout_stop (&sp->pap_my_to_ch);
sp->lcp.protos &= ~(1 << IDX_PAP);
lcp.Close(sp);
}
static void
sppp_pap_scr(struct sppp *sp)
{
u_char idlen, pwdlen;
sp->confid[IDX_PAP] = ++sp->pp_seq[IDX_PAP];
pwdlen = sppp_strnlen(sp->myauth.secret, AUTHKEYLEN);
idlen = sppp_strnlen(sp->myauth.name, AUTHNAMELEN);
sppp_auth_send(&pap, sp, PAP_REQ, sp->confid[IDX_PAP],
sizeof idlen, (const char *)&idlen,
(size_t)idlen, sp->myauth.name,
sizeof pwdlen, (const char *)&pwdlen,
(size_t)pwdlen, sp->myauth.secret,
0);
}
/*
* Random miscellaneous functions.
*/
/*
* Send a PAP or CHAP proto packet.
*
* Variadic function; each of the elements for the ellipsis is of type
* ``size_t mlen, const u_char *msg''. Processing will stop iff
* mlen == 0.
* NOTE: never declare variadic functions with types subject to type
* promotion (i.e. u_char). This is asking for big trouble depending
* on the architecture you are on...
*/
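/*
 * Example call (cf. sppp_pap_scr() below); the argument list is a
 * sequence of (size_t length, pointer) pairs terminated by a zero
 * length:
 *
 * sppp_auth_send(&pap, sp, PAP_REQ, id,
 *     sizeof idlen, (const char *)&idlen,
 *     (size_t)idlen, sp->myauth.name,
 *     0);
 *
 * Each length is passed as size_t so that va_arg() reads the
 * unpromoted type the copy loop below expects.
 */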
static void
sppp_auth_send(const struct cp *cp, struct sppp *sp,
unsigned int type, unsigned int id,
...)
{
STDDCL;
struct ppp_header *h;
struct lcp_header *lh;
struct mbuf *m;
u_char *p;
int len;
unsigned int mlen;
const char *msg;
va_list ap;
MGETHDR (m, M_DONTWAIT, MT_DATA);
if (! m)
return;
m->m_pkthdr.rcvif = 0;
h = mtod (m, struct ppp_header*);
h->address = PPP_ALLSTATIONS; /* broadcast address */
h->control = PPP_UI; /* Unnumbered Info */
h->protocol = htons(cp->proto);
lh = (struct lcp_header*)(h + 1);
lh->type = type;
lh->ident = id;
p = (u_char*) (lh+1);
va_start(ap, id);
len = 0;
while ((mlen = (unsigned int)va_arg(ap, size_t)) != 0) {
msg = va_arg(ap, const char *);
len += mlen;
if (len > MHLEN - PPP_HEADER_LEN - LCP_HEADER_LEN) {
va_end(ap);
m_freem(m);
return;
}
bcopy(msg, p, mlen);
p += mlen;
}
va_end(ap);
m->m_pkthdr.len = m->m_len = PPP_HEADER_LEN + LCP_HEADER_LEN + len;
lh->len = htons (LCP_HEADER_LEN + len);
if (debug) {
log(LOG_DEBUG, SPP_FMT "%s output <%s id=0x%x len=%d",
SPP_ARGS(ifp), cp->name,
sppp_auth_type_name(cp->proto, lh->type),
lh->ident, ntohs(lh->len));
sppp_print_bytes((u_char*) (lh+1), len);
log(-1, ">\n");
}
if (! IF_HANDOFF_ADJ(&sp->pp_cpq, m, ifp, 3))
ifp->if_oerrors++;
}
/*
* Flush interface queue.
*/
static void
sppp_qflush(struct ifqueue *ifq)
{
struct mbuf *m, *n;
n = ifq->ifq_head;
while ((m = n)) {
n = m->m_act;
m_freem (m);
}
ifq->ifq_head = 0;
ifq->ifq_tail = 0;
ifq->ifq_len = 0;
}
/*
* Send keepalive packets every 10 seconds.
*/
static void
sppp_keepalive(void *dummy)
{
struct sppp *sp = (struct sppp*)dummy;
struct ifnet *ifp = SP2IFP(sp);
int s;
s = splimp();
SPPP_LOCK(sp);
/* Keepalive mode disabled or channel down? */
if (! (sp->pp_flags & PP_KEEPALIVE) ||
! (ifp->if_drv_flags & IFF_DRV_RUNNING))
goto out;
if (sp->pp_mode == PP_FR) {
sppp_fr_keepalive (sp);
goto out;
}
/* No keepalive in PPP mode if LCP not opened yet. */
if (sp->pp_mode != IFF_CISCO &&
sp->pp_phase < PHASE_AUTHENTICATE)
goto out;
if (sp->pp_alivecnt == MAXALIVECNT) {
/* No keepalive packets received. Stop the interface. */
printf (SPP_FMT "down\n", SPP_ARGS(ifp));
if_down (ifp);
sppp_qflush (&sp->pp_cpq);
if (sp->pp_mode != IFF_CISCO) {
/* XXX */
/* Shut down the PPP link. */
lcp.Down(sp);
/* Initiate negotiation. XXX */
lcp.Up(sp);
}
}
if (sp->pp_alivecnt <= MAXALIVECNT)
++sp->pp_alivecnt;
if (sp->pp_mode == IFF_CISCO)
sppp_cisco_send (sp, CISCO_KEEPALIVE_REQ,
++sp->pp_seq[IDX_LCP], sp->pp_rseq[IDX_LCP]);
else if (sp->pp_phase >= PHASE_AUTHENTICATE) {
long nmagic = htonl (sp->lcp.magic);
sp->lcp.echoid = ++sp->pp_seq[IDX_LCP];
sppp_cp_send (sp, PPP_LCP, ECHO_REQ,
sp->lcp.echoid, 4, &nmagic);
}
out:
SPPP_UNLOCK(sp);
splx(s);
callout_reset(&sp->keepalive_callout, hz * 10, sppp_keepalive,
(void *)sp);
}
/*
* Get both IP addresses.
*/
void
sppp_get_ip_addrs(struct sppp *sp, u_long *src, u_long *dst, u_long *srcmask)
{
struct ifnet *ifp = SP2IFP(sp);
struct ifaddr *ifa;
struct sockaddr_in *si, *sm;
u_long ssrc, ddst;
sm = NULL;
ssrc = ddst = 0L;
/*
* Pick the first AF_INET address from the list,
* aliases don't make any sense on a p2p link anyway.
*/
si = 0;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET) {
si = (struct sockaddr_in *)ifa->ifa_addr;
sm = (struct sockaddr_in *)ifa->ifa_netmask;
if (si)
break;
}
if (ifa) {
if (si && si->sin_addr.s_addr) {
ssrc = si->sin_addr.s_addr;
if (srcmask)
*srcmask = ntohl(sm->sin_addr.s_addr);
}
si = (struct sockaddr_in *)ifa->ifa_dstaddr;
if (si && si->sin_addr.s_addr)
ddst = si->sin_addr.s_addr;
}
if (dst) *dst = ntohl(ddst);
if (src) *src = ntohl(ssrc);
}
/*
* Set my IP address. Must be called at splimp.
*/
static void
sppp_set_ip_addr(struct sppp *sp, u_long src)
{
+ INIT_VNET_INET(curvnet);
STDDCL;
struct ifaddr *ifa;
struct sockaddr_in *si;
struct in_ifaddr *ia;
/*
* Pick the first AF_INET address from the list,
* aliases don't make any sense on a p2p link anyway.
*/
si = 0;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
{
if (ifa->ifa_addr->sa_family == AF_INET)
{
si = (struct sockaddr_in *)ifa->ifa_addr;
if (si)
break;
}
}
if (ifa && si)
{
int error;
/* delete old route */
error = rtinit(ifa, (int)RTM_DELETE, RTF_HOST);
if(debug && error)
{
log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit DEL failed, error=%d\n",
SPP_ARGS(ifp), error);
}
/* set new address */
si->sin_addr.s_addr = htonl(src);
ia = ifatoia(ifa);
LIST_REMOVE(ia, ia_hash);
LIST_INSERT_HEAD(INADDR_HASH(si->sin_addr.s_addr), ia, ia_hash);
/* add new route */
error = rtinit(ifa, (int)RTM_ADD, RTF_HOST);
if (debug && error)
{
log(LOG_DEBUG, SPP_FMT "sppp_set_ip_addr: rtinit ADD failed, error=%d",
SPP_ARGS(ifp), error);
}
}
}
#ifdef INET6
/*
* Get both IPv6 addresses.
*/
static void
sppp_get_ip6_addrs(struct sppp *sp, struct in6_addr *src, struct in6_addr *dst,
struct in6_addr *srcmask)
{
struct ifnet *ifp = SP2IFP(sp);
struct ifaddr *ifa;
struct sockaddr_in6 *si, *sm;
struct in6_addr ssrc, ddst;
sm = NULL;
bzero(&ssrc, sizeof(ssrc));
bzero(&ddst, sizeof(ddst));
/*
* Pick the first link-local AF_INET6 address from the list,
* aliases don't make any sense on a p2p link anyway.
*/
si = 0;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET6) {
si = (struct sockaddr_in6 *)ifa->ifa_addr;
sm = (struct sockaddr_in6 *)ifa->ifa_netmask;
if (si && IN6_IS_ADDR_LINKLOCAL(&si->sin6_addr))
break;
}
if (ifa) {
if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr)) {
bcopy(&si->sin6_addr, &ssrc, sizeof(ssrc));
if (srcmask) {
bcopy(&sm->sin6_addr, srcmask,
sizeof(*srcmask));
}
}
si = (struct sockaddr_in6 *)ifa->ifa_dstaddr;
if (si && !IN6_IS_ADDR_UNSPECIFIED(&si->sin6_addr))
bcopy(&si->sin6_addr, &ddst, sizeof(ddst));
}
if (dst)
bcopy(&ddst, dst, sizeof(*dst));
if (src)
bcopy(&ssrc, src, sizeof(*src));
}
#ifdef IPV6CP_MYIFID_DYN
/*
* Generate random ifid.
*/
static void
sppp_gen_ip6_addr(struct sppp *sp, struct in6_addr *addr)
{
/* TBD */
}
/*
* Set my IPv6 address. Must be called at splimp.
*/
static void
sppp_set_ip6_addr(struct sppp *sp, const struct in6_addr *src)
{
STDDCL;
struct ifaddr *ifa;
struct sockaddr_in6 *sin6;
/*
* Pick the first link-local AF_INET6 address from the list,
* aliases don't make any sense on a p2p link anyway.
*/
sin6 = NULL;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
{
if (ifa->ifa_addr->sa_family == AF_INET6)
{
sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
if (sin6 && IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
break;
}
}
if (ifa && sin6)
{
int error;
struct sockaddr_in6 new_sin6 = *sin6;
bcopy(src, &new_sin6.sin6_addr, sizeof(new_sin6.sin6_addr));
error = in6_ifinit(ifp, ifatoia6(ifa), &new_sin6, 1);
if (debug && error)
{
log(LOG_DEBUG, SPP_FMT "sppp_set_ip6_addr: in6_ifinit "
" failed, error=%d\n", SPP_ARGS(ifp), error);
}
}
}
#endif
/*
* Suggest a candidate address to be used by peer.
*/
static void
sppp_suggest_ip6_addr(struct sppp *sp, struct in6_addr *suggest)
{
struct in6_addr myaddr;
struct timeval tv;
sppp_get_ip6_addrs(sp, &myaddr, 0, 0);
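/*
 * Derive the suggestion from our own interface ID: force the EUI-64
 * u bit to "local" and perturb the low two bytes with the current
 * time so the suggested ifid differs from ours; the all-ones XOR
 * below covers the unlucky case where both time-derived bytes are
 * zero.
 */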
myaddr.s6_addr[8] &= ~0x02; /* u bit to "local" */
microtime(&tv);
if ((tv.tv_usec & 0xff) == 0 && (tv.tv_sec & 0xff) == 0) {
myaddr.s6_addr[14] ^= 0xff;
myaddr.s6_addr[15] ^= 0xff;
} else {
myaddr.s6_addr[14] ^= (tv.tv_usec & 0xff);
myaddr.s6_addr[15] ^= (tv.tv_sec & 0xff);
}
if (suggest)
bcopy(&myaddr, suggest, sizeof(myaddr));
}
#endif /*INET6*/
static int
sppp_params(struct sppp *sp, u_long cmd, void *data)
{
u_long subcmd;
struct ifreq *ifr = (struct ifreq *)data;
struct spppreq *spr;
int rv = 0;
if ((spr = malloc(sizeof(struct spppreq), M_TEMP, M_NOWAIT)) == 0)
return (EAGAIN);
/*
* ifr->ifr_data is supposed to point to a struct spppreq.
* Check the cmd word first before attempting to fetch all the
* data.
*/
if ((subcmd = fuword(ifr->ifr_data)) == -1) {
rv = EFAULT;
goto quit;
}
if (copyin((caddr_t)ifr->ifr_data, spr, sizeof(struct spppreq)) != 0) {
rv = EFAULT;
goto quit;
}
switch (subcmd) {
case (u_long)SPPPIOGDEFS:
if (cmd != SIOCGIFGENERIC) {
rv = EINVAL;
break;
}
/*
* We copy over the entire current state, but clean
* out some of the stuff we don't want to pass up.
* Remember, SIOCGIFGENERIC is unprotected, and can be
* called by any user. No need to ever get PAP or
* CHAP secrets back to userland anyway.
*/
spr->defs.pp_phase = sp->pp_phase;
spr->defs.enable_vj = (sp->confflags & CONF_ENABLE_VJ) != 0;
spr->defs.enable_ipv6 = (sp->confflags & CONF_ENABLE_IPV6) != 0;
spr->defs.lcp = sp->lcp;
spr->defs.ipcp = sp->ipcp;
spr->defs.ipv6cp = sp->ipv6cp;
spr->defs.myauth = sp->myauth;
spr->defs.hisauth = sp->hisauth;
bzero(spr->defs.myauth.secret, AUTHKEYLEN);
bzero(spr->defs.myauth.challenge, AUTHKEYLEN);
bzero(spr->defs.hisauth.secret, AUTHKEYLEN);
bzero(spr->defs.hisauth.challenge, AUTHKEYLEN);
/*
* Fixup the LCP timeout value to milliseconds so
* spppcontrol doesn't need to bother about the value
* of "hz". We do the reverse calculation below when
* setting it.
*/
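/*
 * E.g. with hz = 100, a stored timeout of 50 ticks is reported
 * as 50 * 1000 / 100 = 500 ms; setting 500 ms below yields
 * 500 * 100 / 1000 = 50 ticks again.
 */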
spr->defs.lcp.timeout = sp->lcp.timeout * 1000 / hz;
rv = copyout(spr, (caddr_t)ifr->ifr_data,
sizeof(struct spppreq));
break;
case (u_long)SPPPIOSDEFS:
if (cmd != SIOCSIFGENERIC) {
rv = EINVAL;
break;
}
/*
 * We have a very specific idea of which fields we allow to be
 * passed back from userland, so as not to clobber our current
 * state. For one, we only allow setting anything if LCP is in the
 * dead or establish phase. Once authentication negotiation has
 * started, the authentication settings must not be changed again.
 * (The administrator can force an ifconfig down in order to get
 * LCP back into the dead phase.)
 *
 * Also, we only allow authentication parameters to be specified.
 *
 * XXX Should allow setting or clearing pp_flags.
 *
 * Finally, if the respective authentication protocol to be used
 * is set to a nonzero value but the secret is passed as all
 * zeros, we don't trash the existing secret. This allows an
 * administrator to change the system name alone without
 * clobbering the secret (which was not returned by a previous
 * SPPPIOGDEFS call). However, the secrets are cleared if the
 * authentication protocol is reset to 0.
 */
if (sp->pp_phase != PHASE_DEAD &&
sp->pp_phase != PHASE_ESTABLISH) {
rv = EBUSY;
break;
}
if ((spr->defs.myauth.proto != 0 && spr->defs.myauth.proto != PPP_PAP &&
spr->defs.myauth.proto != PPP_CHAP) ||
(spr->defs.hisauth.proto != 0 && spr->defs.hisauth.proto != PPP_PAP &&
spr->defs.hisauth.proto != PPP_CHAP)) {
rv = EINVAL;
break;
}
if (spr->defs.myauth.proto == 0)
/* resetting myauth */
bzero(&sp->myauth, sizeof sp->myauth);
else {
/* setting/changing myauth */
sp->myauth.proto = spr->defs.myauth.proto;
bcopy(spr->defs.myauth.name, sp->myauth.name, AUTHNAMELEN);
if (spr->defs.myauth.secret[0] != '\0')
bcopy(spr->defs.myauth.secret, sp->myauth.secret,
AUTHKEYLEN);
}
if (spr->defs.hisauth.proto == 0)
/* resetting hisauth */
bzero(&sp->hisauth, sizeof sp->hisauth);
else {
/* setting/changing hisauth */
sp->hisauth.proto = spr->defs.hisauth.proto;
sp->hisauth.flags = spr->defs.hisauth.flags;
bcopy(spr->defs.hisauth.name, sp->hisauth.name, AUTHNAMELEN);
if (spr->defs.hisauth.secret[0] != '\0')
bcopy(spr->defs.hisauth.secret, sp->hisauth.secret,
AUTHKEYLEN);
}
/* set LCP restart timer timeout */
if (spr->defs.lcp.timeout != 0)
sp->lcp.timeout = spr->defs.lcp.timeout * hz / 1000;
/* set VJ enable and IPv6 disable flags */
#ifdef INET
if (spr->defs.enable_vj)
sp->confflags |= CONF_ENABLE_VJ;
else
sp->confflags &= ~CONF_ENABLE_VJ;
#endif
#ifdef INET6
if (spr->defs.enable_ipv6)
sp->confflags |= CONF_ENABLE_IPV6;
else
sp->confflags &= ~CONF_ENABLE_IPV6;
#endif
break;
default:
rv = EINVAL;
}
quit:
free(spr, M_TEMP);
return (rv);
}
static void
sppp_phase_network(struct sppp *sp)
{
STDDCL;
int i;
u_long mask;
sp->pp_phase = PHASE_NETWORK;
if (debug)
log(LOG_DEBUG, SPP_FMT "phase %s\n", SPP_ARGS(ifp),
sppp_phase_name(sp->pp_phase));
/* Notify NCPs now. */
for (i = 0; i < IDX_COUNT; i++)
if ((cps[i])->flags & CP_NCP)
(cps[i])->Open(sp);
/* Send Up events to all NCPs. */
for (i = 0, mask = 1; i < IDX_COUNT; i++, mask <<= 1)
if ((sp->lcp.protos & mask) && ((cps[i])->flags & CP_NCP))
(cps[i])->Up(sp);
/* if no NCP is starting, all this was in vain, close down */
sppp_lcp_check_and_close(sp);
}
static const char *
sppp_cp_type_name(u_char type)
{
static char buf[12];
switch (type) {
case CONF_REQ: return "conf-req";
case CONF_ACK: return "conf-ack";
case CONF_NAK: return "conf-nak";
case CONF_REJ: return "conf-rej";
case TERM_REQ: return "term-req";
case TERM_ACK: return "term-ack";
case CODE_REJ: return "code-rej";
case PROTO_REJ: return "proto-rej";
case ECHO_REQ: return "echo-req";
case ECHO_REPLY: return "echo-reply";
case DISC_REQ: return "discard-req";
}
snprintf (buf, sizeof(buf), "cp/0x%x", type);
return buf;
}
static const char *
sppp_auth_type_name(u_short proto, u_char type)
{
static char buf[12];
switch (proto) {
case PPP_CHAP:
switch (type) {
case CHAP_CHALLENGE: return "challenge";
case CHAP_RESPONSE: return "response";
case CHAP_SUCCESS: return "success";
case CHAP_FAILURE: return "failure";
}
break;
case PPP_PAP:
switch (type) {
case PAP_REQ: return "req";
case PAP_ACK: return "ack";
case PAP_NAK: return "nak";
}
}
snprintf (buf, sizeof(buf), "auth/0x%x", type);
return buf;
}
static const char *
sppp_lcp_opt_name(u_char opt)
{
static char buf[12];
switch (opt) {
case LCP_OPT_MRU: return "mru";
case LCP_OPT_ASYNC_MAP: return "async-map";
case LCP_OPT_AUTH_PROTO: return "auth-proto";
case LCP_OPT_QUAL_PROTO: return "qual-proto";
case LCP_OPT_MAGIC: return "magic";
case LCP_OPT_PROTO_COMP: return "proto-comp";
case LCP_OPT_ADDR_COMP: return "addr-comp";
}
snprintf (buf, sizeof(buf), "lcp/0x%x", opt);
return buf;
}
static const char *
sppp_ipcp_opt_name(u_char opt)
{
static char buf[12];
switch (opt) {
case IPCP_OPT_ADDRESSES: return "addresses";
case IPCP_OPT_COMPRESSION: return "compression";
case IPCP_OPT_ADDRESS: return "address";
}
snprintf (buf, sizeof(buf), "ipcp/0x%x", opt);
return buf;
}
#ifdef INET6
static const char *
sppp_ipv6cp_opt_name(u_char opt)
{
static char buf[12];
switch (opt) {
case IPV6CP_OPT_IFID: return "ifid";
case IPV6CP_OPT_COMPRESSION: return "compression";
}
snprintf (buf, sizeof(buf), "0x%x", opt);
return buf;
}
#endif
static const char *
sppp_state_name(int state)
{
switch (state) {
case STATE_INITIAL: return "initial";
case STATE_STARTING: return "starting";
case STATE_CLOSED: return "closed";
case STATE_STOPPED: return "stopped";
case STATE_CLOSING: return "closing";
case STATE_STOPPING: return "stopping";
case STATE_REQ_SENT: return "req-sent";
case STATE_ACK_RCVD: return "ack-rcvd";
case STATE_ACK_SENT: return "ack-sent";
case STATE_OPENED: return "opened";
}
return "illegal";
}
static const char *
sppp_phase_name(enum ppp_phase phase)
{
switch (phase) {
case PHASE_DEAD: return "dead";
case PHASE_ESTABLISH: return "establish";
case PHASE_TERMINATE: return "terminate";
case PHASE_AUTHENTICATE: return "authenticate";
case PHASE_NETWORK: return "network";
}
return "illegal";
}
static const char *
sppp_proto_name(u_short proto)
{
static char buf[12];
switch (proto) {
case PPP_LCP: return "lcp";
case PPP_IPCP: return "ipcp";
case PPP_PAP: return "pap";
case PPP_CHAP: return "chap";
case PPP_IPV6CP: return "ipv6cp";
}
snprintf(buf, sizeof(buf), "proto/0x%x", (unsigned)proto);
return buf;
}
static void
sppp_print_bytes(const u_char *p, u_short len)
{
if (len)
log(-1, " %*D", len, p, "-");
}
static void
sppp_print_string(const char *p, u_short len)
{
u_char c;
while (len-- > 0) {
c = *p++;
/*
* Print only ASCII chars directly. RFC 1994 recommends
* using only them, but we don't rely on it. */
if (c < ' ' || c > '~')
log(-1, "\\x%x", c);
else
log(-1, "%c", c);
}
}
static const char *
sppp_dotted_quad(u_long addr)
{
static char s[16];
sprintf(s, "%d.%d.%d.%d",
(int)((addr >> 24) & 0xff),
(int)((addr >> 16) & 0xff),
(int)((addr >> 8) & 0xff),
(int)(addr & 0xff));
return s;
}
static int
sppp_strnlen(u_char *p, int max)
{
int len;
for (len = 0; len < max && *p; ++p)
++len;
return len;
}
/* a dummy, used to drop uninteresting events */
static void
sppp_null(struct sppp *unused)
{
/* do just nothing */
}
Index: head/sys/net/if_stf.c
===================================================================
--- head/sys/net/if_stf.c (revision 183549)
+++ head/sys/net/if_stf.c (revision 183550)
@@ -1,834 +1,836 @@
/* $FreeBSD$ */
/* $KAME: if_stf.c,v 1.73 2001/12/03 11:08:30 keiichi Exp $ */
/*-
* Copyright (C) 2000 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* 6to4 interface, based on RFC3056.
*
* 6to4 interface is NOT capable of link-layer (I mean, IPv4) multicasting.
* There is no address mapping defined from IPv6 multicast address to IPv4
* address. Therefore, we do not have IFF_MULTICAST on the interface.
*
* Due to the lack of address mapping for link-local addresses, we cannot
* throw packets toward link-local addresses (fe80::x). Also, we cannot throw
* packets to link-local multicast addresses (ff02::x).
*
* Here are interesting symptoms due to the lack of link-local address:
*
* Unicast routing exchange:
* - RIPng: Impossible. Uses link-local multicast packet toward ff02::9,
* and link-local addresses as nexthop.
* - OSPFv6: Impossible. OSPFv6 assumes that there's link-local address
* assigned to the link, and makes use of them. Also, HELLO packets use
* link-local multicast addresses (ff02::5 and ff02::6).
* - BGP4+: Maybe. You can only use global address as nexthop, and global
* address as TCP endpoint address.
*
* Multicast routing protocols:
* - PIM: Hello packet cannot be used to discover adjacent PIM routers.
* Adjacent PIM routers must be configured manually (is that really the
* spec-wise correct thing to do?).
*
* ICMPv6:
* - Redirects cannot be used due to the lack of link-local address.
*
* stf interface does not have, and will not need, a link-local address.
* It seems to have no real benefit and does not help the above symptoms much.
* Even if we assign link-locals to the interface, we cannot really
* use link-local unicast/multicast on top of 6to4 cloud (since there's no
* encapsulation defined for link-local address), and the above analysis does
* not change. RFC3056 does not mandate the assignment of link-local address
* either.
*
* 6to4 interface has security issues. Refer to
* http://playground.iijlab.net/i-d/draft-itojun-ipv6-transition-abuse-00.txt
* for details. The code tries to filter out some of the malicious packets.
* Note that there is no way to be 100% secure.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#include <sys/malloc.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/if_types.h>
#include <net/if_stf.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip_encap.h>
#include <machine/stdarg.h>
#include <net/bpf.h>
#include <security/mac/mac_framework.h>
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_STF, stf, CTLFLAG_RW, 0, "6to4 Interface");
static int stf_route_cache = 1;
SYSCTL_INT(_net_link_stf, OID_AUTO, route_cache, CTLFLAG_RW,
&stf_route_cache, 0, "Caching of IPv4 routes for 6to4 Output");
#define STFNAME "stf"
#define STFUNIT 0
#define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002)
/*
* XXX: Returns a pointer that is only 16-bit aligned. Don't cast it to
* struct in_addr *; use bcopy() instead.
*/
#define GET_V4(x) ((caddr_t)(&(x)->s6_addr16[1]))
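/*
 * Layout note (RFC3056): a 6to4 address is 2002:V4ADDR::/48, i.e.
 * s6_addr16[0] holds the 0x2002 prefix and s6_addr16[1..2] hold the
 * embedded IPv4 address, which is the spot GET_V4() points at.
 */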
struct stf_softc {
struct ifnet *sc_ifp;
union {
struct route __sc_ro4;
struct route_in6 __sc_ro6; /* just for safety */
} __sc_ro46;
#define sc_ro __sc_ro46.__sc_ro4
struct mtx sc_ro_mtx;
u_int sc_fibnum;
const struct encaptab *encap_cookie;
};
#define STF2IFP(sc) ((sc)->sc_ifp)
/*
* Note that mutable fields in the softc are not currently locked.
* We do lock sc_ro in stf_output though.
*/
static MALLOC_DEFINE(M_STF, STFNAME, "6to4 Tunnel Interface");
static const int ip_stf_ttl = 40;
extern struct domain inetdomain;
struct protosw in_stf_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_IPV6,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = in_stf_input,
.pr_output = (pr_output_t *)rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
static char *stfnames[] = {"stf0", "stf", "6to4", NULL};
static int stfmodevent(module_t, int, void *);
static int stf_encapcheck(const struct mbuf *, int, int, void *);
static struct in6_ifaddr *stf_getsrcifa6(struct ifnet *);
static int stf_output(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
static int isrfc1918addr(struct in_addr *);
static int stf_checkaddr4(struct stf_softc *, struct in_addr *,
struct ifnet *);
static int stf_checkaddr6(struct stf_softc *, struct in6_addr *,
struct ifnet *);
static void stf_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int stf_ioctl(struct ifnet *, u_long, caddr_t);
static int stf_clone_match(struct if_clone *, const char *);
static int stf_clone_create(struct if_clone *, char *, size_t, caddr_t);
static int stf_clone_destroy(struct if_clone *, struct ifnet *);
struct if_clone stf_cloner = IFC_CLONE_INITIALIZER(STFNAME, NULL, 0,
NULL, stf_clone_match, stf_clone_create, stf_clone_destroy);
static int
stf_clone_match(struct if_clone *ifc, const char *name)
{
int i;
for(i = 0; stfnames[i] != NULL; i++) {
if (strcmp(stfnames[i], name) == 0)
return (1);
}
return (0);
}
static int
stf_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
{
int err, unit;
struct stf_softc *sc;
struct ifnet *ifp;
/*
* We can only have one unit, but since unit allocation is
* already locked, we use it to keep from allocating extra
* interfaces.
*/
unit = STFUNIT;
err = ifc_alloc_unit(ifc, &unit);
if (err != 0)
return (err);
sc = malloc(sizeof(struct stf_softc), M_STF, M_WAITOK | M_ZERO);
ifp = STF2IFP(sc) = if_alloc(IFT_STF);
if (ifp == NULL) {
free(sc, M_STF);
ifc_free_unit(ifc, unit);
return (ENOSPC);
}
ifp->if_softc = sc;
sc->sc_fibnum = curthread->td_proc->p_fibnum;
/*
* Set the name manually rather than using if_initname because
* we don't conform to the default naming convention for interfaces.
*/
strlcpy(ifp->if_xname, name, IFNAMSIZ);
ifp->if_dname = ifc->ifc_name;
ifp->if_dunit = IF_DUNIT_NONE;
mtx_init(&(sc)->sc_ro_mtx, "stf ro", NULL, MTX_DEF);
sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6,
stf_encapcheck, &in_stf_protosw, sc);
if (sc->encap_cookie == NULL) {
if_printf(ifp, "attach failed\n");
free(sc, M_STF);
ifc_free_unit(ifc, unit);
return (ENOMEM);
}
ifp->if_mtu = IPV6_MMTU;
ifp->if_ioctl = stf_ioctl;
ifp->if_output = stf_output;
ifp->if_snd.ifq_maxlen = IFQ_MAXLEN;
if_attach(ifp);
bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
return (0);
}
static int
stf_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
{
struct stf_softc *sc = ifp->if_softc;
int err;
err = encap_detach(sc->encap_cookie);
KASSERT(err == 0, ("Unexpected error detaching encap_cookie"));
mtx_destroy(&(sc)->sc_ro_mtx);
bpfdetach(ifp);
if_detach(ifp);
if_free(ifp);
free(sc, M_STF);
ifc_free_unit(ifc, STFUNIT);
return (0);
}
static int
stfmodevent(mod, type, data)
module_t mod;
int type;
void *data;
{
switch (type) {
case MOD_LOAD:
if_clone_attach(&stf_cloner);
break;
case MOD_UNLOAD:
if_clone_detach(&stf_cloner);
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t stf_mod = {
"if_stf",
stfmodevent,
0
};
DECLARE_MODULE(if_stf, stf_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
static int
stf_encapcheck(m, off, proto, arg)
const struct mbuf *m;
int off;
int proto;
void *arg;
{
struct ip ip;
struct in6_ifaddr *ia6;
struct stf_softc *sc;
struct in_addr a, b, mask;
sc = (struct stf_softc *)arg;
if (sc == NULL)
return 0;
if ((STF2IFP(sc)->if_flags & IFF_UP) == 0)
return 0;
/* IFF_LINK0 means "no decapsulation" */
if ((STF2IFP(sc)->if_flags & IFF_LINK0) != 0)
return 0;
if (proto != IPPROTO_IPV6)
return 0;
/* LINTED const cast */
m_copydata((struct mbuf *)(uintptr_t)m, 0, sizeof(ip), (caddr_t)&ip);
if (ip.ip_v != 4)
return 0;
ia6 = stf_getsrcifa6(STF2IFP(sc));
if (ia6 == NULL)
return 0;
/*
* check if IPv4 dst matches the IPv4 address derived from the
* local 6to4 address.
* success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:...
*/
if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst,
sizeof(ip.ip_dst)) != 0)
return 0;
/*
* check if IPv4 src matches the IPv4 address derived from the
* local 6to4 address masked by prefixmask.
* success on: src = 10.1.1.1, ia6->ia_addr = 2002:0a00:.../24
* fail on: src = 10.1.1.1, ia6->ia_addr = 2002:0b00:.../24
*/
bzero(&a, sizeof(a));
bcopy(GET_V4(&ia6->ia_addr.sin6_addr), &a, sizeof(a));
bcopy(GET_V4(&ia6->ia_prefixmask.sin6_addr), &mask, sizeof(mask));
a.s_addr &= mask.s_addr;
b = ip.ip_src;
b.s_addr &= mask.s_addr;
if (a.s_addr != b.s_addr)
return 0;
/* stf interface makes single side match only */
return 32;
}
static struct in6_ifaddr *
stf_getsrcifa6(ifp)
struct ifnet *ifp;
{
+ INIT_VNET_INET(ifp->if_vnet);
struct ifaddr *ia;
struct in_ifaddr *ia4;
struct sockaddr_in6 *sin6;
struct in_addr in;
TAILQ_FOREACH(ia, &ifp->if_addrlist, ifa_list) {
if (ia->ifa_addr->sa_family != AF_INET6)
continue;
sin6 = (struct sockaddr_in6 *)ia->ifa_addr;
if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr))
continue;
bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in));
LIST_FOREACH(ia4, INADDR_HASH(in.s_addr), ia_hash)
if (ia4->ia_addr.sin_addr.s_addr == in.s_addr)
break;
if (ia4 == NULL)
continue;
return (struct in6_ifaddr *)ia;
}
return NULL;
}
static int
stf_output(ifp, m, dst, rt)
struct ifnet *ifp;
struct mbuf *m;
struct sockaddr *dst;
struct rtentry *rt;
{
struct stf_softc *sc;
struct sockaddr_in6 *dst6;
struct route *cached_route;
struct in_addr in4;
caddr_t ptr;
struct sockaddr_in *dst4;
u_int8_t tos;
struct ip *ip;
struct ip6_hdr *ip6;
struct in6_ifaddr *ia6;
u_int32_t af;
int error;
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m);
if (error) {
m_freem(m);
return (error);
}
#endif
sc = ifp->if_softc;
dst6 = (struct sockaddr_in6 *)dst;
/* just in case */
if ((ifp->if_flags & IFF_UP) == 0) {
m_freem(m);
ifp->if_oerrors++;
return ENETDOWN;
}
/*
* If we don't have an ip4 address that matches my inner ip6 address,
* we shouldn't generate output. Without this check, we'll end up
* using the wrong IPv4 source.
*/
ia6 = stf_getsrcifa6(ifp);
if (ia6 == NULL) {
m_freem(m);
ifp->if_oerrors++;
return ENETDOWN;
}
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (!m) {
ifp->if_oerrors++;
return ENOBUFS;
}
}
ip6 = mtod(m, struct ip6_hdr *);
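/*
 * ip6_flow holds the first 32 bits of the IPv6 header: version (4
 * bits), traffic class (8), flow label (20); ">> 20 & 0xff" thus
 * extracts the traffic-class/TOS byte.
 */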
tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
/*
* BPF writes need to be handled specially.
* This is a null operation, nothing here checks dst->sa_family.
*/
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
/*
* Pick the right outer dst addr from the list of candidates.
* ip6_dst has priority as it may be able to give us shorter IPv4 hops.
*/
ptr = NULL;
if (IN6_IS_ADDR_6TO4(&ip6->ip6_dst))
ptr = GET_V4(&ip6->ip6_dst);
else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr))
ptr = GET_V4(&dst6->sin6_addr);
else {
m_freem(m);
ifp->if_oerrors++;
return ENETUNREACH;
}
bcopy(ptr, &in4, sizeof(in4));
if (bpf_peers_present(ifp->if_bpf)) {
/*
* We need to prepend the address family as
* a four byte field. Cons up a dummy header
* to pacify bpf. This is safe because bpf
* will only read from the mbuf (i.e., it won't
* try to free it or keep a pointer to it).
*/
af = AF_INET6;
bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m);
}
M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
if (m && m->m_len < sizeof(struct ip))
m = m_pullup(m, sizeof(struct ip));
if (m == NULL) {
ifp->if_oerrors++;
return ENOBUFS;
}
ip = mtod(m, struct ip *);
bzero(ip, sizeof(*ip));
bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr),
&ip->ip_src, sizeof(ip->ip_src));
bcopy(&in4, &ip->ip_dst, sizeof(ip->ip_dst));
ip->ip_p = IPPROTO_IPV6;
ip->ip_ttl = ip_stf_ttl;
ip->ip_len = m->m_pkthdr.len; /*host order*/
if (ifp->if_flags & IFF_LINK1)
ip_ecn_ingress(ECN_ALLOWED, &ip->ip_tos, &tos);
else
ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos);
if (!stf_route_cache) {
cached_route = NULL;
goto sendit;
}
/*
* Do we have a cached route?
*/
mtx_lock(&(sc)->sc_ro_mtx);
dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst;
if (dst4->sin_family != AF_INET ||
bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) {
/* cache route doesn't match */
dst4->sin_family = AF_INET;
dst4->sin_len = sizeof(struct sockaddr_in);
bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr));
if (sc->sc_ro.ro_rt) {
RTFREE(sc->sc_ro.ro_rt);
sc->sc_ro.ro_rt = NULL;
}
}
if (sc->sc_ro.ro_rt == NULL) {
rtalloc_fib(&sc->sc_ro, sc->sc_fibnum);
if (sc->sc_ro.ro_rt == NULL) {
m_freem(m);
mtx_unlock(&(sc)->sc_ro_mtx);
ifp->if_oerrors++;
return ENETUNREACH;
}
}
cached_route = &sc->sc_ro;
sendit:
M_SETFIB(m, sc->sc_fibnum);
ifp->if_opackets++;
error = ip_output(m, NULL, cached_route, 0, NULL, NULL);
if (cached_route != NULL)
mtx_unlock(&(sc)->sc_ro_mtx);
return error;
}
static int
isrfc1918addr(in)
struct in_addr *in;
{
/*
* Returns 1 if the address is in an RFC1918 private range:
* 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16
*/
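/*
 * Worked example for the 172.16/12 test below: 172.20.1.2 is
 * 0xac140102 in host order; & 0xfff00000 keeps the top 12 bits
 * (0xac100000), and >> 16 gives 0xac10 == 172 * 256 + 16, covering
 * the whole 172.16.0.0 - 172.31.255.255 block.
 */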
if ((ntohl(in->s_addr) & 0xff000000) >> 24 == 10 ||
(ntohl(in->s_addr) & 0xfff00000) >> 16 == 172 * 256 + 16 ||
(ntohl(in->s_addr) & 0xffff0000) >> 16 == 192 * 256 + 168)
return 1;
return 0;
}
static int
stf_checkaddr4(sc, in, inifp)
struct stf_softc *sc;
struct in_addr *in;
struct ifnet *inifp; /* incoming interface */
{
+ INIT_VNET_INET(curvnet);
struct in_ifaddr *ia4;
/*
* reject packets with the following address:
* 224.0.0.0/4 0.0.0.0/8 127.0.0.0/8 255.0.0.0/8
*/
if (IN_MULTICAST(ntohl(in->s_addr)))
return -1;
switch ((ntohl(in->s_addr) & 0xff000000) >> 24) {
case 0: case 127: case 255:
return -1;
}
/*
* reject packets with private address range.
* (requirement from RFC3056 section 2 1st paragraph)
*/
if (isrfc1918addr(in))
return -1;
/*
* reject packets with a broadcast address
*/
for (ia4 = TAILQ_FIRST(&V_in_ifaddrhead);
ia4;
ia4 = TAILQ_NEXT(ia4, ia_link))
{
if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
continue;
if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr)
return -1;
}
/*
* perform ingress filter
*/
if (sc && (STF2IFP(sc)->if_flags & IFF_LINK2) == 0 && inifp) {
struct sockaddr_in sin;
struct rtentry *rt;
bzero(&sin, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_addr = *in;
rt = rtalloc1_fib((struct sockaddr *)&sin, 0,
0UL, sc->sc_fibnum);
if (!rt || rt->rt_ifp != inifp) {
#if 0
log(LOG_WARNING, "%s: packet from 0x%x dropped "
"due to ingress filter\n", if_name(STF2IFP(sc)),
(u_int32_t)ntohl(sin.sin_addr.s_addr));
#endif
if (rt)
RTFREE_LOCKED(rt);
return -1;
}
RTFREE_LOCKED(rt);
}
return 0;
}
static int
stf_checkaddr6(sc, in6, inifp)
struct stf_softc *sc;
struct in6_addr *in6;
struct ifnet *inifp; /* incoming interface */
{
/*
* check 6to4 addresses
*/
if (IN6_IS_ADDR_6TO4(in6)) {
struct in_addr in4;
bcopy(GET_V4(in6), &in4, sizeof(in4));
return stf_checkaddr4(sc, &in4, inifp);
}
/*
* Reject anything that looks suspicious. The test is implemented
* in ip6_input too, but we check here as well to
* (1) reject bad packets earlier, and
* (2) be safe against future ip6_input changes.
*/
if (IN6_IS_ADDR_V4COMPAT(in6) || IN6_IS_ADDR_V4MAPPED(in6))
return -1;
return 0;
}
void
in_stf_input(m, off)
struct mbuf *m;
int off;
{
int proto;
struct stf_softc *sc;
struct ip *ip;
struct ip6_hdr *ip6;
u_int8_t otos, itos;
struct ifnet *ifp;
proto = mtod(m, struct ip *)->ip_p;
if (proto != IPPROTO_IPV6) {
m_freem(m);
return;
}
ip = mtod(m, struct ip *);
sc = (struct stf_softc *)encap_getarg(m);
if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) {
m_freem(m);
return;
}
ifp = STF2IFP(sc);
#ifdef MAC
mac_ifnet_create_mbuf(ifp, m);
#endif
/*
* perform sanity check against outer src/dst.
* for source, perform ingress filter as well.
*/
if (stf_checkaddr4(sc, &ip->ip_dst, NULL) < 0 ||
stf_checkaddr4(sc, &ip->ip_src, m->m_pkthdr.rcvif) < 0) {
m_freem(m);
return;
}
otos = ip->ip_tos;
m_adj(m, off);
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (!m)
return;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* perform sanity check against inner src/dst.
* for source, perform ingress filter as well.
*/
if (stf_checkaddr6(sc, &ip6->ip6_dst, NULL) < 0 ||
stf_checkaddr6(sc, &ip6->ip6_src, m->m_pkthdr.rcvif) < 0) {
m_freem(m);
return;
}
itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
if ((ifp->if_flags & IFF_LINK1) != 0)
ip_ecn_egress(ECN_ALLOWED, &otos, &itos);
else
ip_ecn_egress(ECN_NOCARE, &otos, &itos);
ip6->ip6_flow &= ~htonl(0xff << 20);
ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
m->m_pkthdr.rcvif = ifp;
if (bpf_peers_present(ifp->if_bpf)) {
/*
* We need to prepend the address family as
* a four byte field. Cons up a dummy header
* to pacify bpf. This is safe because bpf
* will only read from the mbuf (i.e., it won't
* try to free it or keep a pointer to it).
*/
u_int32_t af = AF_INET6;
bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m);
}
/*
* Put the packet to the network layer input queue according to the
* specified address family.
* See net/if_gif.c for possible issues with packet processing
* reorder due to extra queueing.
*/
ifp->if_ipackets++;
ifp->if_ibytes += m->m_pkthdr.len;
netisr_dispatch(NETISR_IPV6, m);
}
/* ARGSUSED */
static void
stf_rtrequest(cmd, rt, info)
int cmd;
struct rtentry *rt;
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
rt->rt_rmx.rmx_mtu = IPV6_MMTU;
}
static int
stf_ioctl(ifp, cmd, data)
struct ifnet *ifp;
u_long cmd;
caddr_t data;
{
struct ifaddr *ifa;
struct ifreq *ifr;
struct sockaddr_in6 *sin6;
struct in_addr addr;
int error;
error = 0;
switch (cmd) {
case SIOCSIFADDR:
ifa = (struct ifaddr *)data;
if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) {
error = EAFNOSUPPORT;
break;
}
sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) {
error = EINVAL;
break;
}
bcopy(GET_V4(&sin6->sin6_addr), &addr, sizeof(addr));
if (isrfc1918addr(&addr)) {
error = EINVAL;
break;
}
ifa->ifa_rtrequest = stf_rtrequest;
ifp->if_flags |= IFF_UP;
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
ifr = (struct ifreq *)data;
if (ifr && ifr->ifr_addr.sa_family == AF_INET6)
;
else
error = EAFNOSUPPORT;
break;
default:
error = EINVAL;
break;
}
return error;
}
Index: head/sys/net/if_tun.c
===================================================================
--- head/sys/net/if_tun.c (revision 183549)
+++ head/sys/net/if_tun.c (revision 183550)
@@ -1,1056 +1,1065 @@
/* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */
/*-
* Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
* Nottingham University 1987.
*
* This source may be freely distributed, however I would be interested
* in any changes that are made.
*
* This driver takes packets off the IP i/f and hands them up to a
* user process to have its wicked way with. This driver has its
* roots in a similar driver written by Phil Cockcroft (formerly) at
* UCL. This driver is based much more on read/write/poll mode of
* operation though.
*
* $FreeBSD$
*/
#include "opt_atalk.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipx.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/sockio.h>
#include <sys/ttycom.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/random.h>
+#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#endif
#include <net/bpf.h>
#include <net/if_tun.h>
#include <sys/queue.h>
#include <security/mac/mac_framework.h>
/*
* tun_list is protected by global tunmtx. Other mutable fields are
* protected by tun->tun_mtx, or by their owning subsystem. tun_dev is
* static for the duration of a tunnel interface.
*/
struct tun_softc {
TAILQ_ENTRY(tun_softc) tun_list;
struct cdev *tun_dev;
u_short tun_flags; /* misc flags */
#define TUN_OPEN 0x0001
#define TUN_INITED 0x0002
#define TUN_RCOLL 0x0004
#define TUN_IASET 0x0008
#define TUN_DSTADDR 0x0010
#define TUN_LMODE 0x0020
#define TUN_RWAIT 0x0040
#define TUN_ASYNC 0x0080
#define TUN_IFHEAD 0x0100
#define TUN_READY (TUN_OPEN | TUN_INITED)
/*
* XXXRW: tun_pid is used to exclusively lock /dev/tun. Is this
* actually needed? Can we just return EBUSY if already open?
* Problem is that this involves inherent races when a tun device
* is handed off from one process to another, as opposed to just
* being slightly stale informationally.
*/
pid_t tun_pid; /* owning pid */
struct ifnet *tun_ifp; /* the interface */
struct sigio *tun_sigio; /* information for async I/O */
struct selinfo tun_rsel; /* read select */
struct mtx tun_mtx; /* protect mutable softc fields */
};
#define TUN2IFP(sc) ((sc)->tun_ifp)
#define TUNDEBUG if (tundebug) if_printf
#define TUNNAME "tun"
/*
* All mutable global variables in if_tun are locked using tunmtx, with
* the exception of tundebug, which is used unlocked, and tunclones,
* which is static after setup.
*/
static struct mtx tunmtx;
static MALLOC_DEFINE(M_TUN, TUNNAME, "Tunnel Interface");
static int tundebug = 0;
static int tundclone = 1;
static struct clonedevs *tunclones;
static TAILQ_HEAD(,tun_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0,
"IP tunnel software network interface.");
SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RW, &tundclone, 0,
"Enable legacy devfs interface creation.");
TUNABLE_INT("net.link.tun.devfs_cloning", &tundclone);
static void tunclone(void *arg, struct ucred *cred, char *name,
int namelen, struct cdev **dev);
static void tuncreate(const char *name, struct cdev *dev);
static int tunifioctl(struct ifnet *, u_long, caddr_t);
static int tuninit(struct ifnet *);
static int tunmodevent(module_t, int, void *);
static int tunoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *rt);
static void tunstart(struct ifnet *);
static int tun_clone_create(struct if_clone *, int, caddr_t);
static void tun_clone_destroy(struct ifnet *);
IFC_SIMPLE_DECLARE(tun, 0);
static d_open_t tunopen;
static d_close_t tunclose;
static d_read_t tunread;
static d_write_t tunwrite;
static d_ioctl_t tunioctl;
static d_poll_t tunpoll;
static d_kqfilter_t tunkqfilter;
static int tunkqread(struct knote *, long);
static int tunkqwrite(struct knote *, long);
static void tunkqdetach(struct knote *);
static struct filterops tun_read_filterops = {
.f_isfd = 1,
.f_attach = NULL,
.f_detach = tunkqdetach,
.f_event = tunkqread,
};
static struct filterops tun_write_filterops = {
.f_isfd = 1,
.f_attach = NULL,
.f_detach = tunkqdetach,
.f_event = tunkqwrite,
};
static struct cdevsw tun_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_PSEUDO | D_NEEDGIANT | D_NEEDMINOR,
.d_open = tunopen,
.d_close = tunclose,
.d_read = tunread,
.d_write = tunwrite,
.d_ioctl = tunioctl,
.d_poll = tunpoll,
.d_kqfilter = tunkqfilter,
.d_name = TUNNAME,
};
static int
tun_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct cdev *dev;
int i;
/* find any existing device, or allocate new unit number */
i = clone_create(&tunclones, &tun_cdevsw, &unit, &dev, 0);
if (i) {
/* No preexisting struct cdev *, create one */
dev = make_dev(&tun_cdevsw, unit,
UID_UUCP, GID_DIALER, 0600, "%s%d", ifc->ifc_name, unit);
if (dev != NULL) {
dev_ref(dev);
dev->si_flags |= SI_CHEAPCLONE;
}
}
tuncreate(ifc->ifc_name, dev);
return (0);
}
static void
tunclone(void *arg, struct ucred *cred, char *name, int namelen,
struct cdev **dev)
{
char devname[SPECNAMELEN + 1];
int u, i, append_unit;
if (*dev != NULL)
return;
/*
* If tun cloning is enabled, only the superuser can create an
* interface.
*/
if (!tundclone || priv_check_cred(cred, PRIV_NET_IFCREATE, 0) != 0)
return;
if (strcmp(name, TUNNAME) == 0) {
u = -1;
} else if (dev_stdclone(name, NULL, TUNNAME, &u) != 1)
return; /* Don't recognise the name */
if (u != -1 && u > IF_MAXUNIT)
return; /* Unit number too high */
if (u == -1)
append_unit = 1;
else
append_unit = 0;
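/*
 * Example: opening /dev/tun allocates the next free unit and appends
 * it to the device name ("tun" -> "tun3"), while opening /dev/tun5
 * requests unit 5 explicitly and keeps the name as given.
 */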
+ CURVNET_SET(TD_TO_VNET(curthread));
/* find any existing device, or allocate new unit number */
i = clone_create(&tunclones, &tun_cdevsw, &u, dev, 0);
if (i) {
if (append_unit) {
namelen = snprintf(devname, sizeof(devname), "%s%d", name,
u);
name = devname;
}
/* No preexisting struct cdev *, create one */
*dev = make_dev(&tun_cdevsw, u,
UID_UUCP, GID_DIALER, 0600, "%s", name);
if (*dev != NULL) {
dev_ref(*dev);
(*dev)->si_flags |= SI_CHEAPCLONE;
}
}
if_clone_create(name, namelen, NULL);
+ CURVNET_RESTORE();
}
static void
tun_destroy(struct tun_softc *tp)
{
struct cdev *dev;
/* Unlocked read. */
KASSERT((tp->tun_flags & TUN_OPEN) == 0,
("tununits is out of sync - unit %d", TUN2IFP(tp)->if_dunit));
+ CURVNET_SET(TUN2IFP(tp)->if_vnet);
dev = tp->tun_dev;
bpfdetach(TUN2IFP(tp));
if_detach(TUN2IFP(tp));
if_free(TUN2IFP(tp));
destroy_dev(dev);
knlist_destroy(&tp->tun_rsel.si_note);
mtx_destroy(&tp->tun_mtx);
free(tp, M_TUN);
+ CURVNET_RESTORE();
}
static void
tun_clone_destroy(struct ifnet *ifp)
{
struct tun_softc *tp = ifp->if_softc;
mtx_lock(&tunmtx);
TAILQ_REMOVE(&tunhead, tp, tun_list);
mtx_unlock(&tunmtx);
tun_destroy(tp);
}
static int
tunmodevent(module_t mod, int type, void *data)
{
static eventhandler_tag tag;
struct tun_softc *tp;
switch (type) {
case MOD_LOAD:
mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF);
clone_setup(&tunclones);
tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
if (tag == NULL)
return (ENOMEM);
if_clone_attach(&tun_cloner);
break;
case MOD_UNLOAD:
if_clone_detach(&tun_cloner);
EVENTHANDLER_DEREGISTER(dev_clone, tag);
mtx_lock(&tunmtx);
while ((tp = TAILQ_FIRST(&tunhead)) != NULL) {
TAILQ_REMOVE(&tunhead, tp, tun_list);
mtx_unlock(&tunmtx);
tun_destroy(tp);
mtx_lock(&tunmtx);
}
mtx_unlock(&tunmtx);
clone_cleanup(&tunclones);
mtx_destroy(&tunmtx);
break;
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t tun_mod = {
"if_tun",
tunmodevent,
0
};
DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
static void
tunstart(struct ifnet *ifp)
{
struct tun_softc *tp = ifp->if_softc;
struct mbuf *m;
TUNDEBUG(ifp,"%s starting\n", ifp->if_xname);
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
IFQ_LOCK(&ifp->if_snd);
IFQ_POLL_NOLOCK(&ifp->if_snd, m);
if (m == NULL) {
IFQ_UNLOCK(&ifp->if_snd);
return;
}
IFQ_UNLOCK(&ifp->if_snd);
}
mtx_lock(&tp->tun_mtx);
if (tp->tun_flags & TUN_RWAIT) {
tp->tun_flags &= ~TUN_RWAIT;
wakeup(tp);
}
if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
mtx_unlock(&tp->tun_mtx);
pgsigio(&tp->tun_sigio, SIGIO, 0);
} else
mtx_unlock(&tp->tun_mtx);
selwakeuppri(&tp->tun_rsel, PZERO + 1);
KNOTE_UNLOCKED(&tp->tun_rsel.si_note, 0);
}
/* XXX: should return an error code so it can fail. */
static void
tuncreate(const char *name, struct cdev *dev)
{
struct tun_softc *sc;
struct ifnet *ifp;
dev->si_flags &= ~SI_CHEAPCLONE;
MALLOC(sc, struct tun_softc *, sizeof(*sc), M_TUN, M_WAITOK | M_ZERO);
mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF);
sc->tun_flags = TUN_INITED;
sc->tun_dev = dev;
mtx_lock(&tunmtx);
TAILQ_INSERT_TAIL(&tunhead, sc, tun_list);
mtx_unlock(&tunmtx);
ifp = sc->tun_ifp = if_alloc(IFT_PPP);
if (ifp == NULL)
panic("%s%d: failed to if_alloc() interface.\n",
name, dev2unit(dev));
if_initname(ifp, name, dev2unit(dev));
ifp->if_mtu = TUNMTU;
ifp->if_ioctl = tunifioctl;
ifp->if_output = tunoutput;
ifp->if_start = tunstart;
ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
ifp->if_softc = sc;
IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
ifp->if_snd.ifq_drv_maxlen = 0;
IFQ_SET_READY(&ifp->if_snd);
knlist_init(&sc->tun_rsel.si_note, NULL, NULL, NULL, NULL);
if_attach(ifp);
bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
dev->si_drv1 = sc;
TUNDEBUG(ifp, "interface %s is created, minor = %#x\n",
ifp->if_xname, dev2unit(dev));
}
static int
tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
{
struct ifnet *ifp;
struct tun_softc *tp;
/*
* XXXRW: Non-atomic test and set of dev->si_drv1 requires
* synchronization.
*/
tp = dev->si_drv1;
if (!tp) {
tuncreate(TUNNAME, dev);
tp = dev->si_drv1;
}
/*
* XXXRW: This use of tun_pid is subject to error due to the
* fact that a reference to the tunnel can live beyond the
* death of the process that created it. Can we replace this
* with a simple busy flag?
*/
mtx_lock(&tp->tun_mtx);
if (tp->tun_pid != 0 && tp->tun_pid != td->td_proc->p_pid) {
mtx_unlock(&tp->tun_mtx);
return (EBUSY);
}
tp->tun_pid = td->td_proc->p_pid;
tp->tun_flags |= TUN_OPEN;
mtx_unlock(&tp->tun_mtx);
ifp = TUN2IFP(tp);
TUNDEBUG(ifp, "open\n");
return (0);
}
/*
* tunclose - close the device - mark i/f down & delete
* routing info
*/
static int
tunclose(struct cdev *dev, int foo, int bar, struct thread *td)
{
struct tun_softc *tp;
struct ifnet *ifp;
int s;
tp = dev->si_drv1;
ifp = TUN2IFP(tp);
mtx_lock(&tp->tun_mtx);
tp->tun_flags &= ~TUN_OPEN;
tp->tun_pid = 0;
/*
* junk all pending output
*/
+ CURVNET_SET(ifp->if_vnet);
s = splimp();
IFQ_PURGE(&ifp->if_snd);
splx(s);
mtx_unlock(&tp->tun_mtx);
if (ifp->if_flags & IFF_UP) {
s = splimp();
if_down(ifp);
splx(s);
}
/* Delete all addresses and routes which reference this interface. */
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
struct ifaddr *ifa;
s = splimp();
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
/* deal w/IPv4 PtP destination; unlocked read */
if (ifa->ifa_addr->sa_family == AF_INET) {
rtinit(ifa, (int)RTM_DELETE,
tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0);
} else {
rtinit(ifa, (int)RTM_DELETE, 0);
}
}
if_purgeaddrs(ifp);
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
splx(s);
}
+ CURVNET_RESTORE();
funsetown(&tp->tun_sigio);
selwakeuppri(&tp->tun_rsel, PZERO + 1);
KNOTE_UNLOCKED(&tp->tun_rsel.si_note, 0);
TUNDEBUG (ifp, "closed\n");
return (0);
}
static int
tuninit(struct ifnet *ifp)
{
struct tun_softc *tp = ifp->if_softc;
struct ifaddr *ifa;
int error = 0;
TUNDEBUG(ifp, "tuninit\n");
ifp->if_flags |= IFF_UP;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
getmicrotime(&ifp->if_lastchange);
#ifdef INET
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == AF_INET) {
struct sockaddr_in *si;
si = (struct sockaddr_in *)ifa->ifa_addr;
mtx_lock(&tp->tun_mtx);
if (si->sin_addr.s_addr)
tp->tun_flags |= TUN_IASET;
si = (struct sockaddr_in *)ifa->ifa_dstaddr;
if (si && si->sin_addr.s_addr)
tp->tun_flags |= TUN_DSTADDR;
mtx_unlock(&tp->tun_mtx);
}
}
#endif
return (error);
}
/*
* Process an ioctl request.
*/
static int
tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct ifreq *ifr = (struct ifreq *)data;
struct tun_softc *tp = ifp->if_softc;
struct ifstat *ifs;
int error = 0, s;
s = splimp();
switch(cmd) {
case SIOCGIFSTATUS:
ifs = (struct ifstat *)data;
mtx_lock(&tp->tun_mtx);
if (tp->tun_pid)
sprintf(ifs->ascii + strlen(ifs->ascii),
"\tOpened by PID %d\n", tp->tun_pid);
mtx_unlock(&tp->tun_mtx);
break;
case SIOCSIFADDR:
error = tuninit(ifp);
TUNDEBUG(ifp, "address set, error=%d\n", error);
break;
case SIOCSIFDSTADDR:
error = tuninit(ifp);
TUNDEBUG(ifp, "destination address set, error=%d\n", error);
break;
case SIOCSIFMTU:
ifp->if_mtu = ifr->ifr_mtu;
TUNDEBUG(ifp, "mtu set\n");
break;
case SIOCSIFFLAGS:
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
error = EINVAL;
}
splx(s);
return (error);
}
/*
* tunoutput - queue packets from higher level ready to put out.
*/
static int
tunoutput(
struct ifnet *ifp,
struct mbuf *m0,
struct sockaddr *dst,
struct rtentry *rt)
{
struct tun_softc *tp = ifp->if_softc;
u_short cached_tun_flags;
int error;
u_int32_t af;
TUNDEBUG (ifp, "tunoutput\n");
#ifdef MAC
error = mac_ifnet_check_transmit(ifp, m0);
if (error) {
m_freem(m0);
return (error);
}
#endif
/* Could be unlocked read? */
mtx_lock(&tp->tun_mtx);
cached_tun_flags = tp->tun_flags;
mtx_unlock(&tp->tun_mtx);
if ((cached_tun_flags & TUN_READY) != TUN_READY) {
TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
m_freem (m0);
return (EHOSTDOWN);
}
if ((ifp->if_flags & IFF_UP) != IFF_UP) {
m_freem (m0);
return (EHOSTDOWN);
}
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
if (bpf_peers_present(ifp->if_bpf)) {
af = dst->sa_family;
bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0);
}
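/*
 * Framing summary for the prepend logic below: with TUN_LMODE each
 * packet handed to the reader is prefixed with the destination
 * sockaddr; with TUN_IFHEAD it is prefixed with a 4-byte address
 * family in network byte order; otherwise only raw AF_INET packets
 * are passed through unmodified.
 */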
/* Prepend the sockaddr if requested; the packet is dropped if the mbuf allocation fails. */
if (cached_tun_flags & TUN_LMODE) {
/* allocate space for sockaddr */
M_PREPEND(m0, dst->sa_len, M_DONTWAIT);
/* if allocation failed drop packet */
if (m0 == NULL) {
ifp->if_iqdrops++;
ifp->if_oerrors++;
return (ENOBUFS);
} else {
bcopy(dst, m0->m_data, dst->sa_len);
}
}
if (cached_tun_flags & TUN_IFHEAD) {
/* Prepend the address family */
M_PREPEND(m0, 4, M_DONTWAIT);
/* if allocation failed drop packet */
if (m0 == NULL) {
ifp->if_iqdrops++;
ifp->if_oerrors++;
return (ENOBUFS);
} else
*(u_int32_t *)m0->m_data = htonl(dst->sa_family);
} else {
#ifdef INET
if (dst->sa_family != AF_INET)
#endif
{
m_freem(m0);
return (EAFNOSUPPORT);
}
}
IFQ_HANDOFF(ifp, m0, error);
if (error) {
ifp->if_collisions++;
return (ENOBUFS);
}
ifp->if_opackets++;
return (0);
}
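/*
 * Editor's illustrative note (not part of the original source): depending
 * on the mode selected with TUNSLMODE or TUNSIFHEAD, the packet queued
 * above reaches readers of the control device framed roughly as:
 *
 *	default:	[ packet ]				(AF_INET assumed)
 *	TUN_LMODE:	[ struct sockaddr dst ][ packet ]	(dst->sa_len bytes)
 *	TUN_IFHEAD:	[ 4-byte AF, network order ][ packet ]
 */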
/*
* the cdevsw interface is now pretty minimal.
*/
static int
tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
int s;
int error;
struct tun_softc *tp = dev->si_drv1;
struct tuninfo *tunp;
switch (cmd) {
case TUNSIFINFO:
tunp = (struct tuninfo *)data;
if (tunp->mtu < IF_MINMTU)
return (EINVAL);
if (TUN2IFP(tp)->if_mtu != tunp->mtu) {
error = priv_check(td, PRIV_NET_SETIFMTU);
if (error)
return (error);
}
TUN2IFP(tp)->if_mtu = tunp->mtu;
TUN2IFP(tp)->if_type = tunp->type;
TUN2IFP(tp)->if_baudrate = tunp->baudrate;
break;
case TUNGIFINFO:
tunp = (struct tuninfo *)data;
tunp->mtu = TUN2IFP(tp)->if_mtu;
tunp->type = TUN2IFP(tp)->if_type;
tunp->baudrate = TUN2IFP(tp)->if_baudrate;
break;
case TUNSDEBUG:
tundebug = *(int *)data;
break;
case TUNGDEBUG:
*(int *)data = tundebug;
break;
case TUNSLMODE:
mtx_lock(&tp->tun_mtx);
if (*(int *)data) {
tp->tun_flags |= TUN_LMODE;
tp->tun_flags &= ~TUN_IFHEAD;
} else
tp->tun_flags &= ~TUN_LMODE;
mtx_unlock(&tp->tun_mtx);
break;
case TUNSIFHEAD:
mtx_lock(&tp->tun_mtx);
if (*(int *)data) {
tp->tun_flags |= TUN_IFHEAD;
tp->tun_flags &= ~TUN_LMODE;
} else
tp->tun_flags &= ~TUN_IFHEAD;
mtx_unlock(&tp->tun_mtx);
break;
case TUNGIFHEAD:
/* Could be unlocked read? */
mtx_lock(&tp->tun_mtx);
*(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0;
mtx_unlock(&tp->tun_mtx);
break;
case TUNSIFMODE:
/* deny this if UP */
if (TUN2IFP(tp)->if_flags & IFF_UP)
return(EBUSY);
switch (*(int *)data & ~IFF_MULTICAST) {
case IFF_POINTOPOINT:
case IFF_BROADCAST:
TUN2IFP(tp)->if_flags &=
~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
TUN2IFP(tp)->if_flags |= *(int *)data;
break;
default:
return(EINVAL);
}
break;
case TUNSIFPID:
mtx_lock(&tp->tun_mtx);
tp->tun_pid = curthread->td_proc->p_pid;
mtx_unlock(&tp->tun_mtx);
break;
case FIONBIO:
break;
case FIOASYNC:
mtx_lock(&tp->tun_mtx);
if (*(int *)data)
tp->tun_flags |= TUN_ASYNC;
else
tp->tun_flags &= ~TUN_ASYNC;
mtx_unlock(&tp->tun_mtx);
break;
case FIONREAD:
s = splimp();
if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) {
struct mbuf *mb;
IFQ_LOCK(&TUN2IFP(tp)->if_snd);
IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb);
for( *(int *)data = 0; mb != 0; mb = mb->m_next)
*(int *)data += mb->m_len;
IFQ_UNLOCK(&TUN2IFP(tp)->if_snd);
} else
*(int *)data = 0;
splx(s);
break;
case FIOSETOWN:
return (fsetown(*(int *)data, &tp->tun_sigio));
case FIOGETOWN:
*(int *)data = fgetown(&tp->tun_sigio);
return (0);
/* This is deprecated, FIOSETOWN should be used instead. */
case TIOCSPGRP:
return (fsetown(-(*(int *)data), &tp->tun_sigio));
/* This is deprecated, FIOGETOWN should be used instead. */
case TIOCGPGRP:
*(int *)data = -fgetown(&tp->tun_sigio);
return (0);
default:
return (ENOTTY);
}
return (0);
}
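#if 0
/*
 * Illustrative userland sketch (editor's example, not driver code):
 * selecting the multi-AF framing with the TUNSIFHEAD ioctl handled
 * above.  Error handling is minimal.
 */
#include <sys/ioctl.h>
#include <net/if_tun.h>
#include <fcntl.h>
#include <unistd.h>

static int
tun_open_ifhead(const char *path)	/* e.g. "/dev/tun0" */
{
	int fd, on = 1;

	fd = open(path, O_RDWR);
	if (fd >= 0 && ioctl(fd, TUNSIFHEAD, &on) == -1) {
		close(fd);
		return (-1);
	}
	return (fd);
}
#endif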
/*
* The cdevsw read interface - reads a packet at a time, or at
* least as much of a packet as can be read.
*/
static int
tunread(struct cdev *dev, struct uio *uio, int flag)
{
struct tun_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
struct mbuf *m;
int error=0, len, s;
TUNDEBUG (ifp, "read\n");
mtx_lock(&tp->tun_mtx);
if ((tp->tun_flags & TUN_READY) != TUN_READY) {
mtx_unlock(&tp->tun_mtx);
TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
return (EHOSTDOWN);
}
tp->tun_flags &= ~TUN_RWAIT;
mtx_unlock(&tp->tun_mtx);
s = splimp();
do {
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL) {
if (flag & O_NONBLOCK) {
splx(s);
return (EWOULDBLOCK);
}
mtx_lock(&tp->tun_mtx);
tp->tun_flags |= TUN_RWAIT;
mtx_unlock(&tp->tun_mtx);
if ((error = tsleep(tp, PCATCH | (PZERO + 1),
"tunread", 0)) != 0) {
splx(s);
return (error);
}
}
} while (m == NULL);
splx(s);
while (m && uio->uio_resid > 0 && error == 0) {
len = min(uio->uio_resid, m->m_len);
if (len != 0)
error = uiomove(mtod(m, void *), len, uio);
m = m_free(m);
}
if (m) {
TUNDEBUG(ifp, "Dropping mbuf\n");
m_freem(m);
}
return (error);
}
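#if 0
/*
 * Illustrative userland sketch (editor's example): one read(2) returns
 * at most one packet, and any tail that does not fit in the buffer is
 * dropped (see "Dropping mbuf" above), so a buffer of TUNMRU bytes is
 * the safe worst case.
 */
#include <unistd.h>

static ssize_t
tun_read_packet(int fd, void *buf, size_t len)
{
	return (read(fd, buf, len));	/* len should be >= TUNMRU */
}
#endif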
/*
* the cdevsw write interface - an atomic write is a packet - or else!
*/
static int
tunwrite(struct cdev *dev, struct uio *uio, int flag)
{
struct tun_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
struct mbuf *m;
int error = 0;
uint32_t family;
int isr;
TUNDEBUG(ifp, "tunwrite\n");
if ((ifp->if_flags & IFF_UP) != IFF_UP)
/* ignore silently */
return (0);
if (uio->uio_resid == 0)
return (0);
if (uio->uio_resid < 0 || uio->uio_resid > TUNMRU) {
TUNDEBUG(ifp, "len=%d!\n", uio->uio_resid);
return (EIO);
}
if ((m = m_uiotombuf(uio, M_DONTWAIT, 0, 0, M_PKTHDR)) == NULL) {
ifp->if_ierrors++;
return (error);
}
m->m_pkthdr.rcvif = ifp;
#ifdef MAC
mac_ifnet_create_mbuf(ifp, m);
#endif
/* Could be unlocked read? */
mtx_lock(&tp->tun_mtx);
if (tp->tun_flags & TUN_IFHEAD) {
mtx_unlock(&tp->tun_mtx);
if (m->m_len < sizeof(family) &&
(m = m_pullup(m, sizeof(family))) == NULL)
return (ENOBUFS);
family = ntohl(*mtod(m, u_int32_t *));
m_adj(m, sizeof(family));
} else {
mtx_unlock(&tp->tun_mtx);
family = AF_INET;
}
BPF_MTAP2(ifp, &family, sizeof(family), m);
switch (family) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
isr = NETISR_IPV6;
break;
#endif
#ifdef IPX
case AF_IPX:
isr = NETISR_IPX;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
isr = NETISR_ATALK2;
break;
#endif
default:
m_freem(m);
return (EAFNOSUPPORT);
}
/* First chunk of an mbuf contains good junk */
if (harvest.point_to_point)
random_harvest(m, 16, 3, 0, RANDOM_NET);
ifp->if_ibytes += m->m_pkthdr.len;
ifp->if_ipackets++;
+ CURVNET_SET(ifp->if_vnet);
netisr_dispatch(isr, m);
+ CURVNET_RESTORE();
return (0);
}
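#if 0
/*
 * Illustrative userland sketch (editor's example): with TUNSIFHEAD set,
 * tunwrite() expects each packet to be prefixed with its address family
 * in network byte order; a single writev(2) keeps the write atomic, as
 * the comment above demands.
 */
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <arpa/inet.h>

static ssize_t
tun_write_inet6(int fd, void *pkt, size_t len)
{
	uint32_t af = htonl(AF_INET6);
	struct iovec iov[2];

	iov[0].iov_base = &af;
	iov[0].iov_len = sizeof(af);
	iov[1].iov_base = pkt;
	iov[1].iov_len = len;
	return (writev(fd, iov, 2));
}
#endif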
/*
* tunpoll - the poll interface; it is really only useful for reads.
* The write test always returns true: a write never blocks
* anyway, it either accepts the packet or drops it.
*/
static int
tunpoll(struct cdev *dev, int events, struct thread *td)
{
int s;
struct tun_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
int revents = 0;
struct mbuf *m;
s = splimp();
TUNDEBUG(ifp, "tunpoll\n");
if (events & (POLLIN | POLLRDNORM)) {
IFQ_LOCK(&ifp->if_snd);
IFQ_POLL_NOLOCK(&ifp->if_snd, m);
if (m != NULL) {
TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len);
revents |= events & (POLLIN | POLLRDNORM);
} else {
TUNDEBUG(ifp, "tunpoll waiting\n");
selrecord(td, &tp->tun_rsel);
}
IFQ_UNLOCK(&ifp->if_snd);
}
if (events & (POLLOUT | POLLWRNORM))
revents |= events & (POLLOUT | POLLWRNORM);
splx(s);
return (revents);
}
/*
* tunkqfilter - support for the kevent() system call.
*/
static int
tunkqfilter(struct cdev *dev, struct knote *kn)
{
int s;
struct tun_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
s = splimp();
switch(kn->kn_filter) {
case EVFILT_READ:
TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n",
ifp->if_xname, dev2unit(dev));
kn->kn_fop = &tun_read_filterops;
break;
case EVFILT_WRITE:
TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n",
ifp->if_xname, dev2unit(dev));
kn->kn_fop = &tun_write_filterops;
break;
default:
TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n",
ifp->if_xname, dev2unit(dev));
splx(s);
return(EINVAL);
}
splx(s);
kn->kn_hook = (caddr_t) dev;
knlist_add(&tp->tun_rsel.si_note, kn, 0);
return (0);
}
/*
* Return true if there is data in the interface queue.
*/
static int
tunkqread(struct knote *kn, long hint)
{
int ret, s;
struct cdev *dev = (struct cdev *)(kn->kn_hook);
struct tun_softc *tp = dev->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
s = splimp();
if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) {
TUNDEBUG(ifp,
"%s have data in the queue. Len = %d, minor = %#x\n",
ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev));
ret = 1;
} else {
TUNDEBUG(ifp,
"%s waiting for data, minor = %#x\n", ifp->if_xname,
dev2unit(dev));
ret = 0;
}
splx(s);
return (ret);
}
/*
* Can always write; always returns the MTU in kn->kn_data.
*/
static int
tunkqwrite(struct knote *kn, long hint)
{
int s;
struct tun_softc *tp = ((struct cdev *)kn->kn_hook)->si_drv1;
struct ifnet *ifp = TUN2IFP(tp);
s = splimp();
kn->kn_data = ifp->if_mtu;
splx(s);
return (1);
}
static void
tunkqdetach(struct knote *kn)
{
struct tun_softc *tp = ((struct cdev *)kn->kn_hook)->si_drv1;
knlist_remove(&tp->tun_rsel.si_note, kn, 0);
}
Index: head/sys/net/if_var.h
===================================================================
--- head/sys/net/if_var.h (revision 183549)
+++ head/sys/net/if_var.h (revision 183550)
@@ -1,718 +1,720 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* From: @(#)if.h 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#ifndef _NET_IF_VAR_H_
#define _NET_IF_VAR_H_
/*
* Structures defining a network interface, providing a packet
* transport mechanism (ala level 0 of the PUP protocols).
*
* Each interface accepts output datagrams of a specified maximum
* length, and provides higher level routines with input datagrams
* received from its medium.
*
* Output occurs when the routine if_output is called, with four parameters:
* (*ifp->if_output)(ifp, m, dst, rt)
* Here m is the mbuf chain to be sent and dst is the destination address.
* The output routine encapsulates the supplied datagram if necessary,
* and then transmits it on its medium.
*
* On input, each interface unwraps the data received by it, and either
* places it on the input queue of an internetwork datagram routine
* and posts the associated software interrupt, or passes the datagram to a raw
* packet input routine.
*
* Routines exist for locating interfaces by their addresses
* or for locating an interface on a certain network, as well as more general
* routing and gateway routines maintaining information used to locate
* interfaces. These routines live in the files if.c and route.c
*/
#ifdef __STDC__
/*
* Forward structure declarations for function prototypes [sic].
*/
struct mbuf;
struct thread;
struct rtentry;
struct rt_addrinfo;
struct socket;
struct ether_header;
struct carp_if;
struct ifvlantrunk;
#endif
#include <sys/queue.h> /* get TAILQ macros */
#ifdef _KERNEL
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#endif /* _KERNEL */
#include <sys/lock.h> /* XXX */
#include <sys/mutex.h> /* XXX */
#include <sys/event.h> /* XXX */
#include <sys/_task.h>
#define IF_DUNIT_NONE -1
#include <altq/if_altq.h>
TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */
TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */
TAILQ_HEAD(ifprefixhead, ifprefix);
TAILQ_HEAD(ifmultihead, ifmultiaddr);
TAILQ_HEAD(ifgrouphead, ifg_group);
/*
* Structure defining a queue for a network interface.
*/
struct ifqueue {
struct mbuf *ifq_head;
struct mbuf *ifq_tail;
int ifq_len;
int ifq_maxlen;
int ifq_drops;
struct mtx ifq_mtx;
};
/*
* Structure defining a network interface.
*
* (Would like to call this struct ``if'', but C isn't PL/1.)
*/
struct ifnet {
void *if_softc; /* pointer to driver state */
void *if_l2com; /* pointer to protocol bits */
TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */
char if_xname[IFNAMSIZ]; /* external name (name + unit) */
const char *if_dname; /* driver name */
int if_dunit; /* unit or IF_DUNIT_NONE */
struct ifaddrhead if_addrhead; /* linked list of addresses per if */
/*
* if_addrhead is the list of all addresses associated with
* an interface.
* Some code in the kernel assumes that the first element
* of the list has type AF_LINK, and contains sockaddr_dl
* addresses which store the link-level address and the name
* of the interface.
* However, access to the AF_LINK address through this
* field is deprecated. Use if_addr or ifaddr_byindex() instead.
*/
struct knlist if_klist; /* events attached to this if */
int if_pcount; /* number of promiscuous listeners */
struct carp_if *if_carp; /* carp interface structure */
struct bpf_if *if_bpf; /* packet filter structure */
u_short if_index; /* numeric abbreviation for this if */
short if_timer; /* time 'til if_watchdog called */
struct ifvlantrunk *if_vlantrunk; /* pointer to 802.1q data */
int if_flags; /* up/down, broadcast, etc. */
int if_capabilities; /* interface features & capabilities */
int if_capenable; /* enabled features & capabilities */
void *if_linkmib; /* link-type-specific MIB data */
size_t if_linkmiblen; /* length of above data */
struct if_data if_data;
struct ifmultihead if_multiaddrs; /* multicast addresses configured */
int if_amcount; /* number of all-multicast requests */
/* procedure handles */
int (*if_output) /* output routine (enqueue) */
(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
void (*if_input) /* input routine (from h/w driver) */
(struct ifnet *, struct mbuf *);
void (*if_start) /* initiate output routine */
(struct ifnet *);
int (*if_ioctl) /* ioctl routine */
(struct ifnet *, u_long, caddr_t);
void (*if_watchdog) /* timer routine */
(struct ifnet *);
void (*if_init) /* Init routine */
(void *);
int (*if_resolvemulti) /* validate/resolve multicast */
(struct ifnet *, struct sockaddr **, struct sockaddr *);
struct ifaddr *if_addr; /* pointer to link-level address */
void *if_llsoftc; /* link layer softc */
int if_drv_flags; /* driver-managed status flags */
u_int if_spare_flags2; /* spare flags 2 */
struct ifaltq if_snd; /* output queue (includes altq) */
const u_int8_t *if_broadcastaddr; /* linklevel broadcast bytestring */
void *if_bridge; /* bridge glue */
struct lltable *lltables; /* list of L3-L2 resolution tables */
struct label *if_label; /* interface MAC label */
/* these are only used by IPv6 */
struct ifprefixhead if_prefixhead; /* list of prefixes per if */
void *if_afdata[AF_MAX];
int if_afdata_initialized;
struct mtx if_afdata_mtx;
struct task if_starttask; /* task for IFF_NEEDSGIANT */
struct task if_linktask; /* task for link change events */
struct mtx if_addr_mtx; /* mutex to protect address lists */
LIST_ENTRY(ifnet) if_clones; /* interfaces of a cloner */
TAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if */
/* protected by if_addr_mtx */
void *if_pf_kif;
void *if_lagg; /* lagg glue */
void *if_pspare[10]; /* multiq/TOE 3; vimage 3; general use 4 */
int if_ispare[2]; /* general use 2 */
};
typedef void if_init_f_t(void *);
/*
* XXX These aliases are terribly dangerous because they could apply
* to anything.
*/
#define if_mtu if_data.ifi_mtu
#define if_type if_data.ifi_type
#define if_physical if_data.ifi_physical
#define if_addrlen if_data.ifi_addrlen
#define if_hdrlen if_data.ifi_hdrlen
#define if_metric if_data.ifi_metric
#define if_link_state if_data.ifi_link_state
#define if_baudrate if_data.ifi_baudrate
#define if_hwassist if_data.ifi_hwassist
#define if_ipackets if_data.ifi_ipackets
#define if_ierrors if_data.ifi_ierrors
#define if_opackets if_data.ifi_opackets
#define if_oerrors if_data.ifi_oerrors
#define if_collisions if_data.ifi_collisions
#define if_ibytes if_data.ifi_ibytes
#define if_obytes if_data.ifi_obytes
#define if_imcasts if_data.ifi_imcasts
#define if_omcasts if_data.ifi_omcasts
#define if_iqdrops if_data.ifi_iqdrops
#define if_noproto if_data.ifi_noproto
#define if_lastchange if_data.ifi_lastchange
#define if_rawoutput(if, m, sa) if_output(if, m, sa, (struct rtentry *)NULL)
/* for compatibility with other BSDs */
#define if_addrlist if_addrhead
#define if_list if_link
#define if_name(ifp) ((ifp)->if_xname)
/*
* Locks for address lists on the network interface.
*/
#define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_mtx, \
"if_addr_mtx", NULL, MTX_DEF)
#define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_mtx)
#define IF_ADDR_LOCK(if) mtx_lock(&(if)->if_addr_mtx)
#define IF_ADDR_UNLOCK(if) mtx_unlock(&(if)->if_addr_mtx)
#define IF_ADDR_LOCK_ASSERT(if) mtx_assert(&(if)->if_addr_mtx, MA_OWNED)
/*
* Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq)
* are queues of messages stored on ifqueue structures
* (defined above). Entries are added to and deleted from these structures
* by these macros, which should be called with ipl raised to splimp().
*/
#define IF_LOCK(ifq) mtx_lock(&(ifq)->ifq_mtx)
#define IF_UNLOCK(ifq) mtx_unlock(&(ifq)->ifq_mtx)
#define IF_LOCK_ASSERT(ifq) mtx_assert(&(ifq)->ifq_mtx, MA_OWNED)
#define _IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen)
#define _IF_DROP(ifq) ((ifq)->ifq_drops++)
#define _IF_QLEN(ifq) ((ifq)->ifq_len)
#define _IF_ENQUEUE(ifq, m) do { \
(m)->m_nextpkt = NULL; \
if ((ifq)->ifq_tail == NULL) \
(ifq)->ifq_head = m; \
else \
(ifq)->ifq_tail->m_nextpkt = m; \
(ifq)->ifq_tail = m; \
(ifq)->ifq_len++; \
} while (0)
#define IF_ENQUEUE(ifq, m) do { \
IF_LOCK(ifq); \
_IF_ENQUEUE(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_PREPEND(ifq, m) do { \
(m)->m_nextpkt = (ifq)->ifq_head; \
if ((ifq)->ifq_tail == NULL) \
(ifq)->ifq_tail = (m); \
(ifq)->ifq_head = (m); \
(ifq)->ifq_len++; \
} while (0)
#define IF_PREPEND(ifq, m) do { \
IF_LOCK(ifq); \
_IF_PREPEND(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_DEQUEUE(ifq, m) do { \
(m) = (ifq)->ifq_head; \
if (m) { \
if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \
(ifq)->ifq_tail = NULL; \
(m)->m_nextpkt = NULL; \
(ifq)->ifq_len--; \
} \
} while (0)
#define IF_DEQUEUE(ifq, m) do { \
IF_LOCK(ifq); \
_IF_DEQUEUE(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define _IF_POLL(ifq, m) ((m) = (ifq)->ifq_head)
#define IF_POLL(ifq, m) _IF_POLL(ifq, m)
#define _IF_DRAIN(ifq) do { \
struct mbuf *m; \
for (;;) { \
_IF_DEQUEUE(ifq, m); \
if (m == NULL) \
break; \
m_freem(m); \
} \
} while (0)
#define IF_DRAIN(ifq) do { \
IF_LOCK(ifq); \
_IF_DRAIN(ifq); \
IF_UNLOCK(ifq); \
} while(0)
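#if 0
/*
 * Illustrative sketch (editor's example): the classic producer side
 * using the macros above; the _IF_*() forms require the caller to hold
 * the queue mutex (historically, to have raised to splimp()).
 */
static void
example_enqueue(struct ifqueue *ifq, struct mbuf *m)
{
	IF_LOCK(ifq);
	if (_IF_QFULL(ifq)) {
		_IF_DROP(ifq);
		IF_UNLOCK(ifq);
		m_freem(m);
		return;
	}
	_IF_ENQUEUE(ifq, m);
	IF_UNLOCK(ifq);
}
#endif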
#ifdef _KERNEL
/* interface address change event */
typedef void (*ifaddr_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifaddr_event, ifaddr_event_handler_t);
/* new interface arrival event */
typedef void (*ifnet_arrival_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifnet_arrival_event, ifnet_arrival_event_handler_t);
/* interface departure event */
typedef void (*ifnet_departure_event_handler_t)(void *, struct ifnet *);
EVENTHANDLER_DECLARE(ifnet_departure_event, ifnet_departure_event_handler_t);
/*
* interface groups
*/
struct ifg_group {
char ifg_group[IFNAMSIZ];
u_int ifg_refcnt;
void *ifg_pf_kif;
TAILQ_HEAD(, ifg_member) ifg_members;
TAILQ_ENTRY(ifg_group) ifg_next;
};
struct ifg_member {
TAILQ_ENTRY(ifg_member) ifgm_next;
struct ifnet *ifgm_ifp;
};
struct ifg_list {
struct ifg_group *ifgl_group;
TAILQ_ENTRY(ifg_list) ifgl_next;
};
/* group attach event */
typedef void (*group_attach_event_handler_t)(void *, struct ifg_group *);
EVENTHANDLER_DECLARE(group_attach_event, group_attach_event_handler_t);
/* group detach event */
typedef void (*group_detach_event_handler_t)(void *, struct ifg_group *);
EVENTHANDLER_DECLARE(group_detach_event, group_detach_event_handler_t);
/* group change event */
typedef void (*group_change_event_handler_t)(void *, const char *);
EVENTHANDLER_DECLARE(group_change_event, group_change_event_handler_t);
#define IF_AFDATA_LOCK_INIT(ifp) \
mtx_init(&(ifp)->if_afdata_mtx, "if_afdata", NULL, MTX_DEF)
#define IF_AFDATA_LOCK(ifp) mtx_lock(&(ifp)->if_afdata_mtx)
#define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_mtx)
#define IF_AFDATA_UNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_mtx)
#define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_mtx)
#define IFF_LOCKGIANT(ifp) do { \
if ((ifp)->if_flags & IFF_NEEDSGIANT) \
mtx_lock(&Giant); \
} while (0)
#define IFF_UNLOCKGIANT(ifp) do { \
if ((ifp)->if_flags & IFF_NEEDSGIANT) \
mtx_unlock(&Giant); \
} while (0)
int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp,
int adjust);
#define IF_HANDOFF(ifq, m, ifp) \
if_handoff((struct ifqueue *)ifq, m, ifp, 0)
#define IF_HANDOFF_ADJ(ifq, m, ifp, adj) \
if_handoff((struct ifqueue *)ifq, m, ifp, adj)
void if_start(struct ifnet *);
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
IF_LOCK(ifq); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_ENQUEUE(ifq, m, NULL, err); \
else { \
if (_IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
_IF_ENQUEUE(ifq, m); \
(err) = 0; \
} \
} \
if (err) \
(ifq)->ifq_drops++; \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_DEQUEUE_NOLOCK(ifq, m) \
do { \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue_ptr(ifq, ALTDQ_REMOVE); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_DEQUEUE(ifq, m); \
else \
_IF_DEQUEUE(ifq, m); \
} while (0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
IF_LOCK(ifq); \
IFQ_DEQUEUE_NOLOCK(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_POLL_NOLOCK(ifq, m) \
do { \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue_ptr(ifq, ALTDQ_POLL); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_POLL(ifq, m); \
else \
_IF_POLL(ifq, m); \
} while (0)
#define IFQ_POLL(ifq, m) \
do { \
IF_LOCK(ifq); \
IFQ_POLL_NOLOCK(ifq, m); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_PURGE_NOLOCK(ifq) \
do { \
if (ALTQ_IS_ENABLED(ifq)) { \
ALTQ_PURGE(ifq); \
} else \
_IF_DRAIN(ifq); \
} while (0)
#define IFQ_PURGE(ifq) \
do { \
IF_LOCK(ifq); \
IFQ_PURGE_NOLOCK(ifq); \
IF_UNLOCK(ifq); \
} while (0)
#define IFQ_SET_READY(ifq) \
do { ((ifq)->altq_flags |= ALTQF_READY); } while (0)
#define IFQ_LOCK(ifq) IF_LOCK(ifq)
#define IFQ_UNLOCK(ifq) IF_UNLOCK(ifq)
#define IFQ_LOCK_ASSERT(ifq) IF_LOCK_ASSERT(ifq)
#define IFQ_IS_EMPTY(ifq) ((ifq)->ifq_len == 0)
#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++)
#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len)
#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++)
#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len))
/*
* The IFF_DRV_OACTIVE test should really occur in the device driver, not in
* the handoff logic, as that flag is locked by the device driver.
*/
#define IFQ_HANDOFF_ADJ(ifp, m, adj, err) \
do { \
int len; \
short mflags; \
\
len = (m)->m_pkthdr.len; \
mflags = (m)->m_flags; \
IFQ_ENQUEUE(&(ifp)->if_snd, m, err); \
if ((err) == 0) { \
(ifp)->if_obytes += len + (adj); \
if (mflags & M_MCAST) \
(ifp)->if_omcasts++; \
if (((ifp)->if_drv_flags & IFF_DRV_OACTIVE) == 0) \
if_start(ifp); \
} \
} while (0)
#define IFQ_HANDOFF(ifp, m, err) \
IFQ_HANDOFF_ADJ(ifp, m, 0, err)
#define IFQ_DRV_DEQUEUE(ifq, m) \
do { \
(m) = (ifq)->ifq_drv_head; \
if (m) { \
if (((ifq)->ifq_drv_head = (m)->m_nextpkt) == NULL) \
(ifq)->ifq_drv_tail = NULL; \
(m)->m_nextpkt = NULL; \
(ifq)->ifq_drv_len--; \
} else { \
IFQ_LOCK(ifq); \
IFQ_DEQUEUE_NOLOCK(ifq, m); \
while ((ifq)->ifq_drv_len < (ifq)->ifq_drv_maxlen) { \
struct mbuf *m0; \
IFQ_DEQUEUE_NOLOCK(ifq, m0); \
if (m0 == NULL) \
break; \
m0->m_nextpkt = NULL; \
if ((ifq)->ifq_drv_tail == NULL) \
(ifq)->ifq_drv_head = m0; \
else \
(ifq)->ifq_drv_tail->m_nextpkt = m0; \
(ifq)->ifq_drv_tail = m0; \
(ifq)->ifq_drv_len++; \
} \
IFQ_UNLOCK(ifq); \
} \
} while (0)
#define IFQ_DRV_PREPEND(ifq, m) \
do { \
(m)->m_nextpkt = (ifq)->ifq_drv_head; \
if ((ifq)->ifq_drv_tail == NULL) \
(ifq)->ifq_drv_tail = (m); \
(ifq)->ifq_drv_head = (m); \
(ifq)->ifq_drv_len++; \
} while (0)
#define IFQ_DRV_IS_EMPTY(ifq) \
(((ifq)->ifq_drv_len == 0) && ((ifq)->ifq_len == 0))
#define IFQ_DRV_PURGE(ifq) \
do { \
struct mbuf *m, *n = (ifq)->ifq_drv_head; \
while((m = n) != NULL) { \
n = m->m_nextpkt; \
m_freem(m); \
} \
(ifq)->ifq_drv_head = (ifq)->ifq_drv_tail = NULL; \
(ifq)->ifq_drv_len = 0; \
IFQ_PURGE(ifq); \
} while (0)
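#if 0
/*
 * Illustrative sketch (editor's example): the if_start() shape that the
 * IFF_DRV_OACTIVE comment above has in mind; the driver owns the flag,
 * setting it when the hardware fills up and clearing it again from its
 * transmit-done interrupt.
 */
static void
example_start(struct ifnet *ifp)
{
	struct mbuf *m;

	while (!(ifp->if_drv_flags & IFF_DRV_OACTIVE)) {
		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL)
			break;
		/* ... hand m to the hardware; when the ring fills, set
		 * IFF_DRV_OACTIVE and put the leftover mbuf back with
		 * IFQ_DRV_PREPEND() ... */
	}
}
#endif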
/*
* 72 was chosen below because it is the size of a TCP/IP
* header (40) + the minimum mss (32).
*/
#define IF_MINMTU 72
#define IF_MAXMTU 65535
#endif /* _KERNEL */
/*
* The ifaddr structure contains information about one address
* of an interface. They are maintained by the different address families,
* are allocated and attached when an address is set, and are linked
* together so all addresses for an interface can be located.
*
* NOTE: a 'struct ifaddr' is always at the beginning of a larger
* chunk of malloc'ed memory, where we store the three addresses
* (ifa_addr, ifa_dstaddr and ifa_netmask) referenced here.
*/
struct ifaddr {
struct sockaddr *ifa_addr; /* address of interface */
struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */
#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */
struct sockaddr *ifa_netmask; /* used to determine subnet */
struct if_data if_data; /* not all members are meaningful */
struct ifnet *ifa_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
(int, struct rtentry *, struct rt_addrinfo *);
u_short ifa_flags; /* mostly rt_flags for cloning */
u_int ifa_refcnt; /* references to this structure */
int ifa_metric; /* cost of going out this interface */
int (*ifa_claim_addr) /* check if an addr goes to this if */
(struct ifaddr *, struct sockaddr *);
struct mtx ifa_mtx;
};
#define IFA_ROUTE RTF_UP /* route installed */
/* for compatibility with other BSDs */
#define ifa_list ifa_link
#define IFA_LOCK_INIT(ifa) \
mtx_init(&(ifa)->ifa_mtx, "ifaddr", NULL, MTX_DEF)
#define IFA_LOCK(ifa) mtx_lock(&(ifa)->ifa_mtx)
#define IFA_UNLOCK(ifa) mtx_unlock(&(ifa)->ifa_mtx)
#define IFA_DESTROY(ifa) mtx_destroy(&(ifa)->ifa_mtx)
/*
* The prefix structure contains information about one prefix
* of an interface. They are maintained by the different address families,
* are allocated and attached when a prefix or an address is set,
* and are linked together so all prefixes for an interface can be located.
*/
struct ifprefix {
struct sockaddr *ifpr_prefix; /* prefix of interface */
struct ifnet *ifpr_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifprefix) ifpr_list; /* queue macro glue */
u_char ifpr_plen; /* prefix length in bits */
u_char ifpr_type; /* protocol dependent prefix type */
};
/*
* Multicast address structure. This is analogous to the ifaddr
* structure except that it keeps track of multicast addresses.
*/
struct ifmultiaddr {
TAILQ_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */
struct sockaddr *ifma_addr; /* address this membership is for */
struct sockaddr *ifma_lladdr; /* link-layer translation, if any */
struct ifnet *ifma_ifp; /* back-pointer to interface */
u_int ifma_refcount; /* reference count */
void *ifma_protospec; /* protocol-specific state, if any */
struct ifmultiaddr *ifma_llifma; /* pointer to ifma for ifma_lladdr */
};
#ifdef _KERNEL
#define IFAFREE(ifa) \
do { \
IFA_LOCK(ifa); \
KASSERT((ifa)->ifa_refcnt > 0, \
("ifa %p !(ifa_refcnt > 0)", ifa)); \
if (--(ifa)->ifa_refcnt == 0) { \
IFA_DESTROY(ifa); \
free(ifa, M_IFADDR); \
} else \
IFA_UNLOCK(ifa); \
} while (0)
#define IFAREF(ifa) \
do { \
IFA_LOCK(ifa); \
++(ifa)->ifa_refcnt; \
IFA_UNLOCK(ifa); \
} while (0)
extern struct mtx ifnet_lock;
#define IFNET_LOCK_INIT() \
mtx_init(&ifnet_lock, "ifnet", NULL, MTX_DEF | MTX_RECURSE)
#define IFNET_WLOCK() mtx_lock(&ifnet_lock)
#define IFNET_WUNLOCK() mtx_unlock(&ifnet_lock)
#define IFNET_WLOCK_ASSERT() mtx_assert(&ifnet_lock, MA_OWNED)
#define IFNET_RLOCK() IFNET_WLOCK()
#define IFNET_RUNLOCK() IFNET_WUNLOCK()
struct ifindex_entry {
struct ifnet *ife_ifnet;
struct cdev *ife_dev;
};
struct ifnet *ifnet_byindex(u_short idx);
/*
* Given the index, ifaddr_byindex() returns the one and only
* link-level ifaddr for the interface. You are not supposed to use
* it to traverse the list of addresses associated with the interface.
*/
struct ifaddr *ifaddr_byindex(u_short idx);
struct cdev *ifdev_byindex(u_short idx);
extern struct ifnethead ifnet;
extern int ifqmaxlen;
extern struct ifnet *loif; /* first loopback interface */
extern int if_index;
int if_addgroup(struct ifnet *, const char *);
int if_delgroup(struct ifnet *, const char *);
int if_addmulti(struct ifnet *, struct sockaddr *, struct ifmultiaddr **);
int if_allmulti(struct ifnet *, int);
struct ifnet* if_alloc(u_char);
void if_attach(struct ifnet *);
int if_delmulti(struct ifnet *, struct sockaddr *);
void if_delmulti_ifma(struct ifmultiaddr *);
void if_detach(struct ifnet *);
void if_purgeaddrs(struct ifnet *);
void if_purgemaddrs(struct ifnet *);
void if_down(struct ifnet *);
struct ifmultiaddr *
if_findmulti(struct ifnet *, struct sockaddr *);
void if_free(struct ifnet *);
void if_free_type(struct ifnet *, u_char);
void if_initname(struct ifnet *, const char *, int);
void if_link_state_change(struct ifnet *, int);
int if_printf(struct ifnet *, const char *, ...) __printflike(2, 3);
int if_setlladdr(struct ifnet *, const u_char *, int);
void if_up(struct ifnet *);
/*void ifinit(void);*/ /* declared in systm.h for main() */
int ifioctl(struct socket *, u_long, caddr_t, struct thread *);
int ifpromisc(struct ifnet *, int);
struct ifnet *ifunit(const char *);
struct ifaddr *ifa_ifwithaddr(struct sockaddr *);
struct ifaddr *ifa_ifwithbroadaddr(struct sockaddr *);
struct ifaddr *ifa_ifwithdstaddr(struct sockaddr *);
struct ifaddr *ifa_ifwithnet(struct sockaddr *);
struct ifaddr *ifa_ifwithroute(int, struct sockaddr *, struct sockaddr *);
struct ifaddr *ifa_ifwithroute_fib(int, struct sockaddr *, struct sockaddr *, u_int);
struct ifaddr *ifaof_ifpforaddr(struct sockaddr *, struct ifnet *);
int if_simloop(struct ifnet *ifp, struct mbuf *m, int af, int hlen);
typedef void *if_com_alloc_t(u_char type, struct ifnet *ifp);
typedef void if_com_free_t(void *com, u_char type);
void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f);
void if_deregister_com_alloc(u_char type);
#define IF_LLADDR(ifp) \
LLADDR((struct sockaddr_dl *)((ifp)->if_addr->ifa_addr))
#ifdef DEVICE_POLLING
enum poll_cmd { POLL_ONLY, POLL_AND_CHECK_STATUS };
typedef void poll_handler_t(struct ifnet *ifp, enum poll_cmd cmd, int count);
int ether_poll_register(poll_handler_t *h, struct ifnet *ifp);
int ether_poll_deregister(struct ifnet *ifp);
#endif /* DEVICE_POLLING */
+#include <net/vnet.h>
+
#endif /* _KERNEL */
#endif /* !_NET_IF_VAR_H_ */
Index: head/sys/net/if_vlan.c
===================================================================
--- head/sys/net/if_vlan.c (revision 183549)
+++ head/sys/net/if_vlan.c (revision 183550)
@@ -1,1422 +1,1426 @@
/*-
* Copyright 1998 Massachusetts Institute of Technology
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby
* granted, provided that both the above copyright notice and this
* permission notice appear in all copies, that both the above
* copyright notice and this permission notice appear in all
* supporting documentation, and that the name of M.I.T. not be used
* in advertising or publicity pertaining to distribution of the
* software without specific, written prior permission. M.I.T. makes
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied
* warranty.
*
* THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
* ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
* SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs.
* Might be extended some day to also handle IEEE 802.1p priority
* tagging. This is sort of sneaky in the implementation, since
* we need to pretend to be enough of an Ethernet implementation
* to make arp work. The way we do this is by telling everyone
* that we are an Ethernet, and then catch the packets that
* ether_output() left on our output queue when it calls
* if_start(), rewrite them for use by the real outgoing interface,
* and ask it to send them.
*/
#include "opt_vlan.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#define VLANNAME "vlan"
#define VLAN_DEF_HWIDTH 4
#define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST)
#define UP_AND_RUNNING(ifp) \
((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING)
LIST_HEAD(ifvlanhead, ifvlan);
struct ifvlantrunk {
struct ifnet *parent; /* parent interface of this trunk */
struct rwlock rw;
#ifdef VLAN_ARRAY
#define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1)
struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */
#else
struct ifvlanhead *hash; /* dynamic hash-list table */
uint16_t hmask;
uint16_t hwidth;
#endif
int refcnt;
};
struct vlan_mc_entry {
struct ether_addr mc_addr;
SLIST_ENTRY(vlan_mc_entry) mc_entries;
};
struct ifvlan {
struct ifvlantrunk *ifv_trunk;
struct ifnet *ifv_ifp;
#define TRUNK(ifv) ((ifv)->ifv_trunk)
#define PARENT(ifv) ((ifv)->ifv_trunk->parent)
int ifv_pflags; /* special flags we have set on parent */
struct ifv_linkmib {
int ifvm_encaplen; /* encapsulation length */
int ifvm_mtufudge; /* MTU fudged by this much */
int ifvm_mintu; /* min transmission unit */
uint16_t ifvm_proto; /* encapsulation ethertype */
uint16_t ifvm_tag; /* tag to apply on packets leaving if */
} ifv_mib;
SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead;
#ifndef VLAN_ARRAY
LIST_ENTRY(ifvlan) ifv_list;
#endif
};
#define ifv_proto ifv_mib.ifvm_proto
#define ifv_tag ifv_mib.ifvm_tag
#define ifv_encaplen ifv_mib.ifvm_encaplen
#define ifv_mtufudge ifv_mib.ifvm_mtufudge
#define ifv_mintu ifv_mib.ifvm_mintu
/* Special flags we should propagate to parent. */
static struct {
int flag;
int (*func)(struct ifnet *, int);
} vlan_pflags[] = {
{IFF_PROMISC, ifpromisc},
{IFF_ALLMULTI, if_allmulti},
{0, NULL}
};
SYSCTL_DECL(_net_link);
SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN");
SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency");
static int soft_pad = 0;
SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW, &soft_pad, 0,
"pad short frames before tagging");
static MALLOC_DEFINE(M_VLAN, VLANNAME, "802.1Q Virtual LAN Interface");
static eventhandler_tag ifdetach_tag;
/*
* We have a global mutex that is used to serialize configuration
* changes and isn't used in normal packet delivery.
*
* We also have a per-trunk rwlock, which is locked shared on packet
* processing and exclusive when configuration is changed.
*
* The VLAN_ARRAY option substitutes a static array with 4096 entries
* for the dynamic hash. In theory this can give a boost in processing,
* but in practice it does not, probably because the array
* is too big to fit into the CPU cache.
*/
static struct mtx ifv_mtx;
#define VLAN_LOCK_INIT() mtx_init(&ifv_mtx, "vlan_global", NULL, MTX_DEF)
#define VLAN_LOCK_DESTROY() mtx_destroy(&ifv_mtx)
#define VLAN_LOCK_ASSERT() mtx_assert(&ifv_mtx, MA_OWNED)
#define VLAN_LOCK() mtx_lock(&ifv_mtx)
#define VLAN_UNLOCK() mtx_unlock(&ifv_mtx)
#define TRUNK_LOCK_INIT(trunk) rw_init(&(trunk)->rw, VLANNAME)
#define TRUNK_LOCK_DESTROY(trunk) rw_destroy(&(trunk)->rw)
#define TRUNK_LOCK(trunk) rw_wlock(&(trunk)->rw)
#define TRUNK_UNLOCK(trunk) rw_wunlock(&(trunk)->rw)
#define TRUNK_LOCK_ASSERT(trunk) rw_assert(&(trunk)->rw, RA_WLOCKED)
#define TRUNK_RLOCK(trunk) rw_rlock(&(trunk)->rw)
#define TRUNK_RUNLOCK(trunk) rw_runlock(&(trunk)->rw)
#define TRUNK_LOCK_RASSERT(trunk) rw_assert(&(trunk)->rw, RA_RLOCKED)
#ifndef VLAN_ARRAY
static void vlan_inithash(struct ifvlantrunk *trunk);
static void vlan_freehash(struct ifvlantrunk *trunk);
static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv);
static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch);
static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk,
uint16_t tag);
#endif
static void trunk_destroy(struct ifvlantrunk *trunk);
static void vlan_start(struct ifnet *ifp);
static void vlan_init(void *foo);
static void vlan_input(struct ifnet *ifp, struct mbuf *m);
static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
static int vlan_setflag(struct ifnet *ifp, int flag, int status,
int (*func)(struct ifnet *, int));
static int vlan_setflags(struct ifnet *ifp, int status);
static int vlan_setmulti(struct ifnet *ifp);
static int vlan_unconfig(struct ifnet *ifp);
static int vlan_unconfig_locked(struct ifnet *ifp);
static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag);
static void vlan_link_state(struct ifnet *ifp, int link);
static void vlan_capabilities(struct ifvlan *ifv);
static void vlan_trunk_capabilities(struct ifnet *ifp);
static struct ifnet *vlan_clone_match_ethertag(struct if_clone *,
const char *, int *);
static int vlan_clone_match(struct if_clone *, const char *);
static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t);
static int vlan_clone_destroy(struct if_clone *, struct ifnet *);
static void vlan_ifdetach(void *arg, struct ifnet *ifp);
static struct if_clone vlan_cloner = IFC_CLONE_INITIALIZER(VLANNAME, NULL,
IF_MAXUNIT, NULL, vlan_clone_match, vlan_clone_create, vlan_clone_destroy);
#ifndef VLAN_ARRAY
#define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m))
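/*
 * Editor's illustrative note: HASH() folds all three nibbles of the
 * 12-bit tag; e.g. with the default hmask of 0xf, tag 0x123 hashes to
 * ((0x1 ^ 0x12 ^ 0x123) & 0xf) == 0.
 */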
static void
vlan_inithash(struct ifvlantrunk *trunk)
{
int i, n;
/*
* The trunk must not be locked here since we call malloc(M_WAITOK).
* That is OK as long as this function is called before the trunk
* struct gets hooked up and becomes visible to other threads.
*/
KASSERT(trunk->hwidth == 0 && trunk->hash == NULL,
("%s: hash already initialized", __func__));
trunk->hwidth = VLAN_DEF_HWIDTH;
n = 1 << trunk->hwidth;
trunk->hmask = n - 1;
trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK);
for (i = 0; i < n; i++)
LIST_INIT(&trunk->hash[i]);
}
static void
vlan_freehash(struct ifvlantrunk *trunk)
{
#ifdef INVARIANTS
int i;
KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
for (i = 0; i < (1 << trunk->hwidth); i++)
KASSERT(LIST_EMPTY(&trunk->hash[i]),
("%s: hash table not empty", __func__));
#endif
free(trunk->hash, M_VLAN);
trunk->hash = NULL;
trunk->hwidth = trunk->hmask = 0;
}
static int
vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
{
int i, b;
struct ifvlan *ifv2;
TRUNK_LOCK_ASSERT(trunk);
KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
b = 1 << trunk->hwidth;
i = HASH(ifv->ifv_tag, trunk->hmask);
LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
if (ifv->ifv_tag == ifv2->ifv_tag)
return (EEXIST);
/*
* Grow the hash when the number of vlans exceeds half of the number of
* hash buckets squared, i.e. when the average linked-list length
* exceeds buckets/2.
*/
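/*
 * Editor's illustrative note: with the default hwidth of 4 there are
 * b == 16 buckets, so the first grow happens past 128 vlans, i.e. at
 * an average chain length of 8 == buckets/2.
 */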
if (trunk->refcnt > (b * b) / 2) {
vlan_growhash(trunk, 1);
i = HASH(ifv->ifv_tag, trunk->hmask);
}
LIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list);
trunk->refcnt++;
return (0);
}
static int
vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv)
{
int i, b;
struct ifvlan *ifv2;
TRUNK_LOCK_ASSERT(trunk);
KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
b = 1 << trunk->hwidth;
i = HASH(ifv->ifv_tag, trunk->hmask);
LIST_FOREACH(ifv2, &trunk->hash[i], ifv_list)
if (ifv2 == ifv) {
trunk->refcnt--;
LIST_REMOVE(ifv2, ifv_list);
if (trunk->refcnt < (b * b) / 2)
vlan_growhash(trunk, -1);
return (0);
}
panic("%s: vlan not found\n", __func__);
return (ENOENT); /*NOTREACHED*/
}
/*
* Grow the hash larger or smaller if memory permits.
*/
static void
vlan_growhash(struct ifvlantrunk *trunk, int howmuch)
{
struct ifvlan *ifv;
struct ifvlanhead *hash2;
int hwidth2, i, j, n, n2;
TRUNK_LOCK_ASSERT(trunk);
KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__));
if (howmuch == 0) {
/* Harmless yet obvious coding error */
printf("%s: howmuch is 0\n", __func__);
return;
}
hwidth2 = trunk->hwidth + howmuch;
n = 1 << trunk->hwidth;
n2 = 1 << hwidth2;
/* Do not shrink the table below the default */
if (hwidth2 < VLAN_DEF_HWIDTH)
return;
/* M_NOWAIT because we're called with trunk mutex held */
hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_NOWAIT);
if (hash2 == NULL) {
printf("%s: out of memory -- hash size not changed\n",
__func__);
return; /* We can live with the old hash table */
}
for (j = 0; j < n2; j++)
LIST_INIT(&hash2[j]);
for (i = 0; i < n; i++)
while ((ifv = LIST_FIRST(&trunk->hash[i])) != NULL) {
LIST_REMOVE(ifv, ifv_list);
j = HASH(ifv->ifv_tag, n2 - 1);
LIST_INSERT_HEAD(&hash2[j], ifv, ifv_list);
}
free(trunk->hash, M_VLAN);
trunk->hash = hash2;
trunk->hwidth = hwidth2;
trunk->hmask = n2 - 1;
if (bootverbose)
if_printf(trunk->parent,
"VLAN hash table resized from %d to %d buckets\n", n, n2);
}
static __inline struct ifvlan *
vlan_gethash(struct ifvlantrunk *trunk, uint16_t tag)
{
struct ifvlan *ifv;
TRUNK_LOCK_RASSERT(trunk);
LIST_FOREACH(ifv, &trunk->hash[HASH(tag, trunk->hmask)], ifv_list)
if (ifv->ifv_tag == tag)
return (ifv);
return (NULL);
}
#if 0
/* Debugging code to view the hashtables. */
static void
vlan_dumphash(struct ifvlantrunk *trunk)
{
int i;
struct ifvlan *ifv;
for (i = 0; i < (1 << trunk->hwidth); i++) {
printf("%d: ", i);
LIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
printf("%s ", ifv->ifv_ifp->if_xname);
printf("\n");
}
}
#endif /* 0 */
#endif /* !VLAN_ARRAY */
static void
trunk_destroy(struct ifvlantrunk *trunk)
{
VLAN_LOCK_ASSERT();
TRUNK_LOCK(trunk);
#ifndef VLAN_ARRAY
vlan_freehash(trunk);
#endif
trunk->parent->if_vlantrunk = NULL;
TRUNK_UNLOCK(trunk);
TRUNK_LOCK_DESTROY(trunk);
free(trunk, M_VLAN);
}
/*
* Program our multicast filter. What we're actually doing is
* programming the multicast filter of the parent. This has the
* side effect of causing the parent interface to receive multicast
* traffic that it doesn't really want, which ends up being discarded
* later by the upper protocol layers. Unfortunately, there's no way
* to avoid this: there really is only one physical interface.
*
* XXX: There is a possible race here if more than one thread is
* modifying the multicast state of the vlan interface at the same time.
*/
static int
vlan_setmulti(struct ifnet *ifp)
{
struct ifnet *ifp_p;
struct ifmultiaddr *ifma, *rifma = NULL;
struct ifvlan *sc;
struct vlan_mc_entry *mc;
struct sockaddr_dl sdl;
int error;
/*VLAN_LOCK_ASSERT();*/
/* Find the parent. */
sc = ifp->if_softc;
ifp_p = PARENT(sc);
+ CURVNET_SET_QUIET(ifp_p->if_vnet);
+
bzero((char *)&sdl, sizeof(sdl));
sdl.sdl_len = sizeof(sdl);
sdl.sdl_family = AF_LINK;
sdl.sdl_index = ifp_p->if_index;
sdl.sdl_type = IFT_ETHER;
sdl.sdl_alen = ETHER_ADDR_LEN;
/* First, remove any existing filter entries. */
while ((mc = SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) {
bcopy((char *)&mc->mc_addr, LLADDR(&sdl), ETHER_ADDR_LEN);
error = if_delmulti(ifp_p, (struct sockaddr *)&sdl);
if (error)
return (error);
SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries);
free(mc, M_VLAN);
}
/* Now program new ones. */
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_LINK)
continue;
mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT);
if (mc == NULL)
return (ENOMEM);
bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
(char *)&mc->mc_addr, ETHER_ADDR_LEN);
SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries);
bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
LLADDR(&sdl), ETHER_ADDR_LEN);
error = if_addmulti(ifp_p, (struct sockaddr *)&sdl, &rifma);
if (error)
return (error);
}
+ CURVNET_RESTORE();
return (0);
}
/*
* A handler for network interface departure events.
* Track departure of trunks here so that we don't access stale
* pointers if a trunk is ripped out from under us, e.g.,
* by ejecting its hot-plug card.
*/
static void
vlan_ifdetach(void *arg __unused, struct ifnet *ifp)
{
struct ifvlan *ifv;
int i;
/*
* Check if it's a trunk interface first of all
* to avoid needless locking.
*/
if (ifp->if_vlantrunk == NULL)
return;
VLAN_LOCK();
/*
* OK, it's a trunk. Loop over and detach all vlans on it.
* Check the trunk pointer after each vlan_unconfig(), as detaching
* the last vlan frees the trunk and sets the pointer to NULL.
*/
#ifdef VLAN_ARRAY
for (i = 0; i < VLAN_ARRAY_SIZE; i++)
if ((ifv = ifp->if_vlantrunk->vlans[i])) {
vlan_unconfig_locked(ifv->ifv_ifp);
if (ifp->if_vlantrunk == NULL)
break;
}
#else /* VLAN_ARRAY */
restart:
for (i = 0; i < (1 << ifp->if_vlantrunk->hwidth); i++)
if ((ifv = LIST_FIRST(&ifp->if_vlantrunk->hash[i]))) {
vlan_unconfig_locked(ifv->ifv_ifp);
if (ifp->if_vlantrunk)
goto restart; /* trunk->hwidth can change */
else
break;
}
#endif /* VLAN_ARRAY */
/* Trunk should have been destroyed in vlan_unconfig(). */
KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__));
VLAN_UNLOCK();
}
/*
* VLAN support can be loaded as a module. The only place in the
* system that's intimately aware of this is ether_input. We hook
* into this code through vlan_input_p which is defined there and
* set here. No one else in the system should be aware of this, so
* we use an explicit reference here.
*/
extern void (*vlan_input_p)(struct ifnet *, struct mbuf *);
/* For if_link_state_change() eyes only... */
extern void (*vlan_link_state_p)(struct ifnet *, int);
static int
vlan_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
if (ifdetach_tag == NULL)
return (ENOMEM);
VLAN_LOCK_INIT();
vlan_input_p = vlan_input;
vlan_link_state_p = vlan_link_state;
vlan_trunk_cap_p = vlan_trunk_capabilities;
if_clone_attach(&vlan_cloner);
if (bootverbose)
printf("vlan: initialized, using "
#ifdef VLAN_ARRAY
"full-size arrays"
#else
"hash tables with chaining"
#endif
"\n");
break;
case MOD_UNLOAD:
if_clone_detach(&vlan_cloner);
EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag);
vlan_input_p = NULL;
vlan_link_state_p = NULL;
vlan_trunk_cap_p = NULL;
VLAN_LOCK_DESTROY();
if (bootverbose)
printf("vlan: unloaded\n");
break;
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t vlan_mod = {
"if_vlan",
vlan_modevent,
0
};
DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_vlan, 3);
MODULE_DEPEND(if_vlan, miibus, 1, 1, 1);
static struct ifnet *
vlan_clone_match_ethertag(struct if_clone *ifc, const char *name, int *tag)
{
+ INIT_VNET_NET(curvnet);
const char *cp;
struct ifnet *ifp;
int t = 0;
/* Check for <etherif>.<vlan> style interface names. */
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (ifp->if_type != IFT_ETHER)
continue;
if (strncmp(ifp->if_xname, name, strlen(ifp->if_xname)) != 0)
continue;
cp = name + strlen(ifp->if_xname);
if (*cp != '.')
continue;
for(; *cp != '\0'; cp++) {
if (*cp < '0' || *cp > '9')
continue;
t = (t * 10) + (*cp - '0');
}
if (tag != NULL)
*tag = t;
break;
}
IFNET_RUNLOCK();
return (ifp);
}
static int
vlan_clone_match(struct if_clone *ifc, const char *name)
{
const char *cp;
if (vlan_clone_match_ethertag(ifc, name, NULL) != NULL)
return (1);
if (strncmp(VLANNAME, name, strlen(VLANNAME)) != 0)
return (0);
for (cp = name + 4; *cp != '\0'; cp++) {
if (*cp < '0' || *cp > '9')
return (0);
}
return (1);
}
static int
vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
{
char *dp;
int wildcard;
int unit;
int error;
int tag;
int ethertag;
struct ifvlan *ifv;
struct ifnet *ifp;
struct ifnet *p;
struct vlanreq vlr;
static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */
/*
* There are 3 (ugh) ways to specify the cloned device:
* o pass a parameter block with the clone request.
* o specify parameters in the text of the clone device name
* o specify no parameters and get an unattached device that
* must be configured separately.
* The first technique is preferred; the latter two are
* supported for backwards compatibility.
*/
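/*
 * Editor's illustrative note: e.g. "em0.5" (assuming an em(4) parent)
 * takes the second path and is created already attached with tag 5,
 * while a bare "vlan5" takes the third and must be configured with a
 * parent and tag afterwards.
 */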
if (params) {
error = copyin(params, &vlr, sizeof(vlr));
if (error)
return error;
p = ifunit(vlr.vlr_parent);
if (p == NULL)
return ENXIO;
/*
* Don't let the caller set up a VLAN tag with
* anything except VLID bits.
*/
if (vlr.vlr_tag & ~EVL_VLID_MASK)
return (EINVAL);
error = ifc_name2unit(name, &unit);
if (error != 0)
return (error);
ethertag = 1;
tag = vlr.vlr_tag;
wildcard = (unit < 0);
} else if ((p = vlan_clone_match_ethertag(ifc, name, &tag)) != NULL) {
ethertag = 1;
unit = -1;
wildcard = 0;
/*
* Don't let the caller set up a VLAN tag with
* anything except VLID bits.
*/
if (tag & ~EVL_VLID_MASK)
return (EINVAL);
} else {
ethertag = 0;
error = ifc_name2unit(name, &unit);
if (error != 0)
return (error);
wildcard = (unit < 0);
}
error = ifc_alloc_unit(ifc, &unit);
if (error != 0)
return (error);
/* In the wildcard case, we need to update the name. */
if (wildcard) {
for (dp = name; *dp != '\0'; dp++);
if (snprintf(dp, len - (dp-name), "%d", unit) >
len - (dp-name) - 1) {
panic("%s: interface name too long", __func__);
}
}
ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO);
ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
ifc_free_unit(ifc, unit);
free(ifv, M_VLAN);
return (ENOSPC);
}
SLIST_INIT(&ifv->vlan_mc_listhead);
ifp->if_softc = ifv;
/*
* Set the name manually rather than using if_initname because
* we don't conform to the default naming convention for interfaces.
*/
strlcpy(ifp->if_xname, name, IFNAMSIZ);
ifp->if_dname = ifc->ifc_name;
ifp->if_dunit = unit;
/* NB: flags are not set here */
ifp->if_linkmib = &ifv->ifv_mib;
ifp->if_linkmiblen = sizeof(ifv->ifv_mib);
/* NB: mtu is not set here */
ifp->if_init = vlan_init;
ifp->if_start = vlan_start;
ifp->if_ioctl = vlan_ioctl;
ifp->if_snd.ifq_maxlen = ifqmaxlen;
ifp->if_flags = VLAN_IFFLAGS;
ether_ifattach(ifp, eaddr);
/* Now undo some of the damage... */
ifp->if_baudrate = 0;
ifp->if_type = IFT_L2VLAN;
ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN;
if (ethertag) {
error = vlan_config(ifv, p, tag);
if (error != 0) {
/*
* Since we've partially failed, we need to back
* out all the way, otherwise userland could get
* confused. Thus, we destroy the interface.
*/
ether_ifdetach(ifp);
vlan_unconfig(ifp);
if_free_type(ifp, IFT_ETHER);
free(ifv, M_VLAN);
return (error);
}
/* Update flags on the parent, if necessary. */
vlan_setflags(ifp, 1);
}
return (0);
}
static int
vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
{
struct ifvlan *ifv = ifp->if_softc;
int unit = ifp->if_dunit;
ether_ifdetach(ifp); /* first, remove it from system-wide lists */
vlan_unconfig(ifp); /* now it can be unconfigured and freed */
if_free_type(ifp, IFT_ETHER);
free(ifv, M_VLAN);
ifc_free_unit(ifc, unit);
return (0);
}
/*
* The ifp->if_init entry point for vlan(4) is a no-op.
*/
static void
vlan_init(void *foo __unused)
{
}
/*
* The if_start method for the vlan(4) interface. It doesn't
* raise the IFF_DRV_OACTIVE flag, since it is called
* only from the IFQ_HANDOFF() macro in ether_output_frame().
* If the interface queue were full and vlan_start() were
* not called, the queue would never get emptied and the
* interface would stall forever.
*/
static void
vlan_start(struct ifnet *ifp)
{
struct ifvlan *ifv;
struct ifnet *p;
struct mbuf *m;
int error;
ifv = ifp->if_softc;
p = PARENT(ifv);
for (;;) {
IF_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
break;
BPF_MTAP(ifp, m);
/*
* Do not run the parent's if_start() if the parent is not up,
* or the parent's driver will crash the system.
*/
if (!UP_AND_RUNNING(p)) {
m_freem(m);
ifp->if_collisions++;
continue;
}
/*
* Pad the frame to the minimum size allowed if told to.
* This option is in accord with IEEE Std 802.1Q, 2003 Ed.,
* paragraph C.4.4.3.b. It can help to work around buggy
* bridges that violate paragraph C.4.4.3.a from the same
* document, i.e., fail to pad short frames after untagging.
* E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but
* untagging it will produce a 62-byte frame, which is a runt
* and requires padding. There are VLAN-enabled network
* devices that simply discard such runts, or mishandle
* them somehow.
*/
if (soft_pad) {
static char pad[8]; /* just zeros */
int n;
for (n = ETHERMIN + ETHER_HDR_LEN - m->m_pkthdr.len;
n > 0; n -= sizeof(pad))
if (!m_append(m, min(n, sizeof(pad)), pad))
break;
if (n > 0) {
if_printf(ifp, "cannot pad short frame\n");
ifp->if_oerrors++;
m_freem(m);
continue;
}
}
/*
* If underlying interface can do VLAN tag insertion itself,
* just pass the packet along. However, we need some way to
* tell the interface where the packet came from so that it
* knows how to find the VLAN tag to use, so we attach a
* packet tag that holds it.
*/
if (p->if_capenable & IFCAP_VLAN_HWTAGGING) {
m->m_pkthdr.ether_vtag = ifv->ifv_tag;
m->m_flags |= M_VLANTAG;
} else {
m = ether_vlanencap(m, ifv->ifv_tag);
if (m == NULL) {
if_printf(ifp,
"unable to prepend VLAN header\n");
ifp->if_oerrors++;
continue;
}
}
/*
* Send it, precisely as ether_output() would have.
* We are already running at splimp.
*/
IFQ_HANDOFF(p, m, error);
if (!error)
ifp->if_opackets++;
else
ifp->if_oerrors++;
}
}
static void
vlan_input(struct ifnet *ifp, struct mbuf *m)
{
struct ifvlantrunk *trunk = ifp->if_vlantrunk;
struct ifvlan *ifv;
uint16_t tag;
KASSERT(trunk != NULL, ("%s: no trunk", __func__));
if (m->m_flags & M_VLANTAG) {
/*
* Packet is tagged, but m contains a normal
* Ethernet frame; the tag is stored out-of-band.
*/
tag = EVL_VLANOFTAG(m->m_pkthdr.ether_vtag);
m->m_flags &= ~M_VLANTAG;
} else {
struct ether_vlan_header *evl;
/*
* Packet is tagged in-band as specified by 802.1q.
*/
switch (ifp->if_type) {
case IFT_ETHER:
if (m->m_len < sizeof(*evl) &&
(m = m_pullup(m, sizeof(*evl))) == NULL) {
if_printf(ifp, "cannot pullup VLAN header\n");
return;
}
evl = mtod(m, struct ether_vlan_header *);
tag = EVL_VLANOFTAG(ntohs(evl->evl_tag));
/*
* Remove the 802.1q header by copying the Ethernet
* addresses over it and adjusting the beginning of
* the data in the mbuf. The encapsulated Ethernet
* type field is already in place.
*/
bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
ETHER_HDR_LEN - ETHER_TYPE_LEN);
m_adj(m, ETHER_VLAN_ENCAP_LEN);
break;
default:
#ifdef INVARIANTS
panic("%s: %s has unsupported if_type %u",
__func__, ifp->if_xname, ifp->if_type);
#endif
m_freem(m);
ifp->if_noproto++;
return;
}
}
TRUNK_RLOCK(trunk);
#ifdef VLAN_ARRAY
ifv = trunk->vlans[tag];
#else
ifv = vlan_gethash(trunk, tag);
#endif
if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) {
TRUNK_RUNLOCK(trunk);
m_freem(m);
ifp->if_noproto++;
return;
}
TRUNK_RUNLOCK(trunk);
m->m_pkthdr.rcvif = ifv->ifv_ifp;
ifv->ifv_ifp->if_ipackets++;
/* Pass it back through the parent's input routine. */
(*ifp->if_input)(ifv->ifv_ifp, m);
}
static int
vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag)
{
struct ifvlantrunk *trunk;
struct ifnet *ifp;
int error = 0;
/* VID numbers 0x0 and 0xFFF are reserved */
if (tag == 0 || tag == 0xFFF)
return (EINVAL);
if (p->if_type != IFT_ETHER)
return (EPROTONOSUPPORT);
if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS)
return (EPROTONOSUPPORT);
if (ifv->ifv_trunk)
return (EBUSY);
if (p->if_vlantrunk == NULL) {
trunk = malloc(sizeof(struct ifvlantrunk),
M_VLAN, M_WAITOK | M_ZERO);
#ifndef VLAN_ARRAY
vlan_inithash(trunk);
#endif
VLAN_LOCK();
if (p->if_vlantrunk != NULL) {
/* A race that is very unlikely to be hit. */
#ifndef VLAN_ARRAY
vlan_freehash(trunk);
#endif
free(trunk, M_VLAN);
goto exists;
}
TRUNK_LOCK_INIT(trunk);
TRUNK_LOCK(trunk);
p->if_vlantrunk = trunk;
trunk->parent = p;
} else {
VLAN_LOCK();
exists:
trunk = p->if_vlantrunk;
TRUNK_LOCK(trunk);
}
ifv->ifv_tag = tag; /* must set this before vlan_inshash() */
#ifdef VLAN_ARRAY
if (trunk->vlans[tag] != NULL) {
error = EEXIST;
goto done;
}
trunk->vlans[tag] = ifv;
trunk->refcnt++;
#else
error = vlan_inshash(trunk, ifv);
if (error)
goto done;
#endif
ifv->ifv_proto = ETHERTYPE_VLAN;
ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN;
ifv->ifv_mintu = ETHERMIN;
ifv->ifv_pflags = 0;
/*
* If the parent supports the VLAN_MTU capability,
* i.e., it can Tx/Rx frames larger than ETHER_MAX_LEN,
* use it.
*/
if (p->if_capenable & IFCAP_VLAN_MTU) {
/*
* No need to fudge the MTU since the parent can
* handle extended frames.
*/
ifv->ifv_mtufudge = 0;
} else {
/*
* Fudge the MTU by the encapsulation size. This
* makes us incompatible with strictly compliant
* 802.1Q implementations, but allows us to use
* the feature with other NetBSD implementations,
* which might still be useful.
*/
ifv->ifv_mtufudge = ifv->ifv_encaplen;
}
ifv->ifv_trunk = trunk;
ifp = ifv->ifv_ifp;
ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge;
ifp->if_baudrate = p->if_baudrate;
/*
* Copy only a selected subset of flags from the parent.
* Other flags are none of our business.
*/
#define VLAN_COPY_FLAGS (IFF_SIMPLEX)
ifp->if_flags &= ~VLAN_COPY_FLAGS;
ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS;
#undef VLAN_COPY_FLAGS
ifp->if_link_state = p->if_link_state;
vlan_capabilities(ifv);
/*
* Set up our ``Ethernet address'' to reflect the underlying
* physical interface's.
*/
bcopy(IF_LLADDR(p), IF_LLADDR(ifp), ETHER_ADDR_LEN);
/*
* Configure multicast addresses that may already be
* joined on the vlan device.
*/
(void)vlan_setmulti(ifp); /* XXX: VLAN lock held */
/* We are ready for operation now. */
ifp->if_drv_flags |= IFF_DRV_RUNNING;
done:
TRUNK_UNLOCK(trunk);
if (error == 0)
EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_tag);
VLAN_UNLOCK();
return (error);
}
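The MTU fudge set up in vlan_config() is simple arithmetic: when the parent cannot carry oversize frames, the vlan interface gives up the four bytes of encapsulation itself. A worked sketch under assumed values:

#include <stdio.h>

#define ETHER_VLAN_ENCAP_LEN    4       /* 802.1Q header size */

int
main(void)
{
        int parent_mtu = 1500;
        int parent_has_vlan_mtu = 0;    /* IFCAP_VLAN_MTU advertised? */
        int fudge = parent_has_vlan_mtu ? 0 : ETHER_VLAN_ENCAP_LEN;

        printf("vlan mtu = %d\n", parent_mtu - fudge);  /* 1496 */
        return (0);
}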
static int
vlan_unconfig(struct ifnet *ifp)
{
int ret;
VLAN_LOCK();
ret = vlan_unconfig_locked(ifp);
VLAN_UNLOCK();
return (ret);
}
static int
vlan_unconfig_locked(struct ifnet *ifp)
{
struct ifvlantrunk *trunk;
struct vlan_mc_entry *mc;
struct ifvlan *ifv;
struct ifnet *parent;
int error;
VLAN_LOCK_ASSERT();
ifv = ifp->if_softc;
trunk = ifv->ifv_trunk;
parent = NULL;
if (trunk != NULL) {
struct sockaddr_dl sdl;
TRUNK_LOCK(trunk);
parent = trunk->parent;
/*
* Since the interface is being unconfigured, we need to
* empty the list of multicast groups that we may have joined
* while we were alive from the parent's list.
*/
bzero((char *)&sdl, sizeof(sdl));
sdl.sdl_len = sizeof(sdl);
sdl.sdl_family = AF_LINK;
sdl.sdl_index = parent->if_index;
sdl.sdl_type = IFT_ETHER;
sdl.sdl_alen = ETHER_ADDR_LEN;
while ((mc = SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) {
bcopy((char *)&mc->mc_addr, LLADDR(&sdl),
ETHER_ADDR_LEN);
error = if_delmulti(parent, (struct sockaddr *)&sdl);
if (error)
return (error);
SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries);
free(mc, M_VLAN);
}
vlan_setflags(ifp, 0); /* clear special flags on parent */
#ifdef VLAN_ARRAY
trunk->vlans[ifv->ifv_tag] = NULL;
trunk->refcnt--;
#else
vlan_remhash(trunk, ifv);
#endif
ifv->ifv_trunk = NULL;
/*
* Check if we were the last.
*/
if (trunk->refcnt == 0) {
trunk->parent->if_vlantrunk = NULL;
/*
* XXXGL: If some ithread has already entered
* vlan_input() and is now blocked on the trunk
* lock, then it should preempt us right after
* unlock and finish its work. Then we will acquire
* lock again in trunk_destroy().
*/
TRUNK_UNLOCK(trunk);
trunk_destroy(trunk);
} else
TRUNK_UNLOCK(trunk);
}
/* Disconnect from parent. */
if (ifv->ifv_pflags)
if_printf(ifp, "%s: ifv_pflags unclean\n", __func__);
ifp->if_mtu = ETHERMTU;
ifp->if_link_state = LINK_STATE_UNKNOWN;
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
/*
* Only dispatch an event if vlan was
* attached, otherwise there is nothing
* to cleanup anyway.
*/
if (parent != NULL)
EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_tag);
return (0);
}
/* Handle a reference counted flag that should be set on the parent as well */
static int
vlan_setflag(struct ifnet *ifp, int flag, int status,
int (*func)(struct ifnet *, int))
{
struct ifvlan *ifv;
int error;
/* XXX VLAN_LOCK_ASSERT(); */
ifv = ifp->if_softc;
status = status ? (ifp->if_flags & flag) : 0;
/* Now "status" contains the flag value or 0 */
/*
* See if recorded parent's status is different from what
* we want it to be. If it is, flip it. We record parent's
* status in ifv_pflags so that we won't clear parent's flag
* we haven't set. In fact, we don't clear or set parent's
* flags directly, but get or release references to them.
* That's why we can be sure that recorded flags still are
* in accord with actual parent's flags.
*/
if (status != (ifv->ifv_pflags & flag)) {
error = (*func)(PARENT(ifv), status);
if (error)
return (error);
ifv->ifv_pflags &= ~flag;
ifv->ifv_pflags |= status;
}
return (0);
}
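The ifv_pflags bookkeeping above guarantees that a vlan releases only the parent-flag references it actually took. A hedged userspace model of that invariant follows; the names are hypothetical, and the real code dispatches through the vlan_pflags[] function table rather than a fixed helper:

#include <stdio.h>

#define IFF_PROMISC     0x100

static int parent_refs; /* models if_promisc()'s reference count */

static int
parent_update(int flag, int on)
{
        (void)flag;
        parent_refs += on ? 1 : -1;
        return (0);
}

/* Mirrors vlan_setflag(): touch the parent only when state differs. */
static int
setflag(int *pflags, int if_flags, int flag, int status)
{
        int want = status ? (if_flags & flag) : 0;

        if (want != (*pflags & flag)) {
                parent_update(flag, want != 0);
                *pflags = (*pflags & ~flag) | want;
        }
        return (0);
}

int
main(void)
{
        int pflags = 0;

        setflag(&pflags, IFF_PROMISC, IFF_PROMISC, 1);  /* take a ref */
        setflag(&pflags, IFF_PROMISC, IFF_PROMISC, 1);  /* no-op */
        setflag(&pflags, 0, IFF_PROMISC, 0);            /* release it */
        printf("parent refs: %d\n", parent_refs);       /* 0 */
        return (0);
}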
/*
* Handle IFF_* flags that require certain changes on the parent:
* if "status" is true, update parent's flags respective to our if_flags;
* if "status" is false, forcedly clear the flags set on parent.
*/
static int
vlan_setflags(struct ifnet *ifp, int status)
{
int error, i;
for (i = 0; vlan_pflags[i].flag; i++) {
error = vlan_setflag(ifp, vlan_pflags[i].flag,
status, vlan_pflags[i].func);
if (error)
return (error);
}
return (0);
}
/* Inform all vlans that their parent has changed link state */
static void
vlan_link_state(struct ifnet *ifp, int link)
{
struct ifvlantrunk *trunk = ifp->if_vlantrunk;
struct ifvlan *ifv;
int i;
TRUNK_LOCK(trunk);
#ifdef VLAN_ARRAY
for (i = 0; i < VLAN_ARRAY_SIZE; i++)
if (trunk->vlans[i] != NULL) {
ifv = trunk->vlans[i];
#else
for (i = 0; i < (1 << trunk->hwidth); i++)
LIST_FOREACH(ifv, &trunk->hash[i], ifv_list) {
#endif
ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate;
if_link_state_change(ifv->ifv_ifp,
trunk->parent->if_link_state);
}
TRUNK_UNLOCK(trunk);
}
static void
vlan_capabilities(struct ifvlan *ifv)
{
struct ifnet *p = PARENT(ifv);
struct ifnet *ifp = ifv->ifv_ifp;
TRUNK_LOCK_ASSERT(TRUNK(ifv));
/*
* If the parent interface can do checksum offloading
* on VLANs, then propagate its hardware-assisted
* checksumming flags. Also assert that checksum
* offloading requires hardware VLAN tagging.
*/
if (p->if_capabilities & IFCAP_VLAN_HWCSUM)
ifp->if_capabilities = p->if_capabilities & IFCAP_HWCSUM;
if (p->if_capenable & IFCAP_VLAN_HWCSUM &&
p->if_capenable & IFCAP_VLAN_HWTAGGING) {
ifp->if_capenable = p->if_capenable & IFCAP_HWCSUM;
ifp->if_hwassist = p->if_hwassist;
} else {
ifp->if_capenable = 0;
ifp->if_hwassist = 0;
}
}
static void
vlan_trunk_capabilities(struct ifnet *ifp)
{
struct ifvlantrunk *trunk = ifp->if_vlantrunk;
struct ifvlan *ifv;
int i;
TRUNK_LOCK(trunk);
#ifdef VLAN_ARRAY
for (i = 0; i < VLAN_ARRAY_SIZE; i++)
if (trunk->vlans[i] != NULL) {
ifv = trunk->vlans[i];
#else
for (i = 0; i < (1 << trunk->hwidth); i++) {
LIST_FOREACH(ifv, &trunk->hash[i], ifv_list)
#endif
vlan_capabilities(ifv);
}
TRUNK_UNLOCK(trunk);
}
static int
vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct ifaddr *ifa;
struct ifnet *p;
struct ifreq *ifr;
struct ifvlan *ifv;
struct vlanreq vlr;
int error = 0;
ifr = (struct ifreq *)data;
ifa = (struct ifaddr *)data;
ifv = ifp->if_softc;
switch (cmd) {
case SIOCGIFMEDIA:
VLAN_LOCK();
if (TRUNK(ifv) != NULL) {
error = (*PARENT(ifv)->if_ioctl)(PARENT(ifv),
SIOCGIFMEDIA, data);
VLAN_UNLOCK();
/* Limit the result to the parent's current config. */
if (error == 0) {
struct ifmediareq *ifmr;
ifmr = (struct ifmediareq *)data;
if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) {
ifmr->ifm_count = 1;
error = copyout(&ifmr->ifm_current,
ifmr->ifm_ulist,
sizeof(int));
}
}
} else {
VLAN_UNLOCK();
error = EINVAL;
}
break;
case SIOCSIFMEDIA:
error = EINVAL;
break;
case SIOCSIFMTU:
/*
* Set the interface MTU.
*/
VLAN_LOCK();
if (TRUNK(ifv) != NULL) {
if (ifr->ifr_mtu >
(PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) ||
ifr->ifr_mtu <
(ifv->ifv_mintu - ifv->ifv_mtufudge))
error = EINVAL;
else
ifp->if_mtu = ifr->ifr_mtu;
} else
error = EINVAL;
VLAN_UNLOCK();
break;
case SIOCSETVLAN:
error = copyin(ifr->ifr_data, &vlr, sizeof(vlr));
if (error)
break;
if (vlr.vlr_parent[0] == '\0') {
vlan_unconfig(ifp);
break;
}
p = ifunit(vlr.vlr_parent);
if (p == NULL) {
error = ENOENT;
break;
}
/*
* Don't let the caller set up a VLAN tag with
* anything except VLID bits.
*/
if (vlr.vlr_tag & ~EVL_VLID_MASK) {
error = EINVAL;
break;
}
error = vlan_config(ifv, p, vlr.vlr_tag);
if (error)
break;
/* Update flags on the parent, if necessary. */
vlan_setflags(ifp, 1);
break;
case SIOCGETVLAN:
bzero(&vlr, sizeof(vlr));
VLAN_LOCK();
if (TRUNK(ifv) != NULL) {
strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname,
sizeof(vlr.vlr_parent));
vlr.vlr_tag = ifv->ifv_tag;
}
VLAN_UNLOCK();
error = copyout(&vlr, ifr->ifr_data, sizeof(vlr));
break;
case SIOCSIFFLAGS:
/*
* We should propagate selected flags to the parent,
* e.g., promiscuous mode.
*/
if (TRUNK(ifv) != NULL)
error = vlan_setflags(ifp, 1);
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
/*
* If we don't have a parent, just remember the membership for
* when we do.
*/
if (TRUNK(ifv) != NULL)
error = vlan_setmulti(ifp);
break;
default:
error = ether_ioctl(ifp, cmd, data);
}
return (error);
}
Index: head/sys/net/raw_cb.c
===================================================================
--- head/sys/net/raw_cb.c (revision 183549)
+++ head/sys/net/raw_cb.c (revision 183550)
@@ -1,116 +1,118 @@
/*-
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_cb.c 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
+#include <net/if.h>
#include <net/raw_cb.h>
/*
* Routines to manage the raw protocol control blocks.
*
* TODO:
* hash lookups by protocol family/protocol + address family
* take care of unique address problems per AF?
* redo address binding to allow wildcards
*/
struct mtx rawcb_mtx;
struct rawcb_list_head rawcb_list;
SYSCTL_NODE(_net, OID_AUTO, raw, CTLFLAG_RW, 0, "Raw socket infrastructure");
static u_long raw_sendspace = RAWSNDQ;
SYSCTL_ULONG(_net_raw, OID_AUTO, sendspace, CTLFLAG_RW, &raw_sendspace, 0,
"Default raw socket send space");
static u_long raw_recvspace = RAWRCVQ;
SYSCTL_ULONG(_net_raw, OID_AUTO, recvspace, CTLFLAG_RW, &raw_recvspace, 0,
"Default raw socket receive space");
/*
* Allocate a control block and a nominal amount of buffer space for the
* socket.
*/
int
raw_attach(struct socket *so, int proto)
{
+ INIT_VNET_NET(so->so_vnet);
struct rawcb *rp = sotorawcb(so);
int error;
/*
* It is assumed that raw_attach is called after space has been
* allocated for the rawcb; consumer protocols may simply allocate
* type struct rawcb, or a wrapper data structure that begins with a
* struct rawcb.
*/
KASSERT(rp != NULL, ("raw_attach: rp == NULL"));
error = soreserve(so, raw_sendspace, raw_recvspace);
if (error)
return (error);
rp->rcb_socket = so;
rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
rp->rcb_proto.sp_protocol = proto;
mtx_lock(&rawcb_mtx);
LIST_INSERT_HEAD(&V_rawcb_list, rp, list);
mtx_unlock(&rawcb_mtx);
return (0);
}
/*
* Detach the raw connection block and discard socket resources.
*/
void
raw_detach(struct rawcb *rp)
{
struct socket *so = rp->rcb_socket;
KASSERT(so->so_pcb == rp, ("raw_detach: so_pcb != rp"));
so->so_pcb = NULL;
mtx_lock(&rawcb_mtx);
LIST_REMOVE(rp, list);
mtx_unlock(&rawcb_mtx);
free((caddr_t)(rp), M_PCB);
}
Index: head/sys/net/raw_usrreq.c
===================================================================
--- head/sys/net/raw_usrreq.c (revision 183549)
+++ head/sys/net/raw_usrreq.c (revision 183550)
@@ -1,263 +1,266 @@
/*-
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/systm.h>
#include <sys/vimage.h>
+#include <net/if.h>
#include <net/raw_cb.h>
MTX_SYSINIT(rawcb_mtx, &rawcb_mtx, "rawcb", MTX_DEF);
/*
* Initialize raw connection block q.
*/
void
raw_init(void)
{
+ INIT_VNET_NET(curvnet);
LIST_INIT(&V_rawcb_list);
}
/*
* Raw protocol input routine. Find the socket associated with the packet(s)
* and move them over. If nothing exists for this packet, drop it.
*/
/*
* Raw protocol interface.
*/
void
raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src)
{
+ INIT_VNET_NET(curvnet);
struct rawcb *rp;
struct mbuf *m = m0;
struct socket *last;
last = NULL;
mtx_lock(&rawcb_mtx);
LIST_FOREACH(rp, &V_rawcb_list, list) {
if (rp->rcb_proto.sp_family != proto->sp_family)
continue;
if (rp->rcb_proto.sp_protocol &&
rp->rcb_proto.sp_protocol != proto->sp_protocol)
continue;
if (last) {
struct mbuf *n;
n = m_copy(m, 0, (int)M_COPYALL);
if (n) {
if (sbappendaddr(&last->so_rcv, src,
n, (struct mbuf *)0) == 0)
/* should notify about lost packet */
m_freem(n);
else
sorwakeup(last);
}
}
last = rp->rcb_socket;
}
if (last) {
if (sbappendaddr(&last->so_rcv, src,
m, (struct mbuf *)0) == 0)
m_freem(m);
else
sorwakeup(last);
} else
m_freem(m);
mtx_unlock(&rawcb_mtx);
}
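The delivery loop in raw_input() uses a common copy-minimizing idiom: every matching socket except the last receives a copy, and the final one is handed the original mbuf. A sketch of the same control flow over plain buffers:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        char *msg = strdup("packet");
        int nsock = 3, last = -1;

        if (msg == NULL)
                return (1);
        for (int i = 0; i < nsock; i++) {
                if (last >= 0) {
                        char *copy = strdup(msg);       /* m_copy() analogue */

                        if (copy != NULL) {
                                printf("sock %d gets a copy\n", last);
                                free(copy);
                        }
                }
                last = i;       /* defer: may receive the original */
        }
        if (last >= 0)
                printf("sock %d gets the original\n", last);
        /* with no matching socket the original is dropped (m_freem()) */
        free(msg);
        return (0);
}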
/*ARGSUSED*/
void
raw_ctlinput(int cmd, struct sockaddr *arg, void *dummy)
{
if (cmd < 0 || cmd >= PRC_NCMDS)
return;
/* INCOMPLETE */
}
static void
raw_uabort(struct socket *so)
{
KASSERT(sotorawcb(so) != NULL, ("raw_uabort: rp == NULL"));
soisdisconnected(so);
}
static void
raw_uclose(struct socket *so)
{
KASSERT(sotorawcb(so) != NULL, ("raw_uabort: rp == NULL"));
soisdisconnected(so);
}
/* pru_accept is EOPNOTSUPP */
static int
raw_uattach(struct socket *so, int proto, struct thread *td)
{
int error;
/*
* Implementors of raw sockets will already have allocated the PCB,
* so it must be non-NULL here.
*/
KASSERT(sotorawcb(so) != NULL, ("raw_uattach: so_pcb == NULL"));
if (td != NULL) {
error = priv_check(td, PRIV_NET_RAW);
if (error)
return (error);
}
return (raw_attach(so, proto));
}
static int
raw_ubind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (EINVAL);
}
static int
raw_uconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (EINVAL);
}
/* pru_connect2 is EOPNOTSUPP */
/* pru_control is EOPNOTSUPP */
static void
raw_udetach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
KASSERT(rp != NULL, ("raw_udetach: rp == NULL"));
raw_detach(rp);
}
static int
raw_udisconnect(struct socket *so)
{
KASSERT(sotorawcb(so) != NULL, ("raw_udisconnect: rp == NULL"));
return (ENOTCONN);
}
/* pru_listen is EOPNOTSUPP */
static int
raw_upeeraddr(struct socket *so, struct sockaddr **nam)
{
KASSERT(sotorawcb(so) != NULL, ("raw_upeeraddr: rp == NULL"));
return (ENOTCONN);
}
/* pru_rcvd is EOPNOTSUPP */
/* pru_rcvoob is EOPNOTSUPP */
static int
raw_usend(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
KASSERT(sotorawcb(so) != NULL, ("raw_usend: rp == NULL"));
if ((flags & PRUS_OOB) || (control && control->m_len)) {
/* XXXRW: Should control also be freed here? */
if (m != NULL)
m_freem(m);
return (EOPNOTSUPP);
}
/*
* For historical (bad?) reasons, we effectively ignore the address
* argument to sendto(2). Perhaps we should return an error instead?
*/
return ((*so->so_proto->pr_output)(m, so));
}
/* pru_sense is null */
static int
raw_ushutdown(struct socket *so)
{
KASSERT(sotorawcb(so) != NULL, ("raw_ushutdown: rp == NULL"));
socantsendmore(so);
return (0);
}
static int
raw_usockaddr(struct socket *so, struct sockaddr **nam)
{
KASSERT(sotorawcb(so) != NULL, ("raw_usockaddr: rp == NULL"));
return (EINVAL);
}
struct pr_usrreqs raw_usrreqs = {
.pru_abort = raw_uabort,
.pru_attach = raw_uattach,
.pru_bind = raw_ubind,
.pru_connect = raw_uconnect,
.pru_detach = raw_udetach,
.pru_disconnect = raw_udisconnect,
.pru_peeraddr = raw_upeeraddr,
.pru_send = raw_usend,
.pru_shutdown = raw_ushutdown,
.pru_sockaddr = raw_usockaddr,
.pru_close = raw_uclose,
};
Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c (revision 183549)
+++ head/sys/net/route.c (revision 183550)
@@ -1,1782 +1,1789 @@
/*-
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3.1.1 (Berkeley) 2/23/95
* $FreeBSD$
*/
/************************************************************************
* Note: In this file a 'fib' is a "forwarding information base", *
* which is the new name for an in-kernel routing (next hop) table. *
***********************************************************************/
#include "opt_inet.h"
#include "opt_route.h"
#include "opt_mrouting.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <vm/uma.h>
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_INT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
* Allow the boot code to select fewer than RT_MAXFIBS tables to be used.
* We can't do more because storage is statically allocated for now
* (for compatibility reasons; this will change).
*/
TUNABLE_INT("net.fibs", &rt_numfibs);
/*
* By default add routes to all fibs for new interfaces.
* Once this is set to 0, routes are allocated on interface
* changes only for the caller's FIB when adding a new set of addresses
* to an interface. XXX this is a shotgun approach to a problem that needs
* a more fine-grained solution; that will come.
*/
u_int rt_add_addr_allfibs = 1;
SYSCTL_INT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
&rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
static struct rtstat rtstat;
/* by default only the first 'row' of tables will be accessed. */
/*
* XXXMRT When we fix netstat and do this differently,
* we can allocate this dynamically. As long as we are keeping
* things backwards compatible we need to allocate this
* statically.
*/
struct radix_node_head *rt_tables[RT_MAXFIBS][AF_MAX+1];
static int rttrash; /* routes not in table but not freed */
static void rt_maskedcopy(struct sockaddr *,
struct sockaddr *, struct sockaddr *);
/* compare two sockaddr structures */
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
/*
* Convert a 'struct radix_node *' to a 'struct rtentry *'.
* The operation can be done safely (in this code) because a
* 'struct rtentry' starts with two 'struct radix_node''s, the first
* one representing leaf nodes in the routing tree, which is
* what the code in radix.c passes us as a 'struct radix_node'.
*
* But because there are a lot of assumptions in this conversion,
* do not cast explicitly, but always use the macro below.
*/
#define RNTORT(p) ((struct rtentry *)(p))
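The RNTORT() cast is safe only because of the layout rule the comment above describes. A standalone illustration of why pointing at the first embedded node recovers the enclosing entry (a sketch with toy types, not the real radix structures):

#include <assert.h>
#include <stdio.h>

struct node { struct node *left, *right; };

/* Like struct rtentry: the embedded nodes must come first. */
struct entry {
        struct node nodes[2];   /* nodes[0] is the leaf handed back */
        int payload;
};

#define NODE_TO_ENTRY(p)        ((struct entry *)(p))

int
main(void)
{
        struct entry e = { .payload = 42 };
        struct node *rn = &e.nodes[0];  /* what the radix code sees */

        assert(NODE_TO_ENTRY(rn) == &e);        /* the RNTORT() idiom */
        printf("payload %d\n", NODE_TO_ENTRY(rn)->payload);
        return (0);
}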
static uma_zone_t rtzone; /* Routing table UMA zone. */
#if 0
/* default fib for tunnels to use */
u_int tunnel_fib = 0;
SYSCTL_INT(_net, OID_AUTO, tunnelfib, CTLFLAG_RD, &tunnel_fib, 0, "");
#endif
/*
* handler for net.my_fibnum
*/
static int
sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
{
int fibnum;
int error;
fibnum = curthread->td_proc->p_fibnum;
error = sysctl_handle_int(oidp, &fibnum, 0, req);
return (error);
}
SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
static void
route_init(void)
{
int table;
struct domain *dom;
int fam;
/* whack the tunable ints into line. */
if (rt_numfibs > RT_MAXFIBS)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
rn_init(); /* initialize all zeroes, all ones, mask table */
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtattach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* for now only AF_INET has > 1 table */
/* XXX MRT
* rtattach will be also called
* from vfs_export.c but the
* offset will be 0
* (only for AF_INET and AF_INET6
* which don't need it anyhow)
*/
dom->dom_rtattach(
(void **)&V_rt_tables[table][fam],
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
#ifndef _SYS_SYSPROTO_H_
struct setfib_args {
int fibnum;
};
#endif
int
setfib(struct thread *td, struct setfib_args *uap)
{
if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
return EINVAL;
td->td_proc->p_fibnum = uap->fibnum;
return (0);
}
/*
* Packet routing routines.
*/
void
rtalloc(struct route *ro)
{
rtalloc_ign_fib(ro, 0UL, 0);
}
void
rtalloc_fib(struct route *ro, u_int fibnum)
{
rtalloc_ign_fib(ro, 0UL, fibnum);
}
void
rtalloc_ign(struct route *ro, u_long ignore)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
void
rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
/*
* Look up the route that matches the given address,
* or at least try to; create a cloned route if needed.
*
* The returned route, if any, is locked.
*/
struct rtentry *
rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
{
return (rtalloc1_fib(dst, report, ignflags, 0));
}
struct rtentry *
rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
u_int fibnum)
{
+ INIT_VNET_NET(curvnet);
struct radix_node_head *rnh;
struct rtentry *rt;
struct radix_node *rn;
struct rtentry *newrt;
struct rt_addrinfo info;
u_long nflags;
int err = 0, msgtype = RTM_MISS;
KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
rnh = V_rt_tables[fibnum][dst->sa_family];
newrt = NULL;
/*
* Look up the address in the table for that Address Family
*/
if (rnh == NULL) {
V_rtstat.rts_unreach++;
goto miss2;
}
RADIX_NODE_HEAD_LOCK(rnh);
if ((rn = rnh->rnh_matchaddr(dst, rnh)) &&
(rn->rn_flags & RNF_ROOT) == 0) {
/*
* If we find it and it's not the root node, then
* get a reference on the rtentry associated.
*/
newrt = rt = RNTORT(rn);
nflags = rt->rt_flags & ~ignflags;
if (report && (nflags & RTF_CLONING)) {
/*
* We are apparently adding (report = 0 in delete).
* If it requires that it be cloned, do so.
* (This implies it wasn't a HOST route.)
*/
err = rtrequest_fib(RTM_RESOLVE, dst, NULL,
NULL, 0, &newrt, fibnum);
if (err) {
/*
* If the cloning didn't succeed, maybe
* what we have will do. Return that.
*/
newrt = rt; /* existing route */
RT_LOCK(newrt);
RT_ADDREF(newrt);
goto miss;
}
KASSERT(newrt, ("no route and no error"));
RT_LOCK(newrt);
if (newrt->rt_flags & RTF_XRESOLVE) {
/*
* If the new route specifies it be
* externally resolved, then go do that.
*/
msgtype = RTM_RESOLVE;
goto miss;
}
/* Inform listeners of the new route. */
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(newrt);
info.rti_info[RTAX_NETMASK] = rt_mask(newrt);
info.rti_info[RTAX_GATEWAY] = newrt->rt_gateway;
if (newrt->rt_ifp != NULL) {
info.rti_info[RTAX_IFP] =
newrt->rt_ifp->if_addr->ifa_addr;
info.rti_info[RTAX_IFA] = newrt->rt_ifa->ifa_addr;
}
rt_missmsg(RTM_ADD, &info, newrt->rt_flags, 0);
} else {
RT_LOCK(newrt);
RT_ADDREF(newrt);
}
RADIX_NODE_HEAD_UNLOCK(rnh);
} else {
/*
* Either we hit the root or couldn't find any match,
* which basically means
* "can't get there from here".
*/
V_rtstat.rts_unreach++;
miss:
RADIX_NODE_HEAD_UNLOCK(rnh);
miss2: if (report) {
/*
* If required, report the failure to the supervising
* Authorities.
* For a delete, this is not an error. (report == 0)
*/
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
rt_missmsg(msgtype, &info, 0, err);
}
}
if (newrt)
RT_LOCK_ASSERT(newrt);
return (newrt);
}
/*
* Remove a reference count from an rtentry.
* If the count gets low enough, take it out of the routing table
*/
void
rtfree(struct rtentry *rt)
{
+ INIT_VNET_NET(curvnet);
struct radix_node_head *rnh;
KASSERT(rt != NULL,("%s: NULL rt", __func__));
rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family];
KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
RT_LOCK_ASSERT(rt);
/*
* The callers should use RTFREE_LOCKED() or RTFREE(), so
* we should come here exactly with the last reference.
*/
RT_REMREF(rt);
if (rt->rt_refcnt > 0) {
printf("%s: %p has %lu refs\n", __func__, rt, rt->rt_refcnt);
goto done;
}
/*
* On last reference give the "close method" a chance
* to cleanup private state. This also permits (for
* IPv4 and IPv6) a chance to decide if the routing table
* entry should be purged immediately or at a later time.
* When an immediate purge is to happen the close routine
* typically calls rtexpunge which clears the RTF_UP flag
* on the entry so that the code below reclaims the storage.
*/
if (rt->rt_refcnt == 0 && rnh->rnh_close)
rnh->rnh_close((struct radix_node *)rt, rnh);
/*
* If we are no longer "up" (and ref == 0)
* then we can free the resources associated
* with the route.
*/
if ((rt->rt_flags & RTF_UP) == 0) {
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
/*
* the rtentry must have been removed from the routing table
* so it is represented in rttrash; remove that now.
*/
V_rttrash--;
#ifdef DIAGNOSTIC
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
goto done;
}
#endif
/*
* release the references we hold on other items,
* e.g., other routes and ifaddrs.
*/
if (rt->rt_ifa)
IFAFREE(rt->rt_ifa);
rt->rt_parent = NULL; /* NB: no refcnt on parent */
/*
* The key is separately alloc'd, so free it (see rt_setgate()).
* This also frees the gateway, as they are always malloc'd
* together.
*/
Free(rt_key(rt));
/*
* and the rtentry itself of course
*/
RT_LOCK_DESTROY(rt);
uma_zfree(rtzone, rt);
return;
}
done:
RT_UNLOCK(rt);
}
/*
* Force a routing table entry to the specified
* destination to go through the given gateway.
* Normally called as a result of a routing redirect
* message from the network layer.
*/
void
rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src)
{
rtredirect_fib(dst, gateway, netmask, flags, src, 0);
}
void
rtredirect_fib(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src,
u_int fibnum)
{
+ INIT_VNET_NET(curvnet);
struct rtentry *rt, *rt0 = NULL;
int error = 0;
short *stat = NULL;
struct rt_addrinfo info;
struct ifaddr *ifa;
/* verify the gateway is directly reachable */
if ((ifa = ifa_ifwithnet(gateway)) == NULL) {
error = ENETUNREACH;
goto out;
}
rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */
/*
* If the redirect isn't from our current router for this dst,
* it's either old or wrong. If it redirects us to ourselves,
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
if (!(flags & RTF_DONE) && rt &&
(!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
error = EINVAL;
else if (ifa_ifwithaddr(gateway))
error = EHOSTUNREACH;
if (error)
goto done;
/*
* Create a new entry if we just got back a wildcard entry
* or the lookup failed. This is necessary for hosts
* which use routing redirects generated by smart gateways
* to dynamically build the routing tables.
*/
if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
goto create;
/*
* Don't listen to the redirect if it's
* for a route to an interface.
*/
if (rt->rt_flags & RTF_GATEWAY) {
if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
/*
* Changing from route to net => route to host.
* Create new route, rather than smashing route to net.
*/
create:
rt0 = rt;
rt = NULL;
flags |= RTF_GATEWAY | RTF_DYNAMIC;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_ifa = ifa;
info.rti_flags = flags;
error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
if (rt != NULL) {
RT_LOCK(rt);
EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
flags = rt->rt_flags;
}
if (rt0)
RTFREE_LOCKED(rt0);
stat = &V_rtstat.rts_dynamic;
} else {
struct rtentry *gwrt;
/*
* Smash the current notion of the gateway to
* this destination. Should check about netmask!!!
*/
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
stat = &V_rtstat.rts_newgateway;
/*
* add the key and gateway (in one malloc'd chunk).
*/
rt_setgate(rt, rt_key(rt), gateway);
gwrt = rtalloc1(gateway, 1, 0);
EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
RTFREE_LOCKED(gwrt);
}
} else
error = EHOSTUNREACH;
done:
if (rt)
RTFREE_LOCKED(rt);
out:
if (error)
V_rtstat.rts_badredirect++;
else if (stat != NULL)
(*stat)++;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_info[RTAX_AUTHOR] = src;
rt_missmsg(RTM_REDIRECT, &info, flags, error);
}
int
rtioctl(u_long req, caddr_t data)
{
return (rtioctl_fib(req, data, 0));
}
/*
* Routing table ioctl interface.
*/
int
rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
{
/*
* If more ioctl commands are added here, make sure the proper
* super-user checks are being performed because it is possible for
* prison-root to make it this far if raw sockets have been enabled
* in jails.
*/
#ifdef INET
/* Multicast goop, grrr... */
return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
#else /* INET */
return ENXIO;
#endif /* INET */
}
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
{
return (ifa_ifwithroute_fib(flags, dst, gateway, 0));
}
struct ifaddr *
ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
u_int fibnum)
{
register struct ifaddr *ifa;
int not_found = 0;
if ((flags & RTF_GATEWAY) == 0) {
/*
* If we are adding a route to an interface,
* and the interface is a pt to pt link
* we should search for the destination
* as our clue to the interface. Otherwise
* we can use the local address.
*/
ifa = NULL;
if (flags & RTF_HOST)
ifa = ifa_ifwithdstaddr(dst);
if (ifa == NULL)
ifa = ifa_ifwithaddr(gateway);
} else {
/*
* If we are adding a route to a remote net
* or host, the gateway may still be on the
* other end of a pt to pt link.
*/
ifa = ifa_ifwithdstaddr(gateway);
}
if (ifa == NULL)
ifa = ifa_ifwithnet(gateway);
if (ifa == NULL) {
struct rtentry *rt = rtalloc1_fib(gateway, 0, 0UL, fibnum);
if (rt == NULL)
return (NULL);
/*
* dismiss a gateway that is reachable only
* through the default router
*/
switch (gateway->sa_family) {
case AF_INET:
if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
not_found = 1;
break;
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
not_found = 1;
break;
default:
break;
}
RT_REMREF(rt);
RT_UNLOCK(rt);
if (not_found)
return (NULL);
if ((ifa = rt->rt_ifa) == NULL)
return (NULL);
}
if (ifa->ifa_addr->sa_family != dst->sa_family) {
struct ifaddr *oifa = ifa;
ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
if (ifa == NULL)
ifa = oifa;
}
return (ifa);
}
static walktree_f_t rt_fixdelete;
static walktree_f_t rt_fixchange;
struct rtfc_arg {
struct rtentry *rt0;
struct radix_node_head *rnh;
};
/*
* Do appropriate manipulations of a routing tree given
* all the bits of info needed
*/
int
rtrequest(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt)
{
return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0));
}
int
rtrequest_fib(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt,
u_int fibnum)
{
struct rt_addrinfo info;
if (dst->sa_len == 0)
return(EINVAL);
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
return rtrequest1_fib(req, &info, ret_nrt, fibnum);
}
/*
* These (questionable) definitions of apparent local variables apply
* to the next two functions. XXXXXX!!!
*/
#define dst info->rti_info[RTAX_DST]
#define gateway info->rti_info[RTAX_GATEWAY]
#define netmask info->rti_info[RTAX_NETMASK]
#define ifaaddr info->rti_info[RTAX_IFA]
#define ifpaddr info->rti_info[RTAX_IFP]
#define flags info->rti_flags
int
rt_getifa(struct rt_addrinfo *info)
{
return (rt_getifa_fib(info, 0));
}
int
rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
{
struct ifaddr *ifa;
int error = 0;
/*
* ifp may be specified by sockaddr_dl
* when protocol address is ambiguous.
*/
if (info->rti_ifp == NULL && ifpaddr != NULL &&
ifpaddr->sa_family == AF_LINK &&
(ifa = ifa_ifwithnet(ifpaddr)) != NULL)
info->rti_ifp = ifa->ifa_ifp;
if (info->rti_ifa == NULL && ifaaddr != NULL)
info->rti_ifa = ifa_ifwithaddr(ifaaddr);
if (info->rti_ifa == NULL) {
struct sockaddr *sa;
sa = ifaaddr != NULL ? ifaaddr :
(gateway != NULL ? gateway : dst);
if (sa != NULL && info->rti_ifp != NULL)
info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
else if (dst != NULL && gateway != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
fibnum);
else if (sa != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
fibnum);
}
if ((ifa = info->rti_ifa) != NULL) {
if (info->rti_ifp == NULL)
info->rti_ifp = ifa->ifa_ifp;
} else
error = ENETUNREACH;
return (error);
}
/*
* Expunges references to a route that's about to be reclaimed.
* The route must be locked.
*/
int
rtexpunge(struct rtentry *rt)
{
+ INIT_VNET_NET(curvnet);
struct radix_node *rn;
struct radix_node_head *rnh;
struct ifaddr *ifa;
int error = 0;
RT_LOCK_ASSERT(rt);
#if 0
/*
* We cannot assume anything about the reference count
* because protocols call us in many situations; often
* before unwinding references to the table entry.
*/
KASSERT(rt->rt_refcnt <= 1, ("bogus refcnt %ld", rt->rt_refcnt));
#endif
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family];
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK(rnh);
/*
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
*/
rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
if (rn == NULL) {
error = ESRCH;
goto bad;
}
KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
rt->rt_flags &= ~RTF_UP;
/*
* Now search what's left of the subtree for any cloned
* routes which might have been formed from this node.
*/
if ((rt->rt_flags & RTF_CLONING) && rt_mask(rt))
rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
rt_fixdelete, rt);
/*
* Remove any external references we may have.
* This might result in another rtentry being freed if
* we held its last reference.
*/
if (rt->rt_gwroute) {
RTFREE(rt->rt_gwroute);
rt->rt_gwroute = NULL;
}
/*
* Give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
struct rt_addrinfo info;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
}
/*
* one more rtentry floating around that is not
* linked to the routing table.
*/
V_rttrash++;
bad:
RADIX_NODE_HEAD_UNLOCK(rnh);
return (error);
}
int
rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt)
{
return (rtrequest1_fib(req, info, ret_nrt, 0));
}
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
+ INIT_VNET_NET(curvnet);
int error = 0;
register struct rtentry *rt;
register struct radix_node *rn;
register struct radix_node_head *rnh;
struct ifaddr *ifa;
struct sockaddr *ndst;
#define senderr(x) { error = x ; goto bad; }
KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = V_rt_tables[fibnum][dst->sa_family];
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK(rnh);
/*
* If we are adding a host route then we don't want to put
* a netmask in the tree, nor do we want to clone it.
*/
if (flags & RTF_HOST) {
netmask = NULL;
flags &= ~RTF_CLONING;
}
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
/*
* if we got multipath routes, we require users to specify
* a matching RTAX_GATEWAY.
*/
if (rn_mpath_capable(rnh)) {
struct rtentry *rto = NULL;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
senderr(ESRCH);
rto = rt = RNTORT(rn);
rt = rt_mpath_matchgate(rt, gateway);
if (!rt)
senderr(ESRCH);
/*
* this is the first entry in the chain
*/
if (rto == rt) {
rn = rn_mpath_next((struct radix_node *)rt);
/*
* there is another entry, now it's active
*/
if (rn) {
rto = RNTORT(rn);
RT_LOCK(rto);
rto->rt_flags |= RTF_UP;
RT_UNLOCK(rto);
} else if (rt->rt_flags & RTF_GATEWAY) {
/*
* For gateway routes, we need to
* make sure that we are deleting
* the correct gateway.
* rt_mpath_matchgate() does not
* check the case when there is only
* one route in the chain.
*/
if (gateway &&
(rt->rt_gateway->sa_len != gateway->sa_len ||
memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
senderr(ESRCH);
}
/*
* use the normal delete code to remove
* the first entry
*/
goto normal_rtdel;
}
/*
* if the entry is 2nd and on up
*/
if (!rt_mpath_deldup(rto, rt))
panic ("rtrequest1: rt_mpath_deldup");
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
goto deldone; /* done with the RTM_DELETE command */
}
normal_rtdel:
#endif
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
if (rn == NULL)
senderr(ESRCH);
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtrequest delete");
rt = RNTORT(rn);
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
/*
* Now search what's left of the subtree for any cloned
* routes which might have been formed from this node.
*/
if ((rt->rt_flags & RTF_CLONING) &&
rt_mask(rt)) {
rnh->rnh_walktree_from(rnh, dst, rt_mask(rt),
rt_fixdelete, rt);
}
/*
* Remove any external references we may have.
* This might result in another rtentry being freed if
* we held its last reference.
*/
if (rt->rt_gwroute) {
RTFREE(rt->rt_gwroute);
rt->rt_gwroute = NULL;
}
/*
* give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
#ifdef RADIX_MPATH
deldone:
#endif
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
break;
case RTM_RESOLVE:
if (ret_nrt == NULL || (rt = *ret_nrt) == NULL)
senderr(EINVAL);
ifa = rt->rt_ifa;
/* XXX locking? */
flags = rt->rt_flags &
~(RTF_CLONING | RTF_STATIC);
flags |= RTF_WASCLONED;
gateway = rt->rt_gateway;
if ((netmask = rt->rt_genmask) == NULL)
flags |= RTF_HOST;
goto makeroute;
case RTM_ADD:
if ((flags & RTF_GATEWAY) && !gateway)
senderr(EINVAL);
if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
(gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
senderr(EINVAL);
if (info->rti_ifa == NULL && (error = rt_getifa_fib(info, fibnum)))
senderr(error);
ifa = info->rti_ifa;
makeroute:
rt = uma_zalloc(rtzone, M_NOWAIT | M_ZERO);
if (rt == NULL)
senderr(ENOBUFS);
RT_LOCK_INIT(rt);
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = fibnum;
/*
* Add the gateway, possibly re-malloc-ing the storage for it;
* also add the rt_gwroute if possible.
*/
RT_LOCK(rt);
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
RT_LOCK_DESTROY(rt);
uma_zfree(rtzone, rt);
senderr(error);
}
/*
* point to the (possibly newly malloc'd) dest address.
*/
ndst = (struct sockaddr *)rt_key(rt);
/*
* make sure it contains the value we want (masked if needed).
*/
if (netmask) {
rt_maskedcopy(dst, ndst, netmask);
} else
bcopy(dst, ndst, dst->sa_len);
/*
* Note that we now have a reference to the ifa.
* This moved from below so that rnh->rnh_addaddr() can
* examine the ifa and ifa->ifa_ifp if it so desires.
*/
IFAREF(ifa);
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
if (rn_mpath_capable(rnh) &&
rt_mpath_conflict(rnh, rt, netmask)) {
if (rt->rt_gwroute)
RTFREE(rt->rt_gwroute);
if (rt->rt_ifa) {
IFAFREE(rt->rt_ifa);
}
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(rtzone, rt);
senderr(EEXIST);
}
#endif
/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
if (rn == NULL) {
struct rtentry *rt2;
/*
* Uh-oh, we already have one of these in the tree.
* We do a special hack: if the route that's already
* there was generated by the cloning mechanism
* then we just blow it away and retry the insertion
* of the new one.
*/
rt2 = rtalloc1_fib(dst, 0, 0, fibnum);
if (rt2 && rt2->rt_parent) {
rtexpunge(rt2);
RT_UNLOCK(rt2);
rn = rnh->rnh_addaddr(ndst, netmask,
rnh, rt->rt_nodes);
} else if (rt2) {
/* undo the extra ref we got */
RTFREE_LOCKED(rt2);
}
}
/*
* If it still failed to go into the tree,
* then un-make it (this should be a function)
*/
if (rn == NULL) {
if (rt->rt_gwroute)
RTFREE(rt->rt_gwroute);
if (rt->rt_ifa)
IFAFREE(rt->rt_ifa);
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(rtzone, rt);
senderr(EEXIST);
}
rt->rt_parent = NULL;
/*
* If we got here from RESOLVE, then we are cloning,
* so clone the rest and note that we are a clone
* (see the refcnt note below: we do not take a parent reference).
*/
if (req == RTM_RESOLVE) {
KASSERT(ret_nrt && *ret_nrt,
("no route to clone from"));
rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */
rt->rt_rmx.rmx_pksent = 0; /* reset packet counter */
if ((*ret_nrt)->rt_flags & RTF_CLONING) {
/*
* NB: We do not bump the refcnt on the parent
* entry under the assumption that it will
* remain so long as we do. This is
* important when deleting the parent route
* as this operation requires traversing
* the tree to delete all clones and futzing
* with refcnts requires us to double-lock
* parent through this back reference.
*/
rt->rt_parent = *ret_nrt;
}
}
/*
* If this protocol has something to add to this then
* allow it to do that as well.
*/
if (ifa->ifa_rtrequest)
ifa->ifa_rtrequest(req, rt, info);
/*
* We repeat the same procedure from rt_setgate() here because
* it doesn't fire when we call it there because the node
* hasn't been added to the tree yet.
*/
if (req == RTM_ADD &&
!(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) {
struct rtfc_arg arg;
arg.rnh = rnh;
arg.rt0 = rt;
rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
rt_fixchange, &arg);
}
/*
* actually return a resultant rtentry and
* give the caller a single reference.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_ADDREF(rt);
}
RT_UNLOCK(rt);
break;
default:
error = EOPNOTSUPP;
}
bad:
RADIX_NODE_HEAD_UNLOCK(rnh);
return (error);
#undef senderr
}
#undef dst
#undef gateway
#undef netmask
#undef ifaaddr
#undef ifpaddr
#undef flags
/*
* Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family''
* (i.e., the routes related to it by the operation of cloning). This
* routine is iterated over all potential former-child-routes by way of
* rnh->rnh_walktree_from() above, and those that actually are children of
* the late parent (passed in as VP here) are themselves deleted.
*/
static int
rt_fixdelete(struct radix_node *rn, void *vp)
{
struct rtentry *rt = RNTORT(rn);
struct rtentry *rt0 = vp;
if (rt->rt_parent == rt0 &&
!(rt->rt_flags & (RTF_PINNED | RTF_CLONING))) {
return rtrequest_fib(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt),
rt->rt_flags, NULL, rt->rt_fibnum);
}
return 0;
}
/*
* This routine is called from rt_setgate() to do the analogous thing for
* adds and changes. There is the added complication in this case of a
* middle insert; i.e., insertion of a new network route between an older
* network route and (cloned) host routes. For this reason, a simple check
* of rt->rt_parent is insufficient; each candidate route must be tested
* against the (mask, value) of the new route (passed as before in vp)
* to see if the new route matches it.
*
* XXX - it may be possible to do fixdelete() for changes and reserve this
* routine just for adds. I'm not sure why I thought it was necessary to do
* changes this way.
*/
static int
rt_fixchange(struct radix_node *rn, void *vp)
{
struct rtentry *rt = RNTORT(rn);
struct rtfc_arg *ap = vp;
struct rtentry *rt0 = ap->rt0;
struct radix_node_head *rnh = ap->rnh;
u_char *xk1, *xm1, *xk2, *xmp;
int i, len, mlen;
/* make sure we have a parent, and route is not pinned or cloning */
if (!rt->rt_parent ||
(rt->rt_flags & (RTF_PINNED | RTF_CLONING)))
return 0;
if (rt->rt_parent == rt0) /* parent match */
goto delete_rt;
/*
* There probably is a function somewhere which does this...
* if not, there should be.
*/
len = imin(rt_key(rt0)->sa_len, rt_key(rt)->sa_len);
xk1 = (u_char *)rt_key(rt0);
xm1 = (u_char *)rt_mask(rt0);
xk2 = (u_char *)rt_key(rt);
/* avoid applying a less specific route */
xmp = (u_char *)rt_mask(rt->rt_parent);
mlen = rt_key(rt->rt_parent)->sa_len;
if (mlen > rt_key(rt0)->sa_len) /* less specific route */
return 0;
for (i = rnh->rnh_treetop->rn_offset; i < mlen; i++)
if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i])
return 0; /* less specific route */
for (i = rnh->rnh_treetop->rn_offset; i < len; i++)
if ((xk2[i] & xm1[i]) != xk1[i])
return 0; /* no match */
/*
* OK, this node is a clone, and matches the node currently being
* changed/added under the node's mask. So, get rid of it.
*/
delete_rt:
return rtrequest_fib(RTM_DELETE, rt_key(rt), NULL,
rt_mask(rt), rt->rt_flags, NULL, rt->rt_fibnum);
}
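The byte loop in rt_fixchange() is a mask-specificity test. Since a & ~(a ^ b) equals a & b, the check (xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i] rejects candidates whose parent mask is not a subset of the new route's mask, i.e. is less specific. A runnable sketch of the reduced form:

#include <stdint.h>
#include <stdio.h>

/* Nonzero iff every bit set in mask a is also set in mask b, bytewise. */
static int
mask_subset(const uint8_t *a, const uint8_t *b, int len)
{
        for (int i = 0; i < len; i++)
                if ((a[i] & ~(a[i] ^ b[i])) != a[i])    /* (a & b) != a */
                        return (0);
        return (1);
}

int
main(void)
{
        uint8_t m16[4] = { 255, 255, 0, 0 };    /* /16 */
        uint8_t m24[4] = { 255, 255, 255, 0 };  /* /24 */

        printf("/16 within /24: %d\n", mask_subset(m16, m24, 4));      /* 1 */
        printf("/24 within /16: %d\n", mask_subset(m24, m16, 4));      /* 0 */
        return (0);
}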
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
{
+ INIT_VNET_NET(curvnet);
/* XXX dst may be overwritten, can we move this to below */
struct radix_node_head *rnh =
V_rt_tables[rt->rt_fibnum][dst->sa_family];
int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
again:
RT_LOCK_ASSERT(rt);
/*
* A host route with the destination equal to the gateway
* will interfere with keeping LLINFO in the routing
* table, so disallow it.
*/
if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) ==
(RTF_HOST|RTF_GATEWAY)) &&
dst->sa_len == gate->sa_len &&
bcmp(dst, gate, dst->sa_len) == 0) {
/*
* The route might already exist if this is an RTM_CHANGE
* or a routing redirect, so try to delete it.
*/
if (rt_key(rt))
rtexpunge(rt);
return EADDRNOTAVAIL;
}
/*
* Cloning loop avoidance in case of bad configuration.
*/
if (rt->rt_flags & RTF_GATEWAY) {
struct rtentry *gwrt;
RT_UNLOCK(rt); /* XXX workaround LOR */
gwrt = rtalloc1_fib(gate, 1, 0, rt->rt_fibnum);
if (gwrt == rt) {
RT_REMREF(rt);
return (EADDRINUSE); /* failure */
}
/*
* Try to reacquire the lock on rt, and if it fails,
* clean state and restart from scratch.
*/
if (!RT_TRYLOCK(rt)) {
RTFREE_LOCKED(gwrt);
RT_LOCK(rt);
goto again;
}
/*
* If there is already a gwroute, then drop it. If we
* are asked to replace the route with itself, then do
* not leak its refcount.
*/
if (rt->rt_gwroute != NULL) {
if (rt->rt_gwroute == gwrt) {
RT_REMREF(rt->rt_gwroute);
} else
RTFREE(rt->rt_gwroute);
}
if ((rt->rt_gwroute = gwrt) != NULL)
RT_UNLOCK(rt->rt_gwroute);
}
/*
* Prepare to store the gateway in rt->rt_gateway.
* Both dst and gateway are stored one after the other in the same
* malloc'd chunk. If we have room, we can reuse the old buffer,
* rt_gateway already points to the right place.
* Otherwise, malloc a new block and update the 'dst' address.
*/
if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
caddr_t new;
R_Malloc(new, caddr_t, dlen + glen);
if (new == NULL)
return ENOBUFS;
/*
* XXX note, we copy from *dst and not *rt_key(rt) because
* rt_setgate() can be called to initialize a newly
* allocated route entry, in which case rt_key(rt) == NULL
* (and also rt->rt_gateway == NULL).
* Free()/free() handle a NULL argument just fine.
*/
bcopy(dst, new, dlen);
Free(rt_key(rt)); /* free old block, if any */
rt_key(rt) = (struct sockaddr *)new;
rt->rt_gateway = (struct sockaddr *)(new + dlen);
}
/*
* Copy the new gateway value into the memory chunk.
*/
bcopy(gate, rt->rt_gateway, glen);
/*
* This isn't going to do anything useful for host routes, so
* don't bother. Also make sure we have a reasonable mask
* (we don't yet have one during adds).
*/
if (!(rt->rt_flags & RTF_HOST) && rt_mask(rt) != 0) {
struct rtfc_arg arg;
arg.rnh = rnh;
arg.rt0 = rt;
RT_UNLOCK(rt); /* XXX workaround LOR */
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
rt_fixchange, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
return 0;
}
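rt_setgate() keeps the key and the gateway in one malloc'd chunk of SA_SIZE(dst) + SA_SIZE(gate) bytes, with rt_gateway pointing just past the key. A simplified sketch of that layout, assuming fixed-size addresses rather than real sockaddrs:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        const char dst[8] = "dest", gate[8] = "gw";
        size_t dlen = sizeof(dst), glen = sizeof(gate);
        char *chunk;

        /* One allocation holds both, as rt_setgate() arranges. */
        chunk = malloc(dlen + glen);
        if (chunk == NULL)
                return (1);
        memcpy(chunk, dst, dlen);               /* rt_key(rt) */
        memcpy(chunk + dlen, gate, glen);       /* rt->rt_gateway */
        printf("key '%s' gateway '%s'\n", chunk, chunk + dlen);
        free(chunk);                            /* one Free() frees both */
        return (0);
}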
static void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
register u_char *cp1 = (u_char *)src;
register u_char *cp2 = (u_char *)dst;
register u_char *cp3 = (u_char *)netmask;
u_char *cplim = cp2 + *cp3;
u_char *cplim2 = cp2 + *cp1;
*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
cp3 += 2;
if (cplim > cplim2)
cplim = cplim2;
while (cp2 < cplim)
*cp2++ = *cp1++ & *cp3++;
if (cp2 < cplim2)
bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
}
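rt_maskedcopy() copies sa_len and sa_family verbatim, ANDs the address bytes with the mask, and zero-fills any tail beyond the mask. Its effect on an IPv4 address under a /24, in a minimal flat-array sketch:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint8_t addr[4] = { 192, 103, 54, 247 };
        uint8_t mask[4] = { 255, 255, 255, 0 }; /* /24 */
        uint8_t out[4];

        for (int i = 0; i < 4; i++)
                out[i] = addr[i] & mask[i];
        /* prints 192.103.54.0 */
        printf("%u.%u.%u.%u\n", out[0], out[1], out[2], out[3]);
        return (0);
}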
/*
* Set up a routing table entry, normally
* for an interface.
*/
#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
static inline int
rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
{
+ INIT_VNET_NET(curvnet);
struct sockaddr *dst;
struct sockaddr *netmask;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
int error = 0;
int startfib, endfib;
char tempbuf[_SOCKADDR_TMPSIZE];
int didwork = 0;
int a_failure = 0;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
netmask = NULL;
} else {
dst = ifa->ifa_addr;
netmask = ifa->ifa_netmask;
}
if ( dst->sa_family != AF_INET)
fibnum = 0;
if (fibnum == -1) {
if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
startfib = endfib = curthread->td_proc->p_fibnum;
} else {
startfib = 0;
endfib = rt_numfibs - 1;
}
} else {
KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
startfib = fibnum;
endfib = fibnum;
}
if (dst->sa_len == 0)
return(EINVAL);
/*
* If it's a delete, check that if it exists,
* it's on the correct interface or we might scrub
* a route to another ifa which would
* be confusing at best and possibly worse.
*/
if (cmd == RTM_DELETE) {
/*
* It's a delete, so it should already exist..
* If it's a net, mask off the host bits
* (Assuming we have a mask)
* XXX this is kinda inet specific..
*/
if (netmask != NULL) {
rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
dst = (struct sockaddr *)tempbuf;
}
}
/*
* Now go through all the requested tables (fibs) and do the
* requested action. Realistically, this will either be fib 0
* for protocols that don't do multiple tables or all the
* tables for those that do. XXX For this version only AF_INET.
* When that changes code should be refactored to protocol
* independent parts and protocol dependent parts.
*/
for ( fibnum = startfib; fibnum <= endfib; fibnum++) {
if (cmd == RTM_DELETE) {
struct radix_node_head *rnh;
struct radix_node *rn;
/*
* Look up an rtentry that is in the routing tree and
* contains the correct info.
*/
if ((rnh = V_rt_tables[fibnum][dst->sa_family]) == NULL)
/* this table doesn't exist but others might */
continue;
RADIX_NODE_HEAD_LOCK(rnh);
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
error = ESRCH;
else {
rt = RNTORT(rn);
/*
* for interface route the
* rt->rt_gateway is sockaddr_intf
* for cloning ARP entries, so
* rt_mpath_matchgate must use the
* interface address
*/
rt = rt_mpath_matchgate(rt,
ifa->ifa_addr);
if (!rt)
error = ESRCH;
}
}
else
#endif
rn = rnh->rnh_lookup(dst, netmask, rnh);
error = (rn == NULL ||
(rn->rn_flags & RNF_ROOT) ||
RNTORT(rn)->rt_ifa != ifa ||
!sa_equal((struct sockaddr *)rn->rn_key, dst));
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error) {
/* this is only an error if bad on ALL tables */
continue;
}
}
/*
* Do the actual request
*/
bzero((caddr_t)&info, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags | ifa->ifa_flags;
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = netmask;
error = rtrequest1_fib(cmd, &info, &rt, fibnum);
if (error == 0 && rt != NULL) {
/*
* notify any listening routing agents of the change
*/
RT_LOCK(rt);
#ifdef RADIX_MPATH
/*
* in case address alias finds the first address
* e.g. ifconfig bge0 192.103.54.246/24
* e.g. ifconfig bge0 192.103.54.247/24
* the address set in the route is 192.103.54.246
* so we need to replace it with 192.103.54.247
*/
if (memcmp(rt->rt_ifa->ifa_addr,
ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
IFAFREE(rt->rt_ifa);
IFAREF(ifa);
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_ifa = ifa;
}
#endif
rt_newaddrmsg(cmd, ifa, error, rt);
if (cmd == RTM_DELETE) {
/*
* If we are deleting, and we found an entry,
* then it's been removed from the tree..
* now throw it away.
*/
RTFREE_LOCKED(rt);
} else {
if (cmd == RTM_ADD) {
/*
* We just wanted to add it..
* we don't actually need a reference.
*/
RT_REMREF(rt);
}
RT_UNLOCK(rt);
}
didwork = 1;
}
if (error)
a_failure = error;
}
if (cmd == RTM_DELETE) {
if (didwork) {
error = 0;
} else {
/* we only give an error if it wasn't in any table */
error = ((flags & RTF_HOST) ?
EHOSTUNREACH : ENETUNREACH);
}
} else {
if (a_failure) {
/* return an error if any of them failed */
error = a_failure;
}
}
return (error);
}
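/*
 * Example of the fib iteration above (illustrative numbers): with
 * rt_numfibs = 4 and fibnum = -1, an RTM_ADD walks fibs 0..3 unless
 * rt_add_addr_allfibs is 0, in which case only the calling process's
 * fib is touched; an RTM_DELETE reports success if the route was
 * removed from at least one table (didwork != 0).
 */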
/* Special one for inet internal use; other callers may not use it. */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
return (rtinit1(ifa, cmd, flags, -1));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
int
rtinit(struct ifaddr *ifa, int cmd, int flags)
{
struct sockaddr *dst;
int fib = 0;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
} else {
dst = ifa->ifa_addr;
}
if (dst->sa_family == AF_INET)
fib = -1;
return (rtinit1(ifa, cmd, flags, fib));
}
/*
* rt_check() is invoked on each layer 2 output path, prior to
* encapsulating outbound packets.
*
* The function is mostly used to find a routing entry for the gateway,
* which in some protocol families could also point to the link-level
* address for the gateway itself (the side effect of revalidating the
* route to the destination is rather pointless at this stage, we did it
* already a moment before in the pr_output() routine to locate the ifp
* and gateway to use).
*
* When we remove the layer-3 to layer-2 mapping tables from the
* routing table, this function can be removed.
*
* === On input ===
* *dst is the address of the NEXT HOP (which coincides with the
* final destination if directly reachable);
* *lrt0 points to the cached route to the final destination;
* *lrt is not meaningful;
* (*lrt0 has no ref held on it by us so REMREF is not needed.
* Refs only account for major structural references and not usages,
* which is actually a bit of a problem.)
*
* === Operation ===
* If the route is marked down try to find a new route. If the route
* to the gateway is gone, try to setup a new route. Otherwise,
* if the route is marked for packets to be rejected, enforce that.
* Note that rtalloc returns an rtentry with an extra REF that we may
* need to lose.
*
* === On return ===
* *dst is unchanged;
* *lrt0 points to the (possibly new) route to the final destination
* *lrt points to the route to the next hop [LOCKED]
*
* Their values are meaningful ONLY if no error is returned.
*
* To follow this you have to remember that:
* RT_REMREF reduces the reference count by 1 but doesn't check it for 0 (!)
* RTFREE_LOCKED includes an RT_REMREF (or an rtfree if refs == 1)
* and an RT_UNLOCK
* RTFREE does an RT_LOCK and an RTFREE_LOCKED
* The gwroute pointer counts as a reference on the rtentry to which it points.
* so when we add it we use the ref that rtalloc gives us and when we lose it
* we need to remove the reference.
* RT_TEMP_UNLOCK does an RT_ADDREF before freeing the lock, and
* RT_RELOCK locks it (it can't have gone away due to the ref) and
* drops the ref, possibly freeing it and zeroing the pointer if
* the ref goes to 0 (unlocking in the process).
*/
int
rt_check(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst)
{
struct rtentry *rt;
struct rtentry *rt0;
u_int fibnum;
KASSERT(*lrt0 != NULL, ("rt_check"));
rt0 = *lrt0;
rt = NULL;
fibnum = rt0->rt_fibnum;
/* NB: the locking here is tortuous... */
RT_LOCK(rt0);
retry:
if (rt0 && (rt0->rt_flags & RTF_UP) == 0) {
/* Current rt0 is useless, try get a replacement. */
RT_UNLOCK(rt0);
rt0 = NULL;
}
if (rt0 == NULL) {
rt0 = rtalloc1_fib(dst, 1, 0UL, fibnum);
if (rt0 == NULL) {
return (EHOSTUNREACH);
}
RT_REMREF(rt0); /* don't need the reference. */
}
if (rt0->rt_flags & RTF_GATEWAY) {
if ((rt = rt0->rt_gwroute) != NULL) {
RT_LOCK(rt); /* NB: gwroute */
if ((rt->rt_flags & RTF_UP) == 0) {
/* gw route is dud. ignore/lose it */
RTFREE_LOCKED(rt); /* unref (&unlock) gwroute */
rt = rt0->rt_gwroute = NULL;
}
}
if (rt == NULL) { /* NOT AN ELSE CLAUSE */
RT_TEMP_UNLOCK(rt0); /* MUST return to undo this */
rt = rtalloc1_fib(rt0->rt_gateway, 1, 0UL, fibnum);
if ((rt == rt0) || (rt == NULL)) {
/* the best we can do is not good enough */
if (rt) {
RT_REMREF(rt); /* assumes ref > 0 */
RT_UNLOCK(rt);
}
RTFREE(rt0); /* lock, unref, (unlock) */
return (ENETUNREACH);
}
/*
* Relock it and lose the added reference.
* All sorts of things could have happened while we
* had no lock on it, so check for them.
*/
RT_RELOCK(rt0);
if (rt0 == NULL || ((rt0->rt_flags & RTF_UP) == 0))
/* Ru-roh.. what we had is no longer any good */
goto retry;
/*
* While we were away, someone replaced the gateway.
* Since a reference count is involved we can't just
* overwrite it.
*/
if (rt0->rt_gwroute) {
if (rt0->rt_gwroute != rt) {
RTFREE_LOCKED(rt);
goto retry;
}
} else {
rt0->rt_gwroute = rt;
}
}
RT_LOCK_ASSERT(rt);
RT_UNLOCK(rt0);
} else {
/* think of rt as having the lock from now on.. */
rt = rt0;
}
/* XXX why are we inspecting rmx_expire? */
if ((rt->rt_flags & RTF_REJECT) &&
(rt->rt_rmx.rmx_expire == 0 ||
time_uptime < rt->rt_rmx.rmx_expire)) {
RT_UNLOCK(rt);
return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH);
}
*lrt = rt;
*lrt0 = rt0;
return (0);
}
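/*
 * Illustrative sketch only: the minimal shape of an L2 output path
 * consuming rt_check() per the contract above.  example_l2_output()
 * and encap_and_send() are hypothetical names, not kernel APIs.
 */
#if 0
static int
example_l2_output(struct ifnet *ifp, struct mbuf *m,
struct sockaddr *dst, struct rtentry *rt0)
{
struct rtentry *rt; /* will point to the next-hop route */
int error;
error = rt_check(&rt, &rt0, dst); /* rt comes back LOCKED */
if (error != 0)
return (error); /* *lrt and *lrt0 not meaningful on error */
/* ... extract the gateway's link-level address from rt ... */
RT_UNLOCK(rt);
return (encap_and_send(ifp, m, rt0)); /* hypothetical helper */
}
#endif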
/* This must be before ip6_init2(), which is now SI_ORDER_MIDDLE */
SYSINIT(route, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
Index: head/sys/net/rtsock.c
===================================================================
--- head/sys/net/rtsock.c (revision 183549)
+++ head/sys/net/rtsock.c (revision 183550)
@@ -1,1333 +1,1338 @@
/*-
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
* $FreeBSD$
*/
#include "opt_sctp.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/jail.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/raw_cb.h>
#include <net/route.h>
#include <netinet/in.h>
#ifdef SCTP
extern void sctp_addr_change(struct ifaddr *ifa, int cmd);
#endif /* SCTP */
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
/* NB: these are not modified */
static struct sockaddr route_src = { 2, PF_ROUTE, };
static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, };
static struct {
int ip_count; /* attached w/ AF_INET */
int ip6_count; /* attached w/ AF_INET6 */
int ipx_count; /* attached w/ AF_IPX */
int any_count; /* total attached */
} route_cb;
struct mtx rtsock_mtx;
MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
#define RTSOCK_LOCK() mtx_lock(&rtsock_mtx)
#define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx)
#define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED)
static struct ifqueue rtsintrq;
SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD, 0, "");
SYSCTL_INT(_net_route, OID_AUTO, netisr_maxqlen, CTLFLAG_RW,
&rtsintrq.ifq_maxlen, 0, "maximum routing socket dispatch queue length");
struct walkarg {
int w_tmemsize;
int w_op, w_arg;
caddr_t w_tmem;
struct sysctl_req *w_req;
};
static void rts_input(struct mbuf *m);
static struct mbuf *rt_msg1(int type, struct rt_addrinfo *rtinfo);
static int rt_msg2(int type, struct rt_addrinfo *rtinfo,
caddr_t cp, struct walkarg *w);
static int rt_xaddrs(caddr_t cp, caddr_t cplim,
struct rt_addrinfo *rtinfo);
static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *m, struct socket *so);
static void rt_setmetrics(u_long which, const struct rt_metrics *in,
struct rt_metrics_lite *out);
static void rt_getmetrics(const struct rt_metrics_lite *in,
struct rt_metrics *out);
static void rt_dispatch(struct mbuf *, const struct sockaddr *);
static void
rts_init(void)
{
int tmp;
rtsintrq.ifq_maxlen = 256;
if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
rtsintrq.ifq_maxlen = tmp;
mtx_init(&rtsintrq.ifq_mtx, "rts_inq", NULL, MTX_DEF);
netisr_register(NETISR_ROUTE, rts_input, &rtsintrq, 0);
}
SYSINIT(rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rts_init, 0);
static void
rts_input(struct mbuf *m)
{
struct sockproto route_proto;
unsigned short *family;
struct m_tag *tag;
route_proto.sp_family = PF_ROUTE;
tag = m_tag_find(m, PACKET_TAG_RTSOCKFAM, NULL);
if (tag != NULL) {
family = (unsigned short *)(tag + 1);
route_proto.sp_protocol = *family;
m_tag_delete(m, tag);
} else
route_proto.sp_protocol = 0;
raw_input(m, &route_proto, &route_src);
}
/*
* It really doesn't make any sense at all for this code to share much
* with raw_usrreq.c, since its functionality is so restricted. XXX
*/
static void
rts_abort(struct socket *so)
{
raw_usrreqs.pru_abort(so);
}
static void
rts_close(struct socket *so)
{
raw_usrreqs.pru_close(so);
}
/* pru_accept is EOPNOTSUPP */
static int
rts_attach(struct socket *so, int proto, struct thread *td)
{
struct rawcb *rp;
int s, error;
KASSERT(so->so_pcb == NULL, ("rts_attach: so_pcb != NULL"));
/* XXX */
MALLOC(rp, struct rawcb *, sizeof *rp, M_PCB, M_WAITOK | M_ZERO);
if (rp == NULL)
return ENOBUFS;
/*
* The splnet() is necessary to block protocols from sending
* error notifications (like RTM_REDIRECT or RTM_LOSING) while
* this PCB is extant but incompletely initialized.
* Probably we should try to do more of this work beforehand and
* eliminate the spl.
*/
s = splnet();
so->so_pcb = (caddr_t)rp;
so->so_fibnum = td->td_proc->p_fibnum;
error = raw_attach(so, proto);
rp = sotorawcb(so);
if (error) {
splx(s);
so->so_pcb = NULL;
free(rp, M_PCB);
return error;
}
RTSOCK_LOCK();
switch(rp->rcb_proto.sp_protocol) {
case AF_INET:
route_cb.ip_count++;
break;
case AF_INET6:
route_cb.ip6_count++;
break;
case AF_IPX:
route_cb.ipx_count++;
break;
}
route_cb.any_count++;
RTSOCK_UNLOCK();
soisconnected(so);
so->so_options |= SO_USELOOPBACK;
splx(s);
return 0;
}
static int
rts_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just EINVAL */
}
static int
rts_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return (raw_usrreqs.pru_connect(so, nam, td)); /* XXX just EINVAL */
}
/* pru_connect2 is EOPNOTSUPP */
/* pru_control is EOPNOTSUPP */
static void
rts_detach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
KASSERT(rp != NULL, ("rts_detach: rp == NULL"));
RTSOCK_LOCK();
switch(rp->rcb_proto.sp_protocol) {
case AF_INET:
route_cb.ip_count--;
break;
case AF_INET6:
route_cb.ip6_count--;
break;
case AF_IPX:
route_cb.ipx_count--;
break;
}
route_cb.any_count--;
RTSOCK_UNLOCK();
raw_usrreqs.pru_detach(so);
}
static int
rts_disconnect(struct socket *so)
{
return (raw_usrreqs.pru_disconnect(so));
}
/* pru_listen is EOPNOTSUPP */
static int
rts_peeraddr(struct socket *so, struct sockaddr **nam)
{
return (raw_usrreqs.pru_peeraddr(so, nam));
}
/* pru_rcvd is EOPNOTSUPP */
/* pru_rcvoob is EOPNOTSUPP */
static int
rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
return (raw_usrreqs.pru_send(so, flags, m, nam, control, td));
}
/* pru_sense is null */
static int
rts_shutdown(struct socket *so)
{
return (raw_usrreqs.pru_shutdown(so));
}
static int
rts_sockaddr(struct socket *so, struct sockaddr **nam)
{
return (raw_usrreqs.pru_sockaddr(so, nam));
}
static struct pr_usrreqs route_usrreqs = {
.pru_abort = rts_abort,
.pru_attach = rts_attach,
.pru_bind = rts_bind,
.pru_connect = rts_connect,
.pru_detach = rts_detach,
.pru_disconnect = rts_disconnect,
.pru_peeraddr = rts_peeraddr,
.pru_send = rts_send,
.pru_shutdown = rts_shutdown,
.pru_sockaddr = rts_sockaddr,
.pru_close = rts_close,
};
/*ARGSUSED*/
static int
route_output(struct mbuf *m, struct socket *so)
{
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
+ INIT_VNET_NET(so->so_vnet);
struct rt_msghdr *rtm = NULL;
struct rtentry *rt = NULL;
struct radix_node_head *rnh;
struct rt_addrinfo info;
int len, error = 0;
struct ifnet *ifp = NULL;
struct sockaddr_in jail;
#define senderr(e) { error = e; goto flush;}
if (m == NULL || ((m->m_len < sizeof(long)) &&
(m = m_pullup(m, sizeof(long))) == NULL))
return (ENOBUFS);
if ((m->m_flags & M_PKTHDR) == 0)
panic("route_output");
len = m->m_pkthdr.len;
if (len < sizeof(*rtm) ||
len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
info.rti_info[RTAX_DST] = NULL;
senderr(EINVAL);
}
R_Malloc(rtm, struct rt_msghdr *, len);
if (rtm == NULL) {
info.rti_info[RTAX_DST] = NULL;
senderr(ENOBUFS);
}
m_copydata(m, 0, len, (caddr_t)rtm);
if (rtm->rtm_version != RTM_VERSION) {
info.rti_info[RTAX_DST] = NULL;
senderr(EPROTONOSUPPORT);
}
rtm->rtm_pid = curproc->p_pid;
bzero(&info, sizeof(info));
info.rti_addrs = rtm->rtm_addrs;
if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) {
info.rti_info[RTAX_DST] = NULL;
senderr(EINVAL);
}
info.rti_flags = rtm->rtm_flags;
if (info.rti_info[RTAX_DST] == NULL ||
info.rti_info[RTAX_DST]->sa_family >= AF_MAX ||
(info.rti_info[RTAX_GATEWAY] != NULL &&
info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))
senderr(EINVAL);
if (info.rti_info[RTAX_GENMASK]) {
struct radix_node *t;
t = rn_addmask((caddr_t) info.rti_info[RTAX_GENMASK], 0, 1);
if (t != NULL &&
bcmp((char *)(void *)info.rti_info[RTAX_GENMASK] + 1,
(char *)(void *)t->rn_key + 1,
((struct sockaddr *)t->rn_key)->sa_len - 1) == 0)
info.rti_info[RTAX_GENMASK] =
(struct sockaddr *)t->rn_key;
else
senderr(ENOBUFS);
}
/*
* Verify that the caller has the appropriate privilege; RTM_GET
* is the only operation the non-superuser is allowed.
*/
if (rtm->rtm_type != RTM_GET) {
error = priv_check(curthread, PRIV_NET_ROUTE);
if (error)
senderr(error);
}
switch (rtm->rtm_type) {
struct rtentry *saved_nrt;
case RTM_ADD:
if (info.rti_info[RTAX_GATEWAY] == NULL)
senderr(EINVAL);
saved_nrt = NULL;
error = rtrequest1_fib(RTM_ADD, &info, &saved_nrt,
so->so_fibnum);
if (error == 0 && saved_nrt) {
RT_LOCK(saved_nrt);
rt_setmetrics(rtm->rtm_inits,
&rtm->rtm_rmx, &saved_nrt->rt_rmx);
rtm->rtm_index = saved_nrt->rt_ifp->if_index;
RT_REMREF(saved_nrt);
saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK];
RT_UNLOCK(saved_nrt);
}
break;
case RTM_DELETE:
saved_nrt = NULL;
error = rtrequest1_fib(RTM_DELETE, &info, &saved_nrt,
so->so_fibnum);
if (error == 0) {
RT_LOCK(saved_nrt);
rt = saved_nrt;
goto report;
}
break;
case RTM_GET:
case RTM_CHANGE:
case RTM_LOCK:
rnh = V_rt_tables[so->so_fibnum][info.rti_info[RTAX_DST]->sa_family];
if (rnh == NULL)
senderr(EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK(rnh);
rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST],
info.rti_info[RTAX_NETMASK], rnh);
if (rt == NULL) { /* XXX looks bogus */
RADIX_NODE_HEAD_UNLOCK(rnh);
senderr(ESRCH);
}
#ifdef RADIX_MPATH
/*
* for RTM_CHANGE/LOCK, if we got multipath routes,
* we require users to specify a matching RTAX_GATEWAY.
*
* for RTM_GET, gate is optional even with multipath.
* if gate == NULL the first match is returned.
* (no need to call rt_mpath_matchgate if gate == NULL)
*/
if (rn_mpath_capable(rnh) &&
(rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
if (!rt) {
RADIX_NODE_HEAD_UNLOCK(rnh);
senderr(ESRCH);
}
}
#endif
RT_LOCK(rt);
RT_ADDREF(rt);
RADIX_NODE_HEAD_UNLOCK(rnh);
/*
* Fix for PR: 82974
*
* RTM_CHANGE/LOCK need a perfect match, rn_lookup()
* returns a perfect match in case a netmask is
* specified. For host routes only a longest prefix
* match is returned so it is necessary to compare the
* existence of the netmask. If both have a netmask
* rnh_lookup() did a perfect match and if none of them
* have a netmask both are host routes which is also a
* perfect match.
*/
if (rtm->rtm_type != RTM_GET &&
(!rt_mask(rt) != !info.rti_info[RTAX_NETMASK])) {
RT_UNLOCK(rt);
senderr(ESRCH);
}
switch(rtm->rtm_type) {
case RTM_GET:
report:
RT_LOCK_ASSERT(rt);
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
ifp = rt->rt_ifp;
if (ifp) {
info.rti_info[RTAX_IFP] =
ifp->if_addr->ifa_addr;
if (jailed(so->so_cred)) {
bzero(&jail, sizeof(jail));
jail.sin_family = PF_INET;
jail.sin_len = sizeof(jail);
jail.sin_addr.s_addr =
htonl(prison_getip(so->so_cred));
info.rti_info[RTAX_IFA] =
(struct sockaddr *)&jail;
} else
info.rti_info[RTAX_IFA] =
rt->rt_ifa->ifa_addr;
if (ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] =
rt->rt_ifa->ifa_dstaddr;
rtm->rtm_index = ifp->if_index;
} else {
info.rti_info[RTAX_IFP] = NULL;
info.rti_info[RTAX_IFA] = NULL;
}
} else if ((ifp = rt->rt_ifp) != NULL) {
rtm->rtm_index = ifp->if_index;
}
len = rt_msg2(rtm->rtm_type, &info, NULL, NULL);
if (len > rtm->rtm_msglen) {
struct rt_msghdr *new_rtm;
R_Malloc(new_rtm, struct rt_msghdr *, len);
if (new_rtm == NULL) {
RT_UNLOCK(rt);
senderr(ENOBUFS);
}
bcopy(rtm, new_rtm, rtm->rtm_msglen);
Free(rtm); rtm = new_rtm;
}
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, NULL);
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = 0;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
case RTM_CHANGE:
/*
* New gateway could require new ifaddr, ifp;
* flags may also be different; ifp may be specified
* by ll sockaddr when protocol address is ambiguous
*/
if (((rt->rt_flags & RTF_GATEWAY) &&
info.rti_info[RTAX_GATEWAY] != NULL) ||
info.rti_info[RTAX_IFP] != NULL ||
(info.rti_info[RTAX_IFA] != NULL &&
!sa_equal(info.rti_info[RTAX_IFA],
rt->rt_ifa->ifa_addr))) {
RT_UNLOCK(rt);
if ((error = rt_getifa_fib(&info,
rt->rt_fibnum)) != 0)
senderr(error);
RT_LOCK(rt);
}
if (info.rti_ifa != NULL &&
info.rti_ifa != rt->rt_ifa &&
rt->rt_ifa != NULL &&
rt->rt_ifa->ifa_rtrequest != NULL) {
rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt,
&info);
IFAFREE(rt->rt_ifa);
}
if (info.rti_info[RTAX_GATEWAY] != NULL) {
if ((error = rt_setgate(rt, rt_key(rt),
info.rti_info[RTAX_GATEWAY])) != 0) {
RT_UNLOCK(rt);
senderr(error);
}
if (!(rt->rt_flags & RTF_LLINFO))
rt->rt_flags |= RTF_GATEWAY;
}
if (info.rti_ifa != NULL &&
info.rti_ifa != rt->rt_ifa) {
IFAREF(info.rti_ifa);
rt->rt_ifa = info.rti_ifa;
rt->rt_ifp = info.rti_ifp;
}
/* Allow some flags to be toggled on change. */
if (rtm->rtm_fmask & RTF_FMASK)
rt->rt_flags = (rt->rt_flags &
~rtm->rtm_fmask) |
(rtm->rtm_flags & rtm->rtm_fmask);
rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
&rt->rt_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, &info);
if (info.rti_info[RTAX_GENMASK])
rt->rt_genmask = info.rti_info[RTAX_GENMASK];
/* FALLTHROUGH */
case RTM_LOCK:
/* We don't support locks anymore */
break;
}
RT_UNLOCK(rt);
break;
default:
senderr(EOPNOTSUPP);
}
flush:
if (rtm) {
if (error)
rtm->rtm_errno = error;
else
rtm->rtm_flags |= RTF_DONE;
}
if (rt) /* XXX can this be true? */
RTFREE(rt);
{
struct rawcb *rp = NULL;
/*
* Check to see if we don't want our own messages.
*/
if ((so->so_options & SO_USELOOPBACK) == 0) {
if (route_cb.any_count <= 1) {
if (rtm)
Free(rtm);
m_freem(m);
return (error);
}
/* There is another listener, so construct message */
rp = sotorawcb(so);
}
if (rtm) {
m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
if (m->m_pkthdr.len < rtm->rtm_msglen) {
m_freem(m);
m = NULL;
} else if (m->m_pkthdr.len > rtm->rtm_msglen)
m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
Free(rtm);
}
if (m) {
if (rp) {
/*
* XXX ensure we don't get a copy by
* invalidating our protocol
*/
unsigned short family = rp->rcb_proto.sp_family;
rp->rcb_proto.sp_family = 0;
rt_dispatch(m, info.rti_info[RTAX_DST]);
rp->rcb_proto.sp_family = family;
} else
rt_dispatch(m, info.rti_info[RTAX_DST]);
}
}
return (error);
#undef sa_equal
}
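/*
 * Illustrative userland sketch (standard route(4) usage; the address
 * 198.51.100.1 is an arbitrary example): composing the kind of RTM_GET
 * request that route_output() parses above.  Error handling is omitted
 * for brevity.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>
static int
example_rtm_get(void)
{
struct {
struct rt_msghdr hdr;
struct sockaddr_in dst; /* only RTA_DST is supplied */
} msg;
int s = socket(PF_ROUTE, SOCK_RAW, 0);
memset(&msg, 0, sizeof(msg));
msg.hdr.rtm_msglen = sizeof(msg);
msg.hdr.rtm_version = RTM_VERSION;
msg.hdr.rtm_type = RTM_GET; /* the one unprivileged op */
msg.hdr.rtm_addrs = RTA_DST;
msg.hdr.rtm_seq = 1;
msg.dst.sin_len = sizeof(msg.dst);
msg.dst.sin_family = AF_INET;
inet_pton(AF_INET, "198.51.100.1", &msg.dst.sin_addr);
write(s, &msg, sizeof(msg));
/* read(2) returns the reply; match it on rtm_seq and rtm_pid. */
return (0);
}
#endif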
static void
rt_setmetrics(u_long which, const struct rt_metrics *in,
struct rt_metrics_lite *out)
{
#define metric(f, e) if (which & (f)) out->e = in->e;
/*
* Only these are stored in the routing entry since introduction
* of tcp hostcache. The rest is ignored.
*/
metric(RTV_MTU, rmx_mtu);
/* Userland -> kernel timebase conversion. */
if (which & RTV_EXPIRE)
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_second + time_uptime : 0;
#undef metric
}
static void
rt_getmetrics(const struct rt_metrics_lite *in, struct rt_metrics *out)
{
#define metric(e) out->e = in->e;
bzero(out, sizeof(*out));
metric(rmx_mtu);
/* Kernel -> userland timebase conversion. */
out->rmx_expire = in->rmx_expire ?
in->rmx_expire - time_uptime + time_second : 0;
#undef metric
}
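/*
 * Worked example of the timebase conversion above (numbers invented):
 * with time_second = 1000000 and time_uptime = 500, a userland
 * rmx_expire of 1000600 (wall clock) is stored as
 * 1000600 - 1000000 + 500 = 1100 (uptime); rt_getmetrics() inverts
 * this: 1100 - 500 + 1000000 = 1000600.  A zero value means "no
 * expiry" in both timebases.
 */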
/*
* Extract the addresses of the passed sockaddrs.
* Do a little sanity checking so as to avoid bad memory references.
* This data is derived straight from userland.
*/
static int
rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
{
struct sockaddr *sa;
int i;
for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
if ((rtinfo->rti_addrs & (1 << i)) == 0)
continue;
sa = (struct sockaddr *)cp;
/*
* It won't fit.
*/
if (cp + sa->sa_len > cplim)
return (EINVAL);
/*
* There are no more... quit now.
* If there are more bits, they are in error.
* I've seen this; route(1) can evidently generate these,
* and they cause the kernel to core dump.
* For compatibility, if we see this, point to a safe address.
*/
if (sa->sa_len == 0) {
rtinfo->rti_info[i] = &sa_zero;
return (0); /* should be EINVAL but for compat */
}
/* accept it */
rtinfo->rti_info[i] = sa;
cp += SA_SIZE(sa);
}
return (0);
}
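/*
 * Example of the layout rt_xaddrs() walks (illustrative): for
 * rti_addrs = RTA_DST | RTA_GATEWAY | RTA_NETMASK the sockaddrs follow
 * the rt_msghdr back to back in RTAX_* order, each stepped over with
 * SA_SIZE() so that alignment padding is skipped:
 *
 *	[rt_msghdr][DST sockaddr][GATEWAY sockaddr][NETMASK sockaddr]
 */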
static struct mbuf *
rt_msg1(int type, struct rt_addrinfo *rtinfo)
{
struct rt_msghdr *rtm;
struct mbuf *m;
int i;
struct sockaddr *sa;
int len, dlen;
switch (type) {
case RTM_DELADDR:
case RTM_NEWADDR:
len = sizeof(struct ifa_msghdr);
break;
case RTM_DELMADDR:
case RTM_NEWMADDR:
len = sizeof(struct ifma_msghdr);
break;
case RTM_IFINFO:
len = sizeof(struct if_msghdr);
break;
case RTM_IFANNOUNCE:
case RTM_IEEE80211:
len = sizeof(struct if_announcemsghdr);
break;
default:
len = sizeof(struct rt_msghdr);
}
if (len > MCLBYTES)
panic("rt_msg1");
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m && len > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return (m);
m->m_pkthdr.len = m->m_len = len;
m->m_pkthdr.rcvif = NULL;
rtm = mtod(m, struct rt_msghdr *);
bzero((caddr_t)rtm, len);
for (i = 0; i < RTAX_MAX; i++) {
if ((sa = rtinfo->rti_info[i]) == NULL)
continue;
rtinfo->rti_addrs |= (1 << i);
dlen = SA_SIZE(sa);
m_copyback(m, len, dlen, (caddr_t)sa);
len += dlen;
}
if (m->m_pkthdr.len != len) {
m_freem(m);
return (NULL);
}
rtm->rtm_msglen = len;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
return (m);
}
static int
rt_msg2(int type, struct rt_addrinfo *rtinfo, caddr_t cp, struct walkarg *w)
{
int i;
int len, dlen, second_time = 0;
caddr_t cp0;
rtinfo->rti_addrs = 0;
again:
switch (type) {
case RTM_DELADDR:
case RTM_NEWADDR:
len = sizeof(struct ifa_msghdr);
break;
case RTM_IFINFO:
len = sizeof(struct if_msghdr);
break;
case RTM_NEWMADDR:
len = sizeof(struct ifma_msghdr);
break;
default:
len = sizeof(struct rt_msghdr);
}
cp0 = cp;
if (cp0)
cp += len;
for (i = 0; i < RTAX_MAX; i++) {
struct sockaddr *sa;
if ((sa = rtinfo->rti_info[i]) == NULL)
continue;
rtinfo->rti_addrs |= (1 << i);
dlen = SA_SIZE(sa);
if (cp) {
bcopy((caddr_t)sa, cp, (unsigned)dlen);
cp += dlen;
}
len += dlen;
}
len = ALIGN(len);
if (cp == NULL && w != NULL && !second_time) {
struct walkarg *rw = w;
if (rw->w_req) {
if (rw->w_tmemsize < len) {
if (rw->w_tmem)
free(rw->w_tmem, M_RTABLE);
rw->w_tmem = (caddr_t)
malloc(len, M_RTABLE, M_NOWAIT);
if (rw->w_tmem)
rw->w_tmemsize = len;
}
if (rw->w_tmem) {
cp = rw->w_tmem;
second_time = 1;
goto again;
}
}
}
if (cp) {
struct rt_msghdr *rtm = (struct rt_msghdr *)cp0;
rtm->rtm_version = RTM_VERSION;
rtm->rtm_type = type;
rtm->rtm_msglen = len;
}
return (len);
}
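/*
 * Note the two-pass idiom above (a reading of the call pattern, not a
 * new API): callers first invoke rt_msg2(type, rtinfo, NULL, NULL) to
 * learn the message length, allocate a buffer of that size, then call
 * it again with a non-NULL cp to fill the buffer in; the sysctl path
 * performs the same dance internally via w->w_tmem.
 */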
/*
* This routine is called to generate a message from the routing
* socket indicating that a redirect has occurred, a routing lookup
* has failed, or that a protocol has detected timeouts to a particular
* destination.
*/
void
rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
{
struct rt_msghdr *rtm;
struct mbuf *m;
struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
if (route_cb.any_count == 0)
return;
m = rt_msg1(type, rtinfo);
if (m == NULL)
return;
rtm = mtod(m, struct rt_msghdr *);
rtm->rtm_flags = RTF_DONE | flags;
rtm->rtm_errno = error;
rtm->rtm_addrs = rtinfo->rti_addrs;
rt_dispatch(m, sa);
}
/*
* This routine is called to generate a message from the routing
* socket indicating that the status of a network interface has changed.
*/
void
rt_ifmsg(struct ifnet *ifp)
{
struct if_msghdr *ifm;
struct mbuf *m;
struct rt_addrinfo info;
if (route_cb.any_count == 0)
return;
bzero((caddr_t)&info, sizeof(info));
m = rt_msg1(RTM_IFINFO, &info);
if (m == NULL)
return;
ifm = mtod(m, struct if_msghdr *);
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
ifm->ifm_data = ifp->if_data;
ifm->ifm_addrs = 0;
rt_dispatch(m, NULL);
}
/*
* This is called to generate messages from the routing socket
* indicating a network interface has had addresses associated with it.
* If we ever reverse the logic so that messages TO the routing
* socket indicate a request to configure interfaces, then this will
* be unnecessary as the routing socket will automatically generate
* copies of it.
*/
void
rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt)
{
struct rt_addrinfo info;
struct sockaddr *sa = NULL;
int pass;
struct mbuf *m = NULL;
struct ifnet *ifp = ifa->ifa_ifp;
KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE,
("unexpected cmd %u", cmd));
#ifdef SCTP
/*
* notify the SCTP stack
* this will only get called when an address is added/deleted
* XXX pass the ifaddr struct instead of ifa->ifa_addr...
*/
sctp_addr_change(ifa, cmd);
#endif /* SCTP */
if (route_cb.any_count == 0)
return;
for (pass = 1; pass < 3; pass++) {
bzero((caddr_t)&info, sizeof(info));
if ((cmd == RTM_ADD && pass == 1) ||
(cmd == RTM_DELETE && pass == 2)) {
struct ifa_msghdr *ifam;
int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
if ((m = rt_msg1(ncmd, &info)) == NULL)
continue;
ifam = mtod(m, struct ifa_msghdr *);
ifam->ifam_index = ifp->if_index;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_addrs = info.rti_addrs;
}
if ((cmd == RTM_ADD && pass == 2) ||
(cmd == RTM_DELETE && pass == 1)) {
struct rt_msghdr *rtm;
if (rt == NULL)
continue;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_DST] = sa = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
if ((m = rt_msg1(cmd, &info)) == NULL)
continue;
rtm = mtod(m, struct rt_msghdr *);
rtm->rtm_index = ifp->if_index;
rtm->rtm_flags |= rt->rt_flags;
rtm->rtm_errno = error;
rtm->rtm_addrs = info.rti_addrs;
}
rt_dispatch(m, sa);
}
}
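/*
 * Resulting message order from the two passes above (a reading of the
 * loop, not new behavior): for RTM_ADD, listeners see RTM_NEWADDR
 * before the RTM_ADD route message; for RTM_DELETE, the RTM_DELETE
 * route message arrives before RTM_DELADDR, so route messages are
 * always delivered while the address itself is still announced.
 */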
/*
* This is the analogue of rt_newaddrmsg, performing the same
* function but for multicast group memberships. This is easier since
* there is no route state to worry about.
*/
void
rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
{
struct rt_addrinfo info;
struct mbuf *m = NULL;
struct ifnet *ifp = ifma->ifma_ifp;
struct ifma_msghdr *ifmam;
if (route_cb.any_count == 0)
return;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_IFA] = ifma->ifma_addr;
info.rti_info[RTAX_IFP] = ifp ? ifp->if_addr->ifa_addr : NULL;
/*
* If a link-layer address is present, present it as a ``gateway''
* (similar to how, e.g., ARP entries are presented).
*/
info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
m = rt_msg1(cmd, &info);
if (m == NULL)
return;
ifmam = mtod(m, struct ifma_msghdr *);
KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
__func__));
ifmam->ifmam_index = ifp->if_index;
ifmam->ifmam_addrs = info.rti_addrs;
rt_dispatch(m, ifma->ifma_addr);
}
static struct mbuf *
rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
struct rt_addrinfo *info)
{
struct if_announcemsghdr *ifan;
struct mbuf *m;
if (route_cb.any_count == 0)
return NULL;
bzero((caddr_t)info, sizeof(*info));
m = rt_msg1(type, info);
if (m != NULL) {
ifan = mtod(m, struct if_announcemsghdr *);
ifan->ifan_index = ifp->if_index;
strlcpy(ifan->ifan_name, ifp->if_xname,
sizeof(ifan->ifan_name));
ifan->ifan_what = what;
}
return m;
}
/*
* This is called to generate routing socket messages indicating
* IEEE80211 wireless events.
* XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
*/
void
rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
{
struct mbuf *m;
struct rt_addrinfo info;
m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
if (m != NULL) {
/*
* Append the ieee80211 data. Try to stick it in the
* mbuf containing the ifannounce msg; otherwise allocate
* a new mbuf and append.
*
* NB: we assume m is a single mbuf.
*/
if (data_len > M_TRAILINGSPACE(m)) {
struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
if (n == NULL) {
m_freem(m);
return;
}
bcopy(data, mtod(n, void *), data_len);
n->m_len = data_len;
m->m_next = n;
} else if (data_len > 0) {
bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
m->m_len += data_len;
}
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len += data_len;
mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
rt_dispatch(m, NULL);
}
}
/*
* This is called to generate routing socket messages indicating
* network interface arrival and departure.
*/
void
rt_ifannouncemsg(struct ifnet *ifp, int what)
{
struct mbuf *m;
struct rt_addrinfo info;
m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
if (m != NULL)
rt_dispatch(m, NULL);
}
static void
rt_dispatch(struct mbuf *m, const struct sockaddr *sa)
{
+ INIT_VNET_NET(curvnet);
struct m_tag *tag;
/*
* Preserve the family from the sockaddr, if any, in an m_tag for
* use when injecting the mbuf into the routing socket buffer from
* the netisr.
*/
if (sa != NULL) {
tag = m_tag_get(PACKET_TAG_RTSOCKFAM, sizeof(unsigned short),
M_NOWAIT);
if (tag == NULL) {
m_freem(m);
return;
}
*(unsigned short *)(tag + 1) = sa->sa_family;
m_tag_prepend(m, tag);
}
netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */
}
/*
* This is used in dumping the kernel table via sysctl().
*/
static int
sysctl_dumpentry(struct radix_node *rn, void *vw)
{
struct walkarg *w = vw;
struct rtentry *rt = (struct rtentry *)rn;
int error = 0, size;
struct rt_addrinfo info;
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
if (rt->rt_ifp) {
info.rti_info[RTAX_IFP] = rt->rt_ifp->if_addr->ifa_addr;
info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
}
size = rt_msg2(RTM_GET, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = rt->rt_rmx.rmx_pksent;
rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
return (error);
}
return (error);
}
static int
sysctl_iflist(int af, struct walkarg *w)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifaddr *ifa;
struct rt_addrinfo info;
int len, error = 0;
bzero((caddr_t)&info, sizeof(info));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
ifa = ifp->if_addr;
info.rti_info[RTAX_IFP] = ifa->ifa_addr;
len = rt_msg2(RTM_IFINFO, &info, NULL, w);
info.rti_info[RTAX_IFP] = NULL;
if (w->w_req && w->w_tmem) {
struct if_msghdr *ifm;
ifm = (struct if_msghdr *)w->w_tmem;
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
ifm->ifm_data = ifp->if_data;
ifm->ifm_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req,(caddr_t)ifm, len);
if (error)
goto done;
}
while ((ifa = TAILQ_NEXT(ifa, ifa_link)) != NULL) {
if (af && af != ifa->ifa_addr->sa_family)
continue;
if (jailed(curthread->td_ucred) &&
prison_if(curthread->td_ucred, ifa->ifa_addr))
continue;
info.rti_info[RTAX_IFA] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
len = rt_msg2(RTM_NEWADDR, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct ifa_msghdr *ifam;
ifam = (struct ifa_msghdr *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
if (error)
goto done;
}
}
info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
info.rti_info[RTAX_BRD] = NULL;
}
done:
IFNET_RUNLOCK();
return (error);
}
int
sysctl_ifmalist(int af, struct walkarg *w)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
struct ifmultiaddr *ifma;
struct rt_addrinfo info;
int len, error = 0;
struct ifaddr *ifa;
bzero((caddr_t)&info, sizeof(info));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
ifa = ifp->if_addr;
info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (af && af != ifma->ifma_addr->sa_family)
continue;
if (jailed(curproc->p_ucred) &&
prison_if(curproc->p_ucred, ifma->ifma_addr))
continue;
info.rti_info[RTAX_IFA] = ifma->ifma_addr;
info.rti_info[RTAX_GATEWAY] =
(ifma->ifma_addr->sa_family != AF_LINK) ?
ifma->ifma_lladdr : NULL;
len = rt_msg2(RTM_NEWMADDR, &info, NULL, w);
if (w->w_req && w->w_tmem) {
struct ifma_msghdr *ifmam;
ifmam = (struct ifma_msghdr *)w->w_tmem;
ifmam->ifmam_index = ifma->ifma_ifp->if_index;
ifmam->ifmam_flags = 0;
ifmam->ifmam_addrs = info.rti_addrs;
error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
if (error) {
IF_ADDR_UNLOCK(ifp);
goto done;
}
}
}
IF_ADDR_UNLOCK(ifp);
}
done:
IFNET_RUNLOCK();
return (error);
}
static int
sysctl_rtsock(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_NET(curvnet);
int *name = (int *)arg1;
u_int namelen = arg2;
struct radix_node_head *rnh;
int i, lim, error = EINVAL;
u_char af;
struct walkarg w;
name++;
namelen--;
if (req->newptr)
return (EPERM);
if (namelen != 3)
return ((namelen < 3) ? EISDIR : ENOTDIR);
af = name[0];
if (af > AF_MAX)
return (EINVAL);
bzero(&w, sizeof(w));
w.w_op = name[1];
w.w_arg = name[2];
w.w_req = req;
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
switch (w.w_op) {
case NET_RT_DUMP:
case NET_RT_FLAGS:
if (af == 0) { /* dump all tables */
i = 1;
lim = AF_MAX;
} else /* dump only one table */
i = lim = af;
for (error = 0; error == 0 && i <= lim; i++)
if ((rnh = V_rt_tables[curthread->td_proc->p_fibnum][i]) != NULL) {
RADIX_NODE_HEAD_LOCK(rnh);
error = rnh->rnh_walktree(rnh,
sysctl_dumpentry, &w);
RADIX_NODE_HEAD_UNLOCK(rnh);
} else if (af != 0)
error = EAFNOSUPPORT;
break;
case NET_RT_IFLIST:
error = sysctl_iflist(af, &w);
break;
case NET_RT_IFMALIST:
error = sysctl_ifmalist(af, &w);
break;
}
if (w.w_tmem)
free(w.w_tmem, M_RTABLE);
return (error);
}
SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, "");
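/*
 * Illustrative userland sketch (standard sysctl(3) usage; error
 * handling omitted): retrieving the dump that sysctl_rtsock() produces
 * via the {CTL_NET, PF_ROUTE, 0, af, NET_RT_DUMP, 0} MIB.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <stdlib.h>
static void
example_dump_rtable(int af)
{
int mib[6] = { CTL_NET, PF_ROUTE, 0, af, NET_RT_DUMP, 0 };
size_t needed;
char *buf, *next;
struct rt_msghdr *rtm;
sysctl(mib, 6, NULL, &needed, NULL, 0); /* sizing pass */
buf = malloc(needed);
sysctl(mib, 6, buf, &needed, NULL, 0); /* fill pass */
for (next = buf; next < buf + needed; next += rtm->rtm_msglen) {
rtm = (struct rt_msghdr *)next;
/* sockaddrs follow rtm according to rtm_addrs, as in rt_msg2() */
}
free(buf);
}
#endif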
/*
* Definitions of protocols supported in the ROUTE domain.
*/
static struct domain routedomain; /* or at least forward */
static struct protosw routesw[] = {
{
.pr_type = SOCK_RAW,
.pr_domain = &routedomain,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_output = route_output,
.pr_ctlinput = raw_ctlinput,
.pr_init = raw_init,
.pr_usrreqs = &route_usrreqs
}
};
static struct domain routedomain = {
.dom_family = PF_ROUTE,
.dom_name = "route",
.dom_protosw = routesw,
.dom_protoswNPROTOSW = &routesw[sizeof(routesw)/sizeof(routesw[0])]
};
DOMAIN_SET(route);
Index: head/sys/net/vnet.h
===================================================================
--- head/sys/net/vnet.h (nonexistent)
+++ head/sys/net/vnet.h (revision 183550)
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2006-2008 University of Zagreb
+ * Copyright (c) 2006-2008 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NET_VNET_H_
+#define _NET_VNET_H_
+
+#ifdef VIMAGE
+#include "opt_route.h"
+
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/raw_cb.h>
+
+struct vnet_net {
+ int _if_index;
+ struct ifindex_entry *_ifindex_table;
+ struct ifnethead _ifnet;
+ struct ifgrouphead _ifg_head;
+
+ int _if_indexlim;
+ struct knlist _ifklist;
+
+ struct rtstat _rtstat;
+ struct radix_node_head *_rt_tables[RT_MAXFIBS][AF_MAX+1];
+ int _rttrash;
+
+ struct ifnet *_loif;
+ LIST_HEAD(, lo_softc) _lo_list;
+
+ LIST_HEAD(, rawcb) _rawcb_list;
+
+ int _ether_ipfw;
+};
+
+#endif
+
+/*
+ * Symbol translation macros
+ */
+#define INIT_VNET_NET(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_NET, struct vnet_net, vnet_net)
+
+#define VNET_NET(sym) VSYM(vnet_net, sym)
+
+#define V_ether_ipfw VNET_NET(ether_ipfw)
+#define V_if_index VNET_NET(if_index)
+#define V_if_indexlim VNET_NET(if_indexlim)
+#define V_ifg_head VNET_NET(ifg_head)
+#define V_ifindex_table VNET_NET(ifindex_table)
+#define V_ifklist VNET_NET(ifklist)
+#define V_ifnet VNET_NET(ifnet)
+#define V_lo_list VNET_NET(lo_list)
+#define V_loif VNET_NET(loif)
+#define V_rawcb_list VNET_NET(rawcb_list)
+#define V_rt_tables VNET_NET(rt_tables)
+#define V_rtstat VNET_NET(rtstat)
+#define V_rttrash VNET_NET(rttrash)
+
+#endif /* !_NET_VNET_H_ */
Property changes on: head/sys/net/vnet.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/sys/net80211/ieee80211_ddb.c
===================================================================
--- head/sys/net80211/ieee80211_ddb.c (revision 183549)
+++ head/sys/net80211/ieee80211_ddb.c (revision 183550)
@@ -1,799 +1,804 @@
/*-
* Copyright (c) 2007-2008 Sam Leffler, Errno Consulting
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_wlan.h"
#ifdef DDB
/*
* IEEE 802.11 DDB support
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/ethernet.h>
#include <net80211/ieee80211_var.h>
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#define IEEE80211_MSG_BITS \
"\20\3IOCTL\4WDS\5ACTION\6RATECTL\7ROAM\10INACT\11DOTH\12SUPERG" \
"\13WME\14ACL\15WPA\16RADKEYS\17RADDUMP\20RADIUS\21DOT1X\22POWER" \
"\23STATE\24OUTPUT\25SCAN\26AUTH\27ASSOC\30NODE\31ELEMID\32XRATE" \
"\33INPUT\34CRYPTO\35DUPMPKTS\36DEBUG\3711N"
#define IEEE80211_F_BITS \
"\20\1TURBOP\2COMP\3FF\4BURST\5PRIVACY\6PUREG\10SCAN\11ASCAN\12SIBSS" \
"\13SHSLOT\14PMGTON\15DESBSSID\16WME\17BGSCAN\20SWRETRY\21TXPOW_FIXED" \
"\22IBSSON\23SHPREAMBLE\24DATAPAD\25USEPROT\26USERBARKER\27CSAPENDING" \
"\30WPA1\31WPA2\32DROPUNENC\33COUNTERM\34HIDESSID\35NOBRIDG\36PCF" \
"\37DOTH\40DWDS"
#define IEEE80211_FEXT_BITS \
"\20\1NONHT_PR\2INACT\3SCANWAIT\4BGSCAN\5WPS\6TSN\7SCANREQ\10RESUME" \
"\12NONEPR_PR\13SWBMISS\14DFS\15DOTD\22WDSLEGACY\23PROBECHAN\24HT" \
"\25AMDPU_TX\26AMPDU_TX\27AMSDU_TX\30AMSDU_RX\31USEHT40\32PUREN" \
"\33SHORTGI20\34SHORTGI40\35HTCOMPAT\36RIFS"
#define IEEE80211_FVEN_BITS "\20"
#define IEEE80211_C_BITS \
"\20\1STA\7FF\10TURBOP\11IBSS\12PMGT" \
"\13HOSTAP\14AHDEMO\15SWRETRY\16TXPMGT\17SHSLOT\20SHPREAMBLE" \
"\21MONITOR\22DFS\30WPA1\31WPA2\32BURST\33WME\34WDS\36BGSCAN" \
"\37TXFRAG"
#define IEEE80211_C_CRYPTO_BITS \
"\20\1WEP\2TKIP\3AES\4AES_CCM\5TKIPMIC\6CKIP\12PMGT"
#define IEEE80211_C_HTCAP_BITS \
"\20\1LDPC\2CHWIDTH40\5GREENFIELD\6SHORTGI20\7SHORTGI40\10TXSTBC" \
"\21AMPDU\22AMSDU\23HT\24SMPS\25RIFS"
/* NB: policy bits not included */
#define IEEE80211_CHAN_BITS \
"\20\5TURBO\6CCK\7OFDM\0102GHZ\0115GHZ\12PASSIVE\13DYN\14GFSK" \
"\15STURBO\16HALF\17QUARTER\20HT20\21HT40U\22HT40D\23DFS"
#define IEEE80211_NODE_BITS \
"\20\1AUTH\2QOS\3ERP\5PWR_MGT\6AREF\7HT\10HTCOMPAT\11WPS\12TSN" \
"\13AMPDU_RX\14AMPDU_TX\15MIMO_PS\16MIMO_RTS\17RIFS\20SGI20\21SGI40"
#define IEEE80211_ERP_BITS \
"\20\1NON_ERP_PRESENT\2USE_PROTECTION\3LONG_PREAMBLE"
#define IEEE80211_CAPINFO_BITS \
"\20\1ESS\2IBSS\3CF_POLLABLE\4CF_POLLREQ\5PRIVACY\6SHORT_PREAMBLE" \
"\7PBCC\10CHNL_AGILITY\11SPECTRUM_MGMT\13SHORT_SLOTTIME\14RSN" \
"\16DSSOFDM"
#define IEEE80211_HTCAP_BITS \
"\20\1LDPC\2CHWIDTH40\5GREENFIELD\6SHORTGI20\7SHORTGI40\10TXSTBC" \
"\13DELBA\14AMSDU(7935)\15DSSSCCK40\16PSMP\1740INTOLERANT" \
"\20LSIGTXOPPROT"
#define IEEE80211_AGGR_BITS \
"\20\1IMMEDIATE\2XCHGPEND\3RUNNING\4SETUP\5NAK"
#define DB_PRINTSYM(prefix, addr) \
db_printf(prefix " "); \
db_printsym((db_addr_t) addr, DB_STGY_ANY); \
db_printf("\n");
static void _db_show_sta(const struct ieee80211_node *);
static void _db_show_vap(const struct ieee80211vap *, int);
static void _db_show_com(const struct ieee80211com *,
int showvaps, int showsta, int showprocs);
static void _db_show_channel(const char *tag, const struct ieee80211_channel *);
static void _db_show_ssid(const char *tag, int ix, int len, const uint8_t *);
static void _db_show_appie(const char *tag, const struct ieee80211_appie *);
static void _db_show_key(const char *tag, int ix, const struct ieee80211_key *);
static void _db_show_roamparams(const char *tag, const void *arg,
const struct ieee80211_roamparam *rp);
static void _db_show_txparams(const char *tag, const void *arg,
const struct ieee80211_txparam *tp);
static void _db_show_stats(const struct ieee80211_stats *);
DB_SHOW_COMMAND(sta, db_show_sta)
{
if (!have_addr) {
db_printf("usage: show sta <addr>\n");
return;
}
_db_show_sta((const struct ieee80211_node *) addr);
}
DB_SHOW_COMMAND(vap, db_show_vap)
{
int i, showprocs = 0;
if (!have_addr) {
db_printf("usage: show vap <addr>\n");
return;
}
for (i = 0; modif[i] != '\0'; i++)
switch (modif[i]) {
case 'a':
showprocs = 1;
break;
case 'p':
showprocs = 1;
break;
}
_db_show_vap((const struct ieee80211vap *) addr, showprocs);
}
DB_SHOW_COMMAND(com, db_show_com)
{
const struct ieee80211com *ic;
int i, showprocs = 0, showvaps = 0, showsta = 0;
if (!have_addr) {
db_printf("usage: show com <addr>\n");
return;
}
for (i = 0; modif[i] != '\0'; i++)
switch (modif[i]) {
case 'a':
showsta = showvaps = showprocs = 1;
break;
case 's':
showsta = 1;
break;
case 'v':
showvaps = 1;
break;
case 'p':
showprocs = 1;
break;
}
ic = (const struct ieee80211com *) addr;
_db_show_com(ic, showvaps, showsta, showprocs);
}
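/*
 * Usage from the ddb(4) prompt (illustrative; the address is invented):
 *
 *	db> show com/a 0xc3a14000
 *
 * where the modifiers mirror the switch above: /s stations, /v vaps,
 * /p procedures, /a all of them.
 */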
DB_SHOW_ALL_COMMAND(vaps, db_show_all_vaps)
{
+ VNET_ITERATOR_DECL(vnet_iter);
const struct ifnet *ifp;
int i, showall = 0;
for (i = 0; modif[i] != '\0'; i++)
switch (modif[i]) {
case 'a':
showall = 1;
break;
}
- TAILQ_FOREACH(ifp, &V_ifnet, if_list)
- if (ifp->if_type == IFT_IEEE80211) {
- const struct ieee80211com *ic = ifp->if_l2com;
+ VNET_FOREACH(vnet_iter) {
+ INIT_VNET_NET(vnet_iter);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_list)
+ if (ifp->if_type == IFT_IEEE80211) {
+ const struct ieee80211com *ic = ifp->if_l2com;
- if (!showall) {
- const struct ieee80211vap *vap;
- db_printf("%s: com %p vaps:",
- ifp->if_xname, ic);
- TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
- db_printf(" %s(%p)",
- vap->iv_ifp->if_xname, vap);
- db_printf("\n");
- } else
- _db_show_com(ic, 1, 1, 1);
- }
+ if (!showall) {
+ const struct ieee80211vap *vap;
+ db_printf("%s: com %p vaps:",
+ ifp->if_xname, ic);
+ TAILQ_FOREACH(vap, &ic->ic_vaps,
+ iv_next)
+ db_printf(" %s(%p)",
+ vap->iv_ifp->if_xname, vap);
+ db_printf("\n");
+ } else
+ _db_show_com(ic, 1, 1, 1);
+ }
+ }
}
static void
_db_show_txampdu(const char *sep, int ix, const struct ieee80211_tx_ampdu *tap)
{
db_printf("%stxampdu[%d]: %p flags %b ac %u\n",
sep, ix, tap, tap->txa_flags, IEEE80211_AGGR_BITS, tap->txa_ac);
db_printf("%s token %u qbytes %d qframes %d start %u wnd %u\n",
sep, tap->txa_token, tap->txa_qbytes, tap->txa_qframes,
tap->txa_start, tap->txa_wnd);
db_printf("%s attempts %d nextrequest %d\n",
sep, tap->txa_attempts, tap->txa_nextrequest);
/* XXX packet q + timer */
}
static void
_db_show_rxampdu(const char *sep, int ix, const struct ieee80211_rx_ampdu *rap)
{
db_printf("%srxampdu[%d]: %p flags 0x%x tid %u\n",
sep, ix, rap, rap->rxa_flags, ix /*XXX */);
db_printf("%s qbytes %d qframes %d seqstart %u start %u wnd %u\n",
sep, rap->rxa_qbytes, rap->rxa_qframes,
rap->rxa_seqstart, rap->rxa_start, rap->rxa_wnd);
db_printf("%s age %d nframes %d\n",
sep, rap->rxa_age, rap->rxa_nframes);
}
static void
_db_show_sta(const struct ieee80211_node *ni)
{
int i;
db_printf("0x%p: mac %s refcnt %d\n", ni,
ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni));
db_printf("\tvap %p wdsvap %p ic %p table %p\n",
ni->ni_vap, ni->ni_wdsvap, ni->ni_ic, ni->ni_table);
db_printf("\tflags=%b\n", ni->ni_flags, IEEE80211_NODE_BITS);
db_printf("\tscangen %u authmode %u ath_flags 0x%x ath_defkeyix %u\n",
ni->ni_scangen, ni->ni_authmode,
ni->ni_ath_flags, ni->ni_ath_defkeyix);
db_printf("\tassocid 0x%x txpower %u vlan %u\n",
ni->ni_associd, ni->ni_txpower, ni->ni_vlan);
db_printf("\tjointime %d (%lu secs) challenge %p\n",
ni->ni_jointime, (unsigned long)(time_uptime - ni->ni_jointime),
ni->ni_challenge);
db_printf("\ties: data %p len %d\n", ni->ni_ies.data, ni->ni_ies.len);
db_printf("\t[wpa_ie %p rsn_ie %p wme_ie %p ath_ie %p\n",
ni->ni_ies.wpa_ie, ni->ni_ies.rsn_ie, ni->ni_ies.wme_ie,
ni->ni_ies.ath_ie);
db_printf("\t htcap_ie %p htinfo_ie %p]\n",
ni->ni_ies.htcap_ie, ni->ni_ies.htinfo_ie);
db_printf("\ttxseq %u rxseq %u fragno %u rxfragstamp %u\n",
ni->ni_txseqs[IEEE80211_NONQOS_TID],
ni->ni_rxseqs[IEEE80211_NONQOS_TID] >> IEEE80211_SEQ_SEQ_SHIFT,
ni->ni_rxseqs[IEEE80211_NONQOS_TID] & IEEE80211_SEQ_FRAG_MASK,
ni->ni_rxfragstamp);
db_printf("\trxfrag[0] %p rxfrag[1] %p rxfrag[2] %p\n",
ni->ni_rxfrag[0], ni->ni_rxfrag[1], ni->ni_rxfrag[2]);
db_printf("\trstamp %u avgrssi 0x%x (rssi %d) noise %d\n",
ni->ni_rstamp, ni->ni_avgrssi,
IEEE80211_RSSI_GET(ni->ni_avgrssi), ni->ni_noise);
db_printf("\tintval %u capinfo %b\n",
ni->ni_intval, ni->ni_capinfo, IEEE80211_CAPINFO_BITS);
db_printf("\tbssid %s", ether_sprintf(ni->ni_bssid));
_db_show_ssid(" essid ", 0, ni->ni_esslen, ni->ni_essid);
db_printf("\n");
_db_show_channel("\tchannel", ni->ni_chan);
db_printf("\n");
db_printf("\terp %b dtim_period %u dtim_count %u\n",
ni->ni_erp, IEEE80211_ERP_BITS,
ni->ni_dtim_period, ni->ni_dtim_count);
db_printf("\thtcap %b htparam 0x%x htctlchan %u ht2ndchan %u\n",
ni->ni_htcap, IEEE80211_HTCAP_BITS,
ni->ni_htparam, ni->ni_htctlchan, ni->ni_ht2ndchan);
db_printf("\thtopmode 0x%x htstbc 0x%x chw %u\n",
ni->ni_htopmode, ni->ni_htstbc, ni->ni_chw);
/* XXX ampdu state */
for (i = 0; i < WME_NUM_AC; i++)
if (ni->ni_tx_ampdu[i].txa_flags & IEEE80211_AGGR_SETUP)
_db_show_txampdu("\t", i, &ni->ni_tx_ampdu[i]);
for (i = 0; i < WME_NUM_TID; i++)
if (ni->ni_rx_ampdu[i].rxa_nframes)
_db_show_rxampdu("\t", i, &ni->ni_rx_ampdu[i]);
db_printf("\tinact %u inact_reload %u txrate %u\n",
ni->ni_inact, ni->ni_inact_reload, ni->ni_txrate);
/* XXX savedq */
/* XXX wdsq */
}
static void
_db_show_vap(const struct ieee80211vap *vap, int showprocs)
{
const struct ieee80211com *ic = vap->iv_ic;
int i;
db_printf("%p:", vap);
db_printf(" bss %p", vap->iv_bss);
db_printf(" myaddr %s", ether_sprintf(vap->iv_myaddr));
db_printf("\n");
db_printf("\topmode %s", ieee80211_opmode_name[vap->iv_opmode]);
db_printf(" state %s", ieee80211_state_name[vap->iv_state]);
db_printf(" ifp %p", vap->iv_ifp);
db_printf("\n");
db_printf("\tic %p", vap->iv_ic);
db_printf(" media %p", &vap->iv_media);
db_printf(" bpf_if %p", vap->iv_rawbpf);
db_printf(" mgtsend %p", &vap->iv_mgtsend);
#if 0
struct sysctllog *iv_sysctl; /* dynamic sysctl context */
#endif
db_printf("\n");
db_printf("\tdebug=%b\n", vap->iv_debug, IEEE80211_MSG_BITS);
db_printf("\tflags=%b\n", vap->iv_flags, IEEE80211_F_BITS);
db_printf("\tflags_ext=%b\n", vap->iv_flags_ext, IEEE80211_FEXT_BITS);
db_printf("\tflags_ven=%b\n", vap->iv_flags_ven, IEEE80211_FVEN_BITS);
db_printf("\tcaps=%b\n", vap->iv_caps, IEEE80211_C_BITS);
db_printf("\thtcaps=%b\n", vap->iv_htcaps, IEEE80211_C_HTCAP_BITS);
_db_show_stats(&vap->iv_stats);
db_printf("\tinact_init %d", vap->iv_inact_init);
db_printf(" inact_auth %d", vap->iv_inact_auth);
db_printf(" inact_run %d", vap->iv_inact_run);
db_printf(" inact_probe %d", vap->iv_inact_probe);
db_printf("\n");
db_printf("\tdes_nssid %d", vap->iv_des_nssid);
if (vap->iv_des_nssid)
_db_show_ssid(" des_ssid[%u] ", 0,
vap->iv_des_ssid[0].len, vap->iv_des_ssid[0].ssid);
db_printf(" des_bssid %s", ether_sprintf(vap->iv_des_bssid));
db_printf("\n");
db_printf("\tdes_mode %d", vap->iv_des_mode);
_db_show_channel(" des_chan", vap->iv_des_chan);
db_printf("\n");
#if 0
int iv_nicknamelen; /* XXX junk */
uint8_t iv_nickname[IEEE80211_NWID_LEN];
#endif
db_printf("\tbgscanidle %u", vap->iv_bgscanidle);
db_printf(" bgscanintvl %u", vap->iv_bgscanintvl);
db_printf(" scanvalid %u", vap->iv_scanvalid);
db_printf("\n");
db_printf("\tscanreq_duration %u", vap->iv_scanreq_duration);
db_printf(" scanreq_mindwell %u", vap->iv_scanreq_mindwell);
db_printf(" scanreq_maxdwell %u", vap->iv_scanreq_maxdwell);
db_printf("\n");
db_printf(" scanreq_flags 0x%x", vap->iv_scanreq_flags);
db_printf("\tscanreq_nssid %d", vap->iv_scanreq_nssid);
for (i = 0; i < vap->iv_scanreq_nssid; i++)
_db_show_ssid(" scanreq_ssid[%u]", i,
vap->iv_scanreq_ssid[i].len, vap->iv_scanreq_ssid[i].ssid);
db_printf(" roaming %d", vap->iv_roaming);
db_printf("\n");
for (i = IEEE80211_MODE_11A; i < IEEE80211_MODE_MAX; i++)
if (isset(ic->ic_modecaps, i)) {
_db_show_roamparams("\troamparms[%s]",
ieee80211_phymode_name[i], &vap->iv_roamparms[i]);
db_printf("\n");
}
db_printf("\tbmissthreshold %u", vap->iv_bmissthreshold);
db_printf(" bmiss_max %u", vap->iv_bmiss_count);
db_printf(" bmiss_max %d", vap->iv_bmiss_max);
db_printf("\n");
db_printf("\tswbmiss_count %u", vap->iv_swbmiss_count);
db_printf(" swbmiss_period %u", vap->iv_swbmiss_period);
db_printf(" swbmiss %p", &vap->iv_swbmiss);
db_printf("\n");
db_printf("\tampdu_rxmax %d", vap->iv_ampdu_rxmax);
db_printf(" ampdu_density %d", vap->iv_ampdu_density);
db_printf(" ampdu_limit %d", vap->iv_ampdu_limit);
db_printf(" amsdu_limit %d", vap->iv_amsdu_limit);
db_printf("\n");
db_printf("\tmax_aid %u", vap->iv_max_aid);
db_printf(" aid_bitmap %p", vap->iv_aid_bitmap);
db_printf("\n");
db_printf("\tsta_assoc %u", vap->iv_sta_assoc);
db_printf(" ps_sta %u", vap->iv_ps_sta);
db_printf(" ps_pending %u", vap->iv_ps_pending);
db_printf(" tim_len %u", vap->iv_tim_len);
db_printf(" tim_bitmap %p", vap->iv_tim_bitmap);
db_printf("\n");
db_printf("\tdtim_period %u", vap->iv_dtim_period);
db_printf(" dtim_count %u", vap->iv_dtim_count);
db_printf(" set_tim %p", vap->iv_set_tim);
db_printf(" csa_count %d", vap->iv_csa_count);
db_printf("\n");
db_printf("\trtsthreshold %u", vap->iv_rtsthreshold);
db_printf(" fragthreshold %u", vap->iv_fragthreshold);
db_printf(" inact_timer %d", vap->iv_inact_timer);
db_printf("\n");
for (i = IEEE80211_MODE_11A; i < IEEE80211_MODE_MAX; i++)
if (isset(ic->ic_modecaps, i)) {
_db_show_txparams("\ttxparms[%s]",
ieee80211_phymode_name[i], &vap->iv_txparms[i]);
db_printf("\n");
}
/* application-specified IE's to attach to mgt frames */
_db_show_appie("\tappie_beacon", vap->iv_appie_beacon);
_db_show_appie("\tappie_probereq", vap->iv_appie_probereq);
_db_show_appie("\tappie_proberesp", vap->iv_appie_proberesp);
_db_show_appie("\tappie_assocreq", vap->iv_appie_assocreq);
_db_show_appie("\tappie_asscoresp", vap->iv_appie_assocresp);
_db_show_appie("\tappie_wpa", vap->iv_appie_wpa);
if (vap->iv_wpa_ie != NULL || vap->iv_rsn_ie != NULL) {
if (vap->iv_wpa_ie != NULL)
db_printf("\twpa_ie %p", vap->iv_wpa_ie);
if (vap->iv_rsn_ie != NULL)
db_printf("\trsn_ie %p", vap->iv_rsn_ie);
db_printf("\n");
}
db_printf("\tmax_keyix %u", vap->iv_max_keyix);
db_printf(" def_txkey %d", vap->iv_def_txkey);
db_printf("\n");
for (i = 0; i < IEEE80211_WEP_NKID; i++)
_db_show_key("\tnw_keys[%u]", i, &vap->iv_nw_keys[i]);
db_printf("\tauth %p", vap->iv_auth);
db_printf(" ec %p", vap->iv_ec);
db_printf(" acl %p", vap->iv_acl);
db_printf(" as %p", vap->iv_as);
db_printf("\n");
if (showprocs) {
DB_PRINTSYM("\tiv_key_alloc", vap->iv_key_alloc);
DB_PRINTSYM("\tiv_key_delete", vap->iv_key_delete);
DB_PRINTSYM("\tiv_key_set", vap->iv_key_set);
DB_PRINTSYM("\tiv_key_update_begin", vap->iv_key_update_begin);
DB_PRINTSYM("\tiv_key_update_end", vap->iv_key_update_end);
DB_PRINTSYM("\tiv_opdetach", vap->iv_opdetach);
DB_PRINTSYM("\tiv_input", vap->iv_input);
DB_PRINTSYM("\tiv_recv_mgmt", vap->iv_recv_mgmt);
DB_PRINTSYM("\tiv_deliver_data", vap->iv_deliver_data);
DB_PRINTSYM("\tiv_bmiss", vap->iv_bmiss);
DB_PRINTSYM("\tiv_reset", vap->iv_reset);
DB_PRINTSYM("\tiv_update_beacon", vap->iv_update_beacon);
DB_PRINTSYM("\tiv_newstate", vap->iv_newstate);
DB_PRINTSYM("\tiv_output", vap->iv_output);
}
}
static void
_db_show_com(const struct ieee80211com *ic, int showvaps, int showsta, int showprocs)
{
struct ieee80211vap *vap;
db_printf("%p:", ic);
TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
db_printf(" %s(%p)", vap->iv_ifp->if_xname, vap);
db_printf("\n");
db_printf("\tifp %p", ic->ic_ifp);
db_printf(" comlock %p", &ic->ic_comlock);
db_printf("\n");
_db_show_stats(&ic->ic_stats);
db_printf("\theadroom %d", ic->ic_headroom);
db_printf(" phytype %d", ic->ic_phytype);
db_printf(" opmode %s", ieee80211_opmode_name[ic->ic_opmode]);
db_printf("\n");
db_printf("\tmedia %p", &ic->ic_media);
db_printf(" myaddr %s", ether_sprintf(ic->ic_myaddr));
db_printf(" inact %p", &ic->ic_inact);
db_printf("\n");
db_printf("\tflags=%b\n", ic->ic_flags, IEEE80211_F_BITS);
db_printf("\tflags_ext=%b\n", ic->ic_flags_ext, IEEE80211_FEXT_BITS);
db_printf("\tflags_ven=%b\n", ic->ic_flags_ven, IEEE80211_FVEN_BITS);
db_printf("\tcaps=%b\n", ic->ic_caps, IEEE80211_C_BITS);
db_printf("\tcryptocaps=%b\n",
ic->ic_cryptocaps, IEEE80211_C_CRYPTO_BITS);
db_printf("\thtcaps=%b\n", ic->ic_htcaps, IEEE80211_HTCAP_BITS);
#if 0
uint8_t ic_modecaps[2]; /* set of mode capabilities */
#endif
db_printf("\tcurmode %u", ic->ic_curmode);
db_printf(" promisc %u", ic->ic_promisc);
db_printf(" allmulti %u", ic->ic_allmulti);
db_printf(" nrunning %u", ic->ic_nrunning);
db_printf("\n");
db_printf("\tbintval %u", ic->ic_bintval);
db_printf(" lintval %u", ic->ic_lintval);
db_printf(" holdover %u", ic->ic_holdover);
db_printf(" txpowlimit %u", ic->ic_txpowlimit);
db_printf("\n");
#if 0
struct ieee80211_rateset ic_sup_rates[IEEE80211_MODE_MAX];
#endif
/*
* Channel state:
*
* ic_channels is the set of available channels for the device;
* it is setup by the driver
* ic_nchans is the number of valid entries in ic_channels
* ic_chan_avail is a bit vector of these channels used to check
* whether a channel is available w/o searching the channel table.
* ic_chan_active is a (potentially) constrained subset of
* ic_chan_avail that reflects any mode setting or user-specified
* limit on the set of channels to use/scan
* ic_curchan is the current channel the device is set to; it may
* be different from ic_bsschan when we are off-channel scanning
* or otherwise doing background work
* ic_bsschan is the channel selected for operation; it may
* be undefined (IEEE80211_CHAN_ANYC)
* ic_prevchan is a cached ``previous channel'' used to optimize
* lookups when switching back+forth between two channels
* (e.g. for dynamic turbo)
*/
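/*
* As a sketch of how the availability bit vector is consulted (the
* channel pointer "c" here is assumed for illustration): rather than
* searching ic_channels, callers test
*
*	if (isset(ic->ic_chan_avail, c->ic_ieee))
*		...the channel may be used...
*/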
db_printf("\tnchans %d", ic->ic_nchans);
#if 0
struct ieee80211_channel ic_channels[IEEE80211_CHAN_MAX+1];
uint8_t ic_chan_avail[IEEE80211_CHAN_BYTES];
uint8_t ic_chan_active[IEEE80211_CHAN_BYTES];
uint8_t ic_chan_scan[IEEE80211_CHAN_BYTES];
#endif
db_printf("\n");
_db_show_channel("\tcurchan", ic->ic_curchan);
db_printf("\n");
_db_show_channel("\tbsschan", ic->ic_bsschan);
db_printf("\n");
_db_show_channel("\tprevchan", ic->ic_prevchan);
db_printf("\n");
db_printf("\tregdomain %p", &ic->ic_regdomain);
db_printf("\n");
_db_show_channel("\tcsa_newchan", ic->ic_csa_newchan);
db_printf(" csa_count %d", ic->ic_csa_count);
db_printf( "dfs %p", &ic->ic_dfs);
db_printf("\n");
db_printf("\tscan %p", ic->ic_scan);
db_printf(" lastdata %d", ic->ic_lastdata);
db_printf(" lastscan %d", ic->ic_lastscan);
db_printf("\n");
db_printf("\tmax_keyix %d", ic->ic_max_keyix);
db_printf(" sta %p", &ic->ic_sta);
db_printf(" wme %p", &ic->ic_wme);
db_printf("\n");
db_printf("\tprotmode %d", ic->ic_protmode);
db_printf(" nonerpsta %u", ic->ic_nonerpsta);
db_printf(" longslotsta %u", ic->ic_longslotsta);
db_printf(" lastnonerp %d", ic->ic_lastnonerp);
db_printf("\n");
db_printf("\tsta_assoc %u", ic->ic_sta_assoc);
db_printf(" ht_sta_assoc %u", ic->ic_ht_sta_assoc);
db_printf(" ht40_sta_assoc %u", ic->ic_ht40_sta_assoc);
db_printf("\n");
db_printf("\tcurhtprotmode 0x%x", ic->ic_curhtprotmode);
db_printf(" htprotmode %d", ic->ic_htprotmode);
db_printf(" lastnonht %d", ic->ic_lastnonht);
db_printf("\n");
if (showprocs) {
DB_PRINTSYM("\tic_vap_create", ic->ic_vap_create);
DB_PRINTSYM("\tic_vap_delete", ic->ic_vap_delete);
#if 0
/* operating mode attachment */
ieee80211vap_attach ic_vattach[IEEE80211_OPMODE_MAX];
#endif
DB_PRINTSYM("\tic_newassoc", ic->ic_newassoc);
DB_PRINTSYM("\tic_getradiocaps", ic->ic_getradiocaps);
DB_PRINTSYM("\tic_setregdomain", ic->ic_setregdomain);
DB_PRINTSYM("\tic_send_mgmt", ic->ic_send_mgmt);
DB_PRINTSYM("\tic_raw_xmit", ic->ic_raw_xmit);
DB_PRINTSYM("\tic_updateslot", ic->ic_updateslot);
DB_PRINTSYM("\tic_update_mcast", ic->ic_update_mcast);
DB_PRINTSYM("\tic_update_promisc", ic->ic_update_promisc);
DB_PRINTSYM("\tic_node_alloc", ic->ic_node_alloc);
DB_PRINTSYM("\tic_node_free", ic->ic_node_free);
DB_PRINTSYM("\tic_node_cleanup", ic->ic_node_cleanup);
DB_PRINTSYM("\tic_node_getrssi", ic->ic_node_getrssi);
DB_PRINTSYM("\tic_node_getsignal", ic->ic_node_getsignal);
DB_PRINTSYM("\tic_node_getmimoinfo", ic->ic_node_getmimoinfo);
DB_PRINTSYM("\tic_scan_start", ic->ic_scan_start);
DB_PRINTSYM("\tic_scan_end", ic->ic_scan_end);
DB_PRINTSYM("\tic_set_channel", ic->ic_set_channel);
DB_PRINTSYM("\tic_scan_curchan", ic->ic_scan_curchan);
DB_PRINTSYM("\tic_scan_mindwell", ic->ic_scan_mindwell);
DB_PRINTSYM("\tic_recv_action", ic->ic_recv_action);
DB_PRINTSYM("\tic_send_action", ic->ic_send_action);
DB_PRINTSYM("\tic_addba_request", ic->ic_addba_request);
DB_PRINTSYM("\tic_addba_response", ic->ic_addba_response);
DB_PRINTSYM("\tic_addba_stop", ic->ic_addba_stop);
}
if (showvaps && !TAILQ_EMPTY(&ic->ic_vaps)) {
db_printf("\n");
TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
_db_show_vap(vap, showprocs);
}
if (showsta && !TAILQ_EMPTY(&ic->ic_sta.nt_node)) {
const struct ieee80211_node_table *nt = &ic->ic_sta;
const struct ieee80211_node *ni;
TAILQ_FOREACH(ni, &nt->nt_node, ni_list) {
db_printf("\n");
_db_show_sta(ni);
}
}
}
static void
_db_show_channel(const char *tag, const struct ieee80211_channel *c)
{
db_printf("%s ", tag);
if (c == NULL)
db_printf("<NULL>");
else if (c == IEEE80211_CHAN_ANYC)
db_printf("<ANY>");
else
db_printf("[%u (%u) flags=%b maxreg %u maxpow %u minpow %u state 0x%x extieee %u]",
c->ic_freq, c->ic_ieee,
c->ic_flags, IEEE80211_CHAN_BITS,
c->ic_maxregpower, c->ic_maxpower, c->ic_minpower,
c->ic_state, c->ic_extieee);
}
static void
_db_show_ssid(const char *tag, int ix, int len, const uint8_t *ssid)
{
const uint8_t *p;
int i;
db_printf(tag, ix);
if (len > IEEE80211_NWID_LEN)
len = IEEE80211_NWID_LEN;
/* determine printable or not */
for (i = 0, p = ssid; i < len; i++, p++) {
if (*p < ' ' || *p > 0x7e)
break;
}
if (i == len) {
db_printf("\"");
for (i = 0, p = ssid; i < len; i++, p++)
db_printf("%c", *p);
db_printf("\"");
} else {
db_printf("0x");
for (i = 0, p = ssid; i < len; i++, p++)
db_printf("%02x", *p);
}
}
static void
_db_show_appie(const char *tag, const struct ieee80211_appie *ie)
{
const uint8_t *p;
int i;
if (ie == NULL)
return;
db_printf("%s [0x", tag);
for (i = 0, p = ie->ie_data; i < ie->ie_len; i++, p++)
db_printf("%02x", *p);
db_printf("]\n");
}
static void
_db_show_key(const char *tag, int ix, const struct ieee80211_key *wk)
{
static const uint8_t zerodata[IEEE80211_KEYBUF_SIZE];
const struct ieee80211_cipher *cip = wk->wk_cipher;
int keylen = wk->wk_keylen;
if ((wk->wk_flags & IEEE80211_KEY_DEVKEY) == 0)
return;
db_printf(tag, ix);
switch (cip->ic_cipher) {
case IEEE80211_CIPHER_WEP:
/* compatibility */
db_printf(" wepkey %u:%s", wk->wk_keyix,
keylen <= 5 ? "40-bit" :
keylen <= 13 ? "104-bit" : "128-bit");
break;
case IEEE80211_CIPHER_TKIP:
if (keylen > 128/8)
keylen -= 128/8; /* ignore MIC for now */
db_printf(" TKIP %u:%u-bit", wk->wk_keyix, 8*keylen);
break;
case IEEE80211_CIPHER_AES_OCB:
db_printf(" AES-OCB %u:%u-bit", wk->wk_keyix, 8*keylen);
break;
case IEEE80211_CIPHER_AES_CCM:
db_printf(" AES-CCM %u:%u-bit", wk->wk_keyix, 8*keylen);
break;
case IEEE80211_CIPHER_CKIP:
db_printf(" CKIP %u:%u-bit", wk->wk_keyix, 8*keylen);
break;
case IEEE80211_CIPHER_NONE:
db_printf(" NULL %u:%u-bit", wk->wk_keyix, 8*keylen);
break;
default:
db_printf(" UNKNOWN (0x%x) %u:%u-bit",
cip->ic_cipher, wk->wk_keyix, 8*keylen);
break;
}
if (memcmp(wk->wk_key, zerodata, keylen) != 0) {
int i;
db_printf(" <");
for (i = 0; i < keylen; i++)
db_printf("%02x", wk->wk_key[i]);
db_printf(">");
if (cip->ic_cipher != IEEE80211_CIPHER_WEP &&
wk->wk_keyrsc[IEEE80211_NONQOS_TID] != 0)
db_printf(" rsc %ju", (uintmax_t)wk->wk_keyrsc[IEEE80211_NONQOS_TID]);
if (cip->ic_cipher != IEEE80211_CIPHER_WEP &&
wk->wk_keytsc != 0)
db_printf(" tsc %ju", (uintmax_t)wk->wk_keytsc);
if (wk->wk_flags != 0) {
const char *sep = " ";
if (wk->wk_flags & IEEE80211_KEY_XMIT)
db_printf("%stx", sep), sep = "+";
if (wk->wk_flags & IEEE80211_KEY_RECV)
db_printf("%srx", sep), sep = "+";
if (wk->wk_flags & IEEE80211_KEY_DEFAULT)
db_printf("%sdef", sep), sep = "+";
if (wk->wk_flags & IEEE80211_KEY_SWCRYPT)
db_printf("%sswcrypt", sep), sep = "+";
if (wk->wk_flags & IEEE80211_KEY_SWMIC)
db_printf("%sswmic", sep), sep = "+";
}
db_printf("\n");
}
}
static void
printrate(const char *tag, int v)
{
if (v == IEEE80211_FIXED_RATE_NONE)
db_printf(" %s <none>", tag);
else if (v == 11)
db_printf(" %s 5.5", tag);
else if (v & IEEE80211_RATE_MCS)
db_printf(" %s MCS%d", tag, v &~ IEEE80211_RATE_MCS);
else
db_printf(" %s %d", tag, v/2);
}
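/*
* Rates above follow the net80211 convention: plain values are in units
* of 0.5 Mb/s and IEEE80211_RATE_MCS marks an HT MCS index. As a sketch
* with illustrative values:
*
*	printrate("mgmtrate", 11);			prints " mgmtrate 5.5"
*	printrate("ucastrate", 108);			prints " ucastrate 54"
*	printrate("ucastrate", IEEE80211_RATE_MCS | 7);	prints " ucastrate MCS7"
*/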
static void
_db_show_roamparams(const char *tag, const void *arg,
const struct ieee80211_roamparam *rp)
{
db_printf(tag, arg);
if (rp->rssi & 1)
db_printf(" rssi %u.5", rp->rssi/2);
else
db_printf(" rssi %u", rp->rssi/2);
printrate("rate", rp->rate);
}
static void
_db_show_txparams(const char *tag, const void *arg,
const struct ieee80211_txparam *tp)
{
db_printf(tag, arg);
printrate("ucastrate", tp->ucastrate);
printrate("mcastrate", tp->mcastrate);
printrate("mgmtrate", tp->mgmtrate);
db_printf(" maxretry %d", tp->maxretry);
}
static void
_db_show_stats(const struct ieee80211_stats *is)
{
}
#endif /* DDB */
Index: head/sys/netgraph/atm/ng_atm.c
===================================================================
--- head/sys/netgraph/atm/ng_atm.c (revision 183549)
+++ head/sys/netgraph/atm/ng_atm.c (revision 183550)
@@ -1,1434 +1,1449 @@
/*-
* Copyright (c) 2001-2003
* Fraunhofer Institute for Open Communication Systems (FhG Fokus).
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Author: Hartmut Brandt <harti@freebsd.org>
*/
/*
* Netgraph module to connect NATM interfaces to netgraph.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sbuf.h>
#include <sys/ioccom.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_arp.h>
#include <net/if_var.h>
#include <net/if_media.h>
#include <net/if_atm.h>
#include <netgraph/ng_message.h>
#include <netgraph/netgraph.h>
#include <netgraph/ng_parse.h>
#include <netgraph/atm/ng_atm.h>
/*
* Hooks in the NATM code
*/
extern void (*ng_atm_attach_p)(struct ifnet *);
extern void (*ng_atm_detach_p)(struct ifnet *);
extern int (*ng_atm_output_p)(struct ifnet *, struct mbuf **);
extern void (*ng_atm_input_p)(struct ifnet *, struct mbuf **,
struct atm_pseudohdr *, void *);
extern void (*ng_atm_input_orphan_p)(struct ifnet *, struct mbuf *,
struct atm_pseudohdr *, void *);
extern void (*ng_atm_event_p)(struct ifnet *, uint32_t, void *);
/*
* Sysctl stuff.
*/
SYSCTL_NODE(_net_graph, OID_AUTO, atm, CTLFLAG_RW, 0, "atm related stuff");
#ifdef NGATM_DEBUG
static int allow_shutdown;
SYSCTL_INT(_net_graph_atm, OID_AUTO, allow_shutdown, CTLFLAG_RW,
&allow_shutdown, 0, "allow ng_atm nodes to shutdown");
#endif
/*
* Hook private data
*/
struct ngvcc {
uint16_t vpi; /* VPI of this hook */
uint16_t vci; /* VCI of this hook, 0 if none */
uint32_t flags; /* private flags */
hook_p hook; /* the connected hook */
LIST_ENTRY(ngvcc) link;
};
#define VCC_OPEN 0x0001 /* open */
/*
* Node private data
*/
struct priv {
struct ifnet *ifp; /* the ATM interface */
hook_p input; /* raw input hook */
hook_p orphans; /* packets to nowhere */
hook_p output; /* catch output packets */
hook_p manage; /* has also entry in vccs */
uint64_t in_packets;
uint64_t in_errors;
uint64_t out_packets;
uint64_t out_errors;
LIST_HEAD(, ngvcc) vccs;
};
/*
* Parse ifstate change
*/
static const struct ng_parse_struct_field ng_atm_if_change_info[] =
NGM_ATM_IF_CHANGE_INFO;
static const struct ng_parse_type ng_atm_if_change_type = {
&ng_parse_struct_type,
&ng_atm_if_change_info
};
/*
* Parse vcc state change
*/
static const struct ng_parse_struct_field ng_atm_vcc_change_info[] =
NGM_ATM_VCC_CHANGE_INFO;
static const struct ng_parse_type ng_atm_vcc_change_type = {
&ng_parse_struct_type,
&ng_atm_vcc_change_info
};
/*
* Parse acr change
*/
static const struct ng_parse_struct_field ng_atm_acr_change_info[] =
NGM_ATM_ACR_CHANGE_INFO;
static const struct ng_parse_type ng_atm_acr_change_type = {
&ng_parse_struct_type,
&ng_atm_acr_change_info
};
/*
* Parse the configuration structure ng_atm_config
*/
static const struct ng_parse_struct_field ng_atm_config_type_info[] =
NGM_ATM_CONFIG_INFO;
static const struct ng_parse_type ng_atm_config_type = {
&ng_parse_struct_type,
&ng_atm_config_type_info
};
/*
* Parse a single vcc structure and a variable-length array of these (ng_atm_vccs)
*/
static const struct ng_parse_struct_field ng_atm_tparam_type_info[] =
NGM_ATM_TPARAM_INFO;
static const struct ng_parse_type ng_atm_tparam_type = {
&ng_parse_struct_type,
&ng_atm_tparam_type_info
};
static const struct ng_parse_struct_field ng_atm_vcc_type_info[] =
NGM_ATM_VCC_INFO;
static const struct ng_parse_type ng_atm_vcc_type = {
&ng_parse_struct_type,
&ng_atm_vcc_type_info
};
static int
ng_atm_vccarray_getlen(const struct ng_parse_type *type,
const u_char *start, const u_char *buf)
{
const struct atmio_vcctable *vp;
vp = (const struct atmio_vcctable *)
(buf - offsetof(struct atmio_vcctable, vccs));
return (vp->count);
}
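/*
* The length is not passed in explicitly: "buf" points at the vccs[]
* member, so stepping back by offsetof() recovers the enclosing
* atmio_vcctable and its count - the usual container-of idiom, i.e. the
* cast above is equivalent to this sketch:
*
*	vp = (const struct atmio_vcctable *)
*	    ((const char *)buf - offsetof(struct atmio_vcctable, vccs));
*/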
static const struct ng_parse_array_info ng_atm_vccarray_info =
NGM_ATM_VCCARRAY_INFO;
static const struct ng_parse_type ng_atm_vccarray_type = {
&ng_parse_array_type,
&ng_atm_vccarray_info
};
static const struct ng_parse_struct_field ng_atm_vcctable_type_info[] =
NGM_ATM_VCCTABLE_INFO;
static const struct ng_parse_type ng_atm_vcctable_type = {
&ng_parse_struct_type,
&ng_atm_vcctable_type_info
};
/*
* Parse CPCS INIT structure ng_atm_cpcs_init
*/
static const struct ng_parse_struct_field ng_atm_cpcs_init_type_info[] =
NGM_ATM_CPCS_INIT_INFO;
static const struct ng_parse_type ng_atm_cpcs_init_type = {
&ng_parse_struct_type,
&ng_atm_cpcs_init_type_info
};
/*
* Parse CPCS TERM structure ng_atm_cpcs_term
*/
static const struct ng_parse_struct_field ng_atm_cpcs_term_type_info[] =
NGM_ATM_CPCS_TERM_INFO;
static const struct ng_parse_type ng_atm_cpcs_term_type = {
&ng_parse_struct_type,
&ng_atm_cpcs_term_type_info
};
/*
* Parse statistic struct
*/
static const struct ng_parse_struct_field ng_atm_stats_type_info[] =
NGM_ATM_STATS_INFO;
static const struct ng_parse_type ng_atm_stats_type = {
&ng_parse_struct_type,
&ng_atm_stats_type_info
};
static const struct ng_cmdlist ng_atm_cmdlist[] = {
{
NGM_ATM_COOKIE,
NGM_ATM_GET_IFNAME,
"getifname",
NULL,
&ng_parse_string_type
},
{
NGM_ATM_COOKIE,
NGM_ATM_GET_CONFIG,
"getconfig",
NULL,
&ng_atm_config_type
},
{
NGM_ATM_COOKIE,
NGM_ATM_GET_VCCS,
"getvccs",
NULL,
&ng_atm_vcctable_type
},
{
NGM_ATM_COOKIE,
NGM_ATM_CPCS_INIT,
"cpcsinit",
&ng_atm_cpcs_init_type,
NULL
},
{
NGM_ATM_COOKIE,
NGM_ATM_CPCS_TERM,
"cpcsterm",
&ng_atm_cpcs_term_type,
NULL
},
{
NGM_ATM_COOKIE,
NGM_ATM_GET_VCC,
"getvcc",
&ng_parse_hookbuf_type,
&ng_atm_vcc_type
},
{
NGM_ATM_COOKIE,
NGM_ATM_GET_VCCID,
"getvccid",
&ng_atm_vcc_type,
&ng_atm_vcc_type
},
{
NGM_ATM_COOKIE,
NGM_ATM_GET_STATS,
"getstats",
NULL,
&ng_atm_stats_type
},
/* events */
{
NGM_ATM_COOKIE,
NGM_ATM_IF_CHANGE,
"if_change",
&ng_atm_if_change_type,
&ng_atm_if_change_type,
},
{
NGM_ATM_COOKIE,
NGM_ATM_VCC_CHANGE,
"vcc_change",
&ng_atm_vcc_change_type,
&ng_atm_vcc_change_type,
},
{
NGM_ATM_COOKIE,
NGM_ATM_ACR_CHANGE,
"acr_change",
&ng_atm_acr_change_type,
&ng_atm_acr_change_type,
},
{ 0 }
};
static int ng_atm_mod_event(module_t, int, void *);
static ng_constructor_t ng_atm_constructor;
static ng_shutdown_t ng_atm_shutdown;
static ng_rcvmsg_t ng_atm_rcvmsg;
static ng_newhook_t ng_atm_newhook;
static ng_connect_t ng_atm_connect;
static ng_disconnect_t ng_atm_disconnect;
static ng_rcvdata_t ng_atm_rcvdata;
static ng_rcvdata_t ng_atm_rcvdrop;
static struct ng_type ng_atm_typestruct = {
.version = NG_ABI_VERSION,
.name = NG_ATM_NODE_TYPE,
.mod_event = ng_atm_mod_event,
.constructor = ng_atm_constructor,
.rcvmsg = ng_atm_rcvmsg,
.shutdown = ng_atm_shutdown,
.newhook = ng_atm_newhook,
.connect = ng_atm_connect,
.rcvdata = ng_atm_rcvdata,
.disconnect = ng_atm_disconnect,
.cmdlist = ng_atm_cmdlist,
};
NETGRAPH_INIT(atm, &ng_atm_typestruct);
static const struct {
u_int media;
const char *name;
} atmmedia[] = IFM_SUBTYPE_ATM_DESCRIPTIONS;
#define IFP2NG(IFP) ((node_p)((struct ifatm *)(IFP)->if_softc)->ngpriv)
#define IFP2NG_SET(IFP, val) (((struct ifatm *)(IFP)->if_softc)->ngpriv = (val))
#define IFFLAGS "\020\001UP\002BROADCAST\003DEBUG\004LOOPBACK" \
"\005POINTOPOINT\006SMART\007RUNNING\010NOARP" \
"\011PROMISC\012ALLMULTI\013OACTIVE\014SIMPLEX" \
"\015LINK0\016LINK1\017LINK2\020MULTICAST"
/************************************************************/
/*
* INPUT
*/
/*
* A packet is received from an interface.
* If we have an input hook, prepend the pseudoheader to the data and
* deliver it out to that hook. If not, look whether it is destined for
* us. If so, locate the appropriate hook, deliver the packet without the
* header, and we are done. If it is not for us, leave it alone.
*/
static void
ng_atm_input(struct ifnet *ifp, struct mbuf **mp,
struct atm_pseudohdr *ah, void *rxhand)
{
node_p node = IFP2NG(ifp);
struct priv *priv;
const struct ngvcc *vcc;
int error;
if (node == NULL)
return;
priv = NG_NODE_PRIVATE(node);
if (priv->input != NULL) {
/*
* Prepend the atm_pseudoheader.
*/
M_PREPEND(*mp, sizeof(*ah), M_DONTWAIT);
if (*mp == NULL)
return;
memcpy(mtod(*mp, struct atm_pseudohdr *), ah, sizeof(*ah));
NG_SEND_DATA_ONLY(error, priv->input, *mp);
if (error == 0) {
priv->in_packets++;
*mp = NULL;
} else {
#ifdef NGATM_DEBUG
printf("%s: error=%d\n", __func__, error);
#endif
priv->in_errors++;
}
return;
}
if ((ATM_PH_FLAGS(ah) & ATMIO_FLAG_NG) == 0)
return;
vcc = (struct ngvcc *)rxhand;
NG_SEND_DATA_ONLY(error, vcc->hook, *mp);
if (error == 0) {
priv->in_packets++;
*mp = NULL;
} else {
#ifdef NGATM_DEBUG
printf("%s: error=%d\n", __func__, error);
#endif
priv->in_errors++;
}
}
/*
* ATM packet is about to be output. The atm_pseudohdr is already prepended.
* If the hook is set, reroute the packet to the hook.
*/
static int
ng_atm_output(struct ifnet *ifp, struct mbuf **mp)
{
const node_p node = IFP2NG(ifp);
const struct priv *priv;
int error = 0;
if (node == NULL)
return (0);
priv = NG_NODE_PRIVATE(node);
if (priv->output) {
NG_SEND_DATA_ONLY(error, priv->output, *mp);
*mp = NULL;
}
return (error);
}
/*
* Well, this doesn't make much sense for ATM.
*/
static void
ng_atm_input_orphans(struct ifnet *ifp, struct mbuf *m,
struct atm_pseudohdr *ah, void *rxhand)
{
node_p node = IFP2NG(ifp);
struct priv *priv;
int error;
if (node == NULL) {
m_freem(m);
return;
}
priv = NG_NODE_PRIVATE(node);
if (priv->orphans == NULL) {
m_freem(m);
return;
}
/*
* Prepend the atm_pseudoheader.
*/
M_PREPEND(m, sizeof(*ah), M_DONTWAIT);
if (m == NULL)
return;
memcpy(mtod(m, struct atm_pseudohdr *), ah, sizeof(*ah));
NG_SEND_DATA_ONLY(error, priv->orphans, m);
if (error == 0)
priv->in_packets++;
else {
priv->in_errors++;
#ifdef NGATM_DEBUG
printf("%s: error=%d\n", __func__, error);
#endif
}
}
/************************************************************/
/*
* OUTPUT
*/
static int
ng_atm_rcvdata(hook_p hook, item_p item)
{
node_p node = NG_HOOK_NODE(hook);
struct priv *priv = NG_NODE_PRIVATE(node);
const struct ngvcc *vcc = NG_HOOK_PRIVATE(hook);
struct mbuf *m;
struct atm_pseudohdr *aph;
int error;
if (vcc->vci == 0) {
NG_FREE_ITEM(item);
return (ENOTCONN);
}
NGI_GET_M(item, m);
NG_FREE_ITEM(item);
/*
* Prepend pseudo-hdr. Drivers don't care about the flags.
*/
M_PREPEND(m, sizeof(*aph), M_DONTWAIT);
if (m == NULL) {
NG_FREE_M(m);
return (ENOMEM);
}
aph = mtod(m, struct atm_pseudohdr *);
ATM_PH_VPI(aph) = vcc->vpi;
ATM_PH_SETVCI(aph, vcc->vci);
ATM_PH_FLAGS(aph) = 0;
if ((error = atm_output(priv->ifp, m, NULL, NULL)) == 0)
priv->out_packets++;
else
priv->out_errors++;
return (error);
}
static int
ng_atm_rcvdrop(hook_p hook, item_p item)
{
NG_FREE_ITEM(item);
return (0);
}
/************************************************************
*
* Event from driver.
*/
static void
ng_atm_event_func(node_p node, hook_p hook, void *arg, int event)
{
const struct priv *priv = NG_NODE_PRIVATE(node);
struct ngvcc *vcc;
struct ng_mesg *mesg;
int error;
switch (event) {
case ATMEV_FLOW_CONTROL:
{
struct atmev_flow_control *ev = arg;
struct ngm_queue_state *qstate;
/* find the connection */
LIST_FOREACH(vcc, &priv->vccs, link)
if (vcc->vci == ev->vci && vcc->vpi == ev->vpi)
break;
if (vcc == NULL)
break;
/* convert into a flow control message */
NG_MKMESSAGE(mesg, NGM_FLOW_COOKIE,
ev->busy ? NGM_HIGH_WATER_PASSED : NGM_LOW_WATER_PASSED,
sizeof(struct ngm_queue_state), M_NOWAIT);
if (mesg == NULL)
break;
qstate = (struct ngm_queue_state *)mesg->data;
/* XXX have to figure out how to get that info */
NG_SEND_MSG_HOOK(error, node, mesg, vcc->hook, 0);
break;
}
case ATMEV_VCC_CHANGED:
{
struct atmev_vcc_changed *ev = arg;
struct ngm_atm_vcc_change *chg;
if (priv->manage == NULL)
break;
NG_MKMESSAGE(mesg, NGM_ATM_COOKIE, NGM_ATM_VCC_CHANGE,
sizeof(struct ngm_atm_vcc_change), M_NOWAIT);
if (mesg == NULL)
break;
chg = (struct ngm_atm_vcc_change *)mesg->data;
chg->vci = ev->vci;
chg->vpi = ev->vpi;
chg->state = (ev->up != 0);
chg->node = NG_NODE_ID(node);
NG_SEND_MSG_HOOK(error, node, mesg, priv->manage, 0);
break;
}
case ATMEV_IFSTATE_CHANGED:
{
struct atmev_ifstate_changed *ev = arg;
struct ngm_atm_if_change *chg;
if (priv->manage == NULL)
break;
NG_MKMESSAGE(mesg, NGM_ATM_COOKIE, NGM_ATM_IF_CHANGE,
sizeof(struct ngm_atm_if_change), M_NOWAIT);
if (mesg == NULL)
break;
chg = (struct ngm_atm_if_change *)mesg->data;
chg->carrier = (ev->carrier != 0);
chg->running = (ev->running != 0);
chg->node = NG_NODE_ID(node);
NG_SEND_MSG_HOOK(error, node, mesg, priv->manage, 0);
break;
}
case ATMEV_ACR_CHANGED:
{
struct atmev_acr_changed *ev = arg;
struct ngm_atm_acr_change *acr;
/* find the connection */
LIST_FOREACH(vcc, &priv->vccs, link)
if (vcc->vci == ev->vci && vcc->vpi == ev->vpi)
break;
if (vcc == NULL)
break;
/* convert into a flow control message */
NG_MKMESSAGE(mesg, NGM_ATM_COOKIE, NGM_ATM_ACR_CHANGE,
sizeof(struct ngm_atm_acr_change), M_NOWAIT);
if (mesg == NULL)
break;
acr = (struct ngm_atm_acr_change *)mesg->data;
acr->node = NG_NODE_ID(node);
acr->vci = ev->vci;
acr->vpi = ev->vpi;
acr->acr = ev->acr;
NG_SEND_MSG_HOOK(error, node, mesg, vcc->hook, 0);
break;
}
}
}
/*
* Use send_fn to get the right lock
*/
static void
ng_atm_event(struct ifnet *ifp, uint32_t event, void *arg)
{
const node_p node = IFP2NG(ifp);
if (node != NULL)
/* may happen during attach/detach */
(void)ng_send_fn(node, NULL, ng_atm_event_func, arg, event);
}
/************************************************************
*
* CPCS
*/
/*
* Open a channel for the user
*/
static int
ng_atm_cpcs_init(node_p node, const struct ngm_atm_cpcs_init *arg)
{
struct priv *priv = NG_NODE_PRIVATE(node);
const struct ifatm_mib *mib;
struct ngvcc *vcc;
struct atmio_openvcc data;
int err;
if (priv->ifp->if_ioctl == NULL)
return (ENXIO);
mib = (const struct ifatm_mib *)(priv->ifp->if_linkmib);
LIST_FOREACH(vcc, &priv->vccs, link)
if (strcmp(arg->name, NG_HOOK_NAME(vcc->hook)) == 0)
break;
if (vcc == NULL)
return (ENOTCONN);
if (vcc->flags & VCC_OPEN)
return (EISCONN);
/*
* Check user arguments and construct ioctl argument
*/
memset(&data, 0, sizeof(data));
data.rxhand = vcc;
switch (data.param.aal = arg->aal) {
case ATMIO_AAL_34:
case ATMIO_AAL_5:
case ATMIO_AAL_0:
case ATMIO_AAL_RAW:
break;
default:
return (EINVAL);
}
if (arg->vpi > 0xff)
return (EINVAL);
data.param.vpi = arg->vpi;
/* allow 0.0 as catch all receive channel */
if (arg->vci == 0 && (arg->vpi != 0 || !(arg->flags & ATMIO_FLAG_NOTX)))
return (EINVAL);
data.param.vci = arg->vci;
data.param.tparam.pcr = arg->pcr;
if (arg->mcr > arg->pcr)
return (EINVAL);
data.param.tparam.mcr = arg->mcr;
if (!(arg->flags & ATMIO_FLAG_NOTX)) {
if (arg->tmtu == 0)
data.param.tmtu = priv->ifp->if_mtu;
else {
data.param.tmtu = arg->tmtu;
}
}
if (!(arg->flags & ATMIO_FLAG_NORX)) {
if (arg->rmtu == 0)
data.param.rmtu = priv->ifp->if_mtu;
else {
data.param.rmtu = arg->rmtu;
}
}
switch (data.param.traffic = arg->traffic) {
case ATMIO_TRAFFIC_UBR:
case ATMIO_TRAFFIC_CBR:
break;
case ATMIO_TRAFFIC_VBR:
if (arg->scr > arg->pcr)
return (EINVAL);
data.param.tparam.scr = arg->scr;
if (arg->mbs > (1 << 24))
return (EINVAL);
data.param.tparam.mbs = arg->mbs;
break;
case ATMIO_TRAFFIC_ABR:
if (arg->icr > arg->pcr || arg->icr < arg->mcr)
return (EINVAL);
data.param.tparam.icr = arg->icr;
if (arg->tbe == 0 || arg->tbe > (1 << 24))
return (EINVAL);
data.param.tparam.tbe = arg->tbe;
if (arg->nrm > 0x7)
return (EINVAL);
data.param.tparam.nrm = arg->nrm;
if (arg->trm > 0x7)
return (EINVAL);
data.param.tparam.trm = arg->trm;
if (arg->adtf > 0x3ff)
return (EINVAL);
data.param.tparam.adtf = arg->adtf;
if (arg->rif > 0xf)
return (EINVAL);
data.param.tparam.rif = arg->rif;
if (arg->rdf > 0xf)
return (EINVAL);
data.param.tparam.rdf = arg->rdf;
if (arg->cdf > 0x7)
return (EINVAL);
data.param.tparam.cdf = arg->cdf;
break;
default:
return (EINVAL);
}
if ((arg->flags & ATMIO_FLAG_NORX) && (arg->flags & ATMIO_FLAG_NOTX))
return (EINVAL);
data.param.flags = arg->flags & ~(ATM_PH_AAL5 | ATM_PH_LLCSNAP);
data.param.flags |= ATMIO_FLAG_NG;
err = (*priv->ifp->if_ioctl)(priv->ifp, SIOCATMOPENVCC, (caddr_t)&data);
if (err == 0) {
vcc->vci = data.param.vci;
vcc->vpi = data.param.vpi;
vcc->flags = VCC_OPEN;
}
return (err);
}
/*
* Issue the close command to the driver
*/
static int
cpcs_term(const struct priv *priv, u_int vpi, u_int vci)
{
struct atmio_closevcc data;
if (priv->ifp->if_ioctl == NULL)
return ENXIO;
data.vpi = vpi;
data.vci = vci;
return ((*priv->ifp->if_ioctl)(priv->ifp,
SIOCATMCLOSEVCC, (caddr_t)&data));
}
/*
* Close a channel by request of the user
*/
static int
ng_atm_cpcs_term(node_p node, const struct ngm_atm_cpcs_term *arg)
{
struct priv *priv = NG_NODE_PRIVATE(node);
struct ngvcc *vcc;
int error;
LIST_FOREACH(vcc, &priv->vccs, link)
if (strcmp(arg->name, NG_HOOK_NAME(vcc->hook)) == 0)
break;
if (vcc == NULL)
return (ENOTCONN);
if (!(vcc->flags & VCC_OPEN))
return (ENOTCONN);
error = cpcs_term(priv, vcc->vpi, vcc->vci);
vcc->vci = 0;
vcc->vpi = 0;
vcc->flags = 0;
return (error);
}
/************************************************************/
/*
* CONTROL MESSAGES
*/
/*
* Produce a textual description of the current status
*/
static int
text_status(node_p node, char *arg, u_int len)
{
const struct priv *priv = NG_NODE_PRIVATE(node);
const struct ifatm_mib *mib;
struct sbuf sbuf;
u_int i;
static const struct {
const char *name;
const char *vendor;
} devices[] = {
ATM_DEVICE_NAMES
};
mib = (const struct ifatm_mib *)(priv->ifp->if_linkmib);
sbuf_new(&sbuf, arg, len, SBUF_FIXEDLEN);
sbuf_printf(&sbuf, "interface: %s\n", priv->ifp->if_xname);
if (mib->device >= sizeof(devices) / sizeof(devices[0]))
sbuf_printf(&sbuf, "device=unknown\nvendor=unknown\n");
else
sbuf_printf(&sbuf, "device=%s\nvendor=%s\n",
devices[mib->device].name, devices[mib->device].vendor);
for (i = 0; atmmedia[i].name; i++)
if (mib->media == atmmedia[i].media) {
sbuf_printf(&sbuf, "media=%s\n", atmmedia[i].name);
break;
}
if (atmmedia[i].name == NULL)
sbuf_printf(&sbuf, "media=unknown\n");
sbuf_printf(&sbuf, "serial=%u esi=%6D hardware=%u software=%u\n",
mib->serial, mib->esi, ":", mib->hw_version, mib->sw_version);
sbuf_printf(&sbuf, "pcr=%u vpi_bits=%u vci_bits=%u max_vpcs=%u "
"max_vccs=%u\n", mib->pcr, mib->vpi_bits, mib->vci_bits,
mib->max_vpcs, mib->max_vccs);
sbuf_printf(&sbuf, "ifflags=%b\n", priv->ifp->if_flags, IFFLAGS);
sbuf_finish(&sbuf);
return (sbuf_len(&sbuf));
}
/*
* Get control message
*/
static int
ng_atm_rcvmsg(node_p node, item_p item, hook_p lasthook)
{
const struct priv *priv = NG_NODE_PRIVATE(node);
struct ng_mesg *resp = NULL;
struct ng_mesg *msg;
struct ifatm_mib *mib = (struct ifatm_mib *)(priv->ifp->if_linkmib);
int error = 0;
NGI_GET_MSG(item, msg);
switch (msg->header.typecookie) {
case NGM_GENERIC_COOKIE:
switch (msg->header.cmd) {
case NGM_TEXT_STATUS:
NG_MKRESPONSE(resp, msg, NG_TEXTRESPONSE, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
resp->header.arglen = text_status(node,
(char *)resp->data, resp->header.arglen) + 1;
break;
default:
error = EINVAL;
break;
}
break;
case NGM_ATM_COOKIE:
switch (msg->header.cmd) {
case NGM_ATM_GET_IFNAME:
NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
strlcpy(resp->data, priv->ifp->if_xname, IFNAMSIZ);
break;
case NGM_ATM_GET_CONFIG:
{
struct ngm_atm_config *config;
NG_MKRESPONSE(resp, msg, sizeof(*config), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
config = (struct ngm_atm_config *)resp->data;
config->pcr = mib->pcr;
config->vpi_bits = mib->vpi_bits;
config->vci_bits = mib->vci_bits;
config->max_vpcs = mib->max_vpcs;
config->max_vccs = mib->max_vccs;
break;
}
case NGM_ATM_GET_VCCS:
{
struct atmio_vcctable *vccs;
size_t len;
if (priv->ifp->if_ioctl == NULL) {
error = ENXIO;
break;
}
error = (*priv->ifp->if_ioctl)(priv->ifp,
SIOCATMGETVCCS, (caddr_t)&vccs);
if (error)
break;
len = sizeof(*vccs) +
vccs->count * sizeof(vccs->vccs[0]);
NG_MKRESPONSE(resp, msg, len, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
free(vccs, M_DEVBUF);
break;
}
(void)memcpy(resp->data, vccs, len);
free(vccs, M_DEVBUF);
break;
}
case NGM_ATM_GET_VCC:
{
char hook[NG_HOOKSIZ];
struct atmio_vcctable *vccs;
struct ngvcc *vcc;
u_int i;
if (priv->ifp->if_ioctl == NULL) {
error = ENXIO;
break;
}
if (msg->header.arglen != NG_HOOKSIZ) {
error = EINVAL;
break;
}
strncpy(hook, msg->data, NG_HOOKSIZ);
hook[NG_HOOKSIZ - 1] = '\0';
LIST_FOREACH(vcc, &priv->vccs, link)
if (strcmp(NG_HOOK_NAME(vcc->hook), hook) == 0)
break;
if (vcc == NULL) {
error = ENOTCONN;
break;
}
error = (*priv->ifp->if_ioctl)(priv->ifp,
SIOCATMGETVCCS, (caddr_t)&vccs);
if (error)
break;
for (i = 0; i < vccs->count; i++)
if (vccs->vccs[i].vpi == vcc->vpi &&
vccs->vccs[i].vci == vcc->vci)
break;
if (i == vccs->count) {
error = ENOTCONN;
free(vccs, M_DEVBUF);
break;
}
NG_MKRESPONSE(resp, msg, sizeof(vccs->vccs[0]),
M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
free(vccs, M_DEVBUF);
break;
}
*(struct atmio_vcc *)resp->data = vccs->vccs[i];
free(vccs, M_DEVBUF);
break;
}
case NGM_ATM_GET_VCCID:
{
struct atmio_vcc *arg;
struct atmio_vcctable *vccs;
u_int i;
if (priv->ifp->if_ioctl == NULL) {
error = ENXIO;
break;
}
if (msg->header.arglen != sizeof(*arg)) {
error = EINVAL;
break;
}
arg = (struct atmio_vcc *)msg->data;
error = (*priv->ifp->if_ioctl)(priv->ifp,
SIOCATMGETVCCS, (caddr_t)&vccs);
if (error)
break;
for (i = 0; i < vccs->count; i++)
if (vccs->vccs[i].vpi == arg->vpi &&
vccs->vccs[i].vci == arg->vci)
break;
if (i == vccs->count) {
error = ENOTCONN;
free(vccs, M_DEVBUF);
break;
}
NG_MKRESPONSE(resp, msg, sizeof(vccs->vccs[0]),
M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
free(vccs, M_DEVBUF);
break;
}
*(struct atmio_vcc *)resp->data = vccs->vccs[i];
free(vccs, M_DEVBUF);
break;
}
case NGM_ATM_CPCS_INIT:
if (msg->header.arglen !=
sizeof(struct ngm_atm_cpcs_init)) {
error = EINVAL;
break;
}
error = ng_atm_cpcs_init(node,
(struct ngm_atm_cpcs_init *)msg->data);
break;
case NGM_ATM_CPCS_TERM:
if (msg->header.arglen !=
sizeof(struct ngm_atm_cpcs_term)) {
error = EINVAL;
break;
}
error = ng_atm_cpcs_term(node,
(struct ngm_atm_cpcs_term *)msg->data);
break;
case NGM_ATM_GET_STATS:
{
struct ngm_atm_stats *p;
if (msg->header.arglen != 0) {
error = EINVAL;
break;
}
NG_MKRESPONSE(resp, msg, sizeof(*p), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
p = (struct ngm_atm_stats *)resp->data;
p->in_packets = priv->in_packets;
p->out_packets = priv->out_packets;
p->in_errors = priv->in_errors;
p->out_errors = priv->out_errors;
break;
}
default:
error = EINVAL;
break;
}
break;
default:
error = EINVAL;
break;
}
NG_RESPOND_MSG(error, node, item, resp);
NG_FREE_MSG(msg);
return (error);
}
/************************************************************/
/*
* HOOK MANAGEMENT
*/
/*
* A new hook has been created and will be connected to the node.
* Check whether the name is one of the predefined ones.
* If not, create a new entry in the vcc list.
*/
static int
ng_atm_newhook(node_p node, hook_p hook, const char *name)
{
struct priv *priv = NG_NODE_PRIVATE(node);
struct ngvcc *vcc;
if (strcmp(name, "input") == 0) {
priv->input = hook;
NG_HOOK_SET_RCVDATA(hook, ng_atm_rcvdrop);
return (0);
}
if (strcmp(name, "output") == 0) {
priv->output = hook;
NG_HOOK_SET_RCVDATA(hook, ng_atm_rcvdrop);
return (0);
}
if (strcmp(name, "orphans") == 0) {
priv->orphans = hook;
NG_HOOK_SET_RCVDATA(hook, ng_atm_rcvdrop);
return (0);
}
/*
* Allocate a new entry
*/
vcc = malloc(sizeof(*vcc), M_NETGRAPH, M_NOWAIT | M_ZERO);
if (vcc == NULL)
return (ENOMEM);
vcc->hook = hook;
NG_HOOK_SET_PRIVATE(hook, vcc);
LIST_INSERT_HEAD(&priv->vccs, vcc, link);
if (strcmp(name, "manage") == 0)
priv->manage = hook;
return (0);
}
/*
* Connect. Set the peer to queuing.
*/
static int
ng_atm_connect(hook_p hook)
{
if (NG_HOOK_PRIVATE(hook) != NULL)
NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook));
return (0);
}
/*
* Disconnect a HOOK
*/
static int
ng_atm_disconnect(hook_p hook)
{
struct priv *priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
struct ngvcc *vcc = NG_HOOK_PRIVATE(hook);
if (vcc == NULL) {
if (hook == priv->output) {
priv->output = NULL;
return (0);
}
if (hook == priv->input) {
priv->input = NULL;
return (0);
}
if (hook == priv->orphans) {
priv->orphans = NULL;
return (0);
}
log(LOG_ERR, "ng_atm: bad hook '%s'", NG_HOOK_NAME(hook));
return (0);
}
/* don't terminate if we are detaching from the interface */
if ((vcc->flags & VCC_OPEN) && priv->ifp != NULL)
(void)cpcs_term(priv, vcc->vpi, vcc->vci);
NG_HOOK_SET_PRIVATE(hook, NULL);
LIST_REMOVE(vcc, link);
free(vcc, M_NETGRAPH);
if (hook == priv->manage)
priv->manage = NULL;
return (0);
}
/************************************************************/
/*
* NODE MANAGEMENT
*/
/*
* ATM interface attached - create a node and name it like the interface.
*/
static void
ng_atm_attach(struct ifnet *ifp)
{
node_p node;
struct priv *priv;
KASSERT(IFP2NG(ifp) == 0, ("%s: node already exists?", __func__));
if (ng_make_node_common(&ng_atm_typestruct, &node) != 0) {
log(LOG_ERR, "%s: can't create node for %s\n",
__func__, ifp->if_xname);
return;
}
priv = malloc(sizeof(*priv), M_NETGRAPH, M_NOWAIT | M_ZERO);
if (priv == NULL) {
log(LOG_ERR, "%s: can't allocate memory for %s\n",
__func__, ifp->if_xname);
NG_NODE_UNREF(node);
return;
}
NG_NODE_SET_PRIVATE(node, priv);
priv->ifp = ifp;
LIST_INIT(&priv->vccs);
IFP2NG_SET(ifp, node);
if (ng_name_node(node, ifp->if_xname) != 0) {
log(LOG_WARNING, "%s: can't name node %s\n",
__func__, ifp->if_xname);
}
}
/*
* ATM interface detached - destroy node.
*/
static void
ng_atm_detach(struct ifnet *ifp)
{
const node_p node = IFP2NG(ifp);
struct priv *priv;
if (node == NULL)
return;
NG_NODE_REALLY_DIE(node);
priv = NG_NODE_PRIVATE(node);
IFP2NG_SET(priv->ifp, NULL);
priv->ifp = NULL;
ng_rmnode_self(node);
}
/*
* Shutdown the node. This is called from the shutdown message processing.
*/
static int
ng_atm_shutdown(node_p node)
{
struct priv *priv = NG_NODE_PRIVATE(node);
if (node->nd_flags & NGF_REALLY_DIE) {
/*
* We are called from unloading the ATM driver. Really,
* really need to shutdown this node. The ifp was
* already handled in the detach routine.
*/
NG_NODE_SET_PRIVATE(node, NULL);
free(priv, M_NETGRAPH);
NG_NODE_UNREF(node);
return (0);
}
#ifdef NGATM_DEBUG
if (!allow_shutdown)
NG_NODE_REVIVE(node); /* we persist */
else {
IFP2NG_SET(priv->ifp, NULL);
NG_NODE_SET_PRIVATE(node, NULL);
free(priv, M_NETGRAPH);
NG_NODE_UNREF(node);
}
#else
/*
* We are persistent - reinitialize
*/
NG_NODE_REVIVE(node);
#endif
return (0);
}
/*
* Nodes are constructed only via interface attaches.
*/
static int
ng_atm_constructor(node_p nodep)
{
return (EINVAL);
}
/************************************************************/
/*
* INITIALISATION
*/
/*
* Loading and unloading of node type
*
* The assignments to the globals for the hooks should be ok without
* a special hook. The use pattern is generally: check that the pointer
* is not NULL, then call the function. In the attach case this is no problem.
* In the detach case we can detach only when no ATM node exists. That
* means that there is no ATM interface anymore. So we are sure that
* we are not in the code path in if_atmsubr.c. To prevent someone
* from adding an interface after we have started to unload the node, we
* take the iflist lock so an if_attach will be blocked until we are done.
* XXX: perhaps the function pointers should be 'volatile' for this to work
* properly.
*/
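/*
* The guarded-call pattern used in if_atmsubr.c is, as a sketch
* (argument names assumed for illustration):
*
*	if (ng_atm_input_p != NULL)
*		(*ng_atm_input_p)(ifp, &m, &ah, rxhand);
*/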
static int
ng_atm_mod_event(module_t mod, int event, void *data)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
int error = 0;
switch (event) {
case MOD_LOAD:
/*
* Register function hooks
*/
if (ng_atm_attach_p != NULL) {
error = EEXIST;
break;
}
IFNET_RLOCK();
ng_atm_attach_p = ng_atm_attach;
ng_atm_detach_p = ng_atm_detach;
ng_atm_output_p = ng_atm_output;
ng_atm_input_p = ng_atm_input;
ng_atm_input_orphan_p = ng_atm_input_orphans;
ng_atm_event_p = ng_atm_event;
/* Create nodes for existing ATM interfaces */
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
- if (ifp->if_type == IFT_ATM)
- ng_atm_attach(ifp);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET_QUIET(vnet_iter);
+ INIT_VNET_NET(vnet_iter);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ if (ifp->if_type == IFT_ATM)
+ ng_atm_attach(ifp);
+ }
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IFNET_RUNLOCK();
break;
case MOD_UNLOAD:
IFNET_RLOCK();
ng_atm_attach_p = NULL;
ng_atm_detach_p = NULL;
ng_atm_output_p = NULL;
ng_atm_input_p = NULL;
ng_atm_input_orphan_p = NULL;
ng_atm_event_p = NULL;
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
- if (ifp->if_type == IFT_ATM)
- ng_atm_detach(ifp);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET_QUIET(vnet_iter);
+ INIT_VNET_NET(vnet_iter);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ if (ifp->if_type == IFT_ATM)
+ ng_atm_detach(ifp);
+ }
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IFNET_RUNLOCK();
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
Index: head/sys/netgraph/netgraph.h
===================================================================
--- head/sys/netgraph/netgraph.h (revision 183549)
+++ head/sys/netgraph/netgraph.h (revision 183550)
@@ -1,1187 +1,1206 @@
/*
* netgraph.h
*/
/*-
* Copyright (c) 1996-1999 Whistle Communications, Inc.
* All rights reserved.
*
* Subject to the following obligations and disclaimer of warranty, use and
* redistribution of this software, in source or object code forms, with or
* without modifications are expressly permitted by Whistle Communications;
* provided, however, that:
* 1. Any and all reproductions of the source or object code must include the
* copyright notice above and the following disclaimer of warranties; and
* 2. No rights are granted, in any manner or form, to use Whistle
* Communications, Inc. trademarks, including the mark "WHISTLE
* COMMUNICATIONS" on advertising, endorsements, or otherwise except as
* such appears in the above copyright notice or in the software.
*
* THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
* TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
* REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
* INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
* REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
* SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
* IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
* RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
* WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
* OF SUCH DAMAGE.
*
* Author: Julian Elischer <julian@freebsd.org>
*
* $FreeBSD$
* $Whistle: netgraph.h,v 1.29 1999/11/01 07:56:13 julian Exp $
*/
#ifndef _NETGRAPH_NETGRAPH_H_
#define _NETGRAPH_NETGRAPH_H_
#ifndef _KERNEL
#error "This file should not be included in user level programs"
#endif
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#ifdef HAVE_KERNEL_OPTION_HEADERS
#include "opt_netgraph.h"
#endif
/* debugging options */
#define NG_SEPARATE_MALLOC /* make modules use their own malloc types */
/*
* This defines the in-kernel binary interface version.
* It is possible to change this but leave the external message
* API the same. Each type also has its own cookies for versioning as well.
* Change it for the NETGRAPH_DEBUG version so we cannot mix debug and
* non-debug modules.
*/
#define _NG_ABI_VERSION 12
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
#define NG_ABI_VERSION (_NG_ABI_VERSION + 0x10000)
#else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NG_ABI_VERSION _NG_ABI_VERSION
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
/*
* Forward references for the basic structures so we can
* define the typedefs and use them in the structures themselves.
*/
struct ng_hook;
struct ng_node;
struct ng_item;
typedef struct ng_item *item_p;
typedef struct ng_node *node_p;
typedef struct ng_hook *hook_p;
/* node method definitions */
typedef int ng_constructor_t(node_p node);
typedef int ng_close_t(node_p node);
typedef int ng_shutdown_t(node_p node);
typedef int ng_newhook_t(node_p node, hook_p hook, const char *name);
typedef hook_p ng_findhook_t(node_p node, const char *name);
typedef int ng_connect_t(hook_p hook);
typedef int ng_rcvmsg_t(node_p node, item_p item, hook_p lasthook);
typedef int ng_rcvdata_t(hook_p hook, item_p item);
typedef int ng_disconnect_t(hook_p hook);
typedef int ng_rcvitem (node_p node, hook_p hook, item_p item);
/***********************************************************************
***************** Hook Structure and Methods **************************
***********************************************************************
*
* Structure of a hook
*/
struct ng_hook {
char hk_name[NG_HOOKSIZ]; /* what this node knows this link as */
void *hk_private; /* node-dependent ID for this hook */
int hk_flags; /* info about this hook/link */
int hk_type; /* tbd: hook data link type */
struct ng_hook *hk_peer; /* the other end of this link */
struct ng_node *hk_node; /* The node this hook is attached to */
LIST_ENTRY(ng_hook) hk_hooks; /* linked list of all hooks on node */
ng_rcvmsg_t *hk_rcvmsg; /* control messages come here */
ng_rcvdata_t *hk_rcvdata; /* data comes here */
int hk_refs; /* don't actually free this till 0 */
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
#define HK_MAGIC 0x78573011
int hk_magic;
char *lastfile;
int lastline;
SLIST_ENTRY(ng_hook) hk_all; /* all existing items */
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
};
/* Flags for a hook */
#define HK_INVALID 0x0001 /* don't trust it! */
#define HK_QUEUE 0x0002 /* queue for later delivery */
#define HK_FORCE_WRITER 0x0004 /* Incoming data queued as a writer */
#define HK_DEAD 0x0008 /* This is the dead hook; don't free */
#define HK_HI_STACK 0x0010 /* Hook has hi stack usage */
/*
* Public Methods for hook
* If you can't do it with these you probably shouldn't be doing it.
*/
void ng_unref_hook(hook_p hook); /* don't move this */
#define _NG_HOOK_REF(hook) atomic_add_int(&(hook)->hk_refs, 1)
#define _NG_HOOK_NAME(hook) ((hook)->hk_name)
#define _NG_HOOK_UNREF(hook) ng_unref_hook(hook)
#define _NG_HOOK_SET_PRIVATE(hook, val) do {(hook)->hk_private = val;} while (0)
#define _NG_HOOK_SET_RCVMSG(hook, val) do {(hook)->hk_rcvmsg = val;} while (0)
#define _NG_HOOK_SET_RCVDATA(hook, val) do {(hook)->hk_rcvdata = val;} while (0)
#define _NG_HOOK_PRIVATE(hook) ((hook)->hk_private)
#define _NG_HOOK_NOT_VALID(hook) ((hook)->hk_flags & HK_INVALID)
#define _NG_HOOK_IS_VALID(hook) (!((hook)->hk_flags & HK_INVALID))
#define _NG_HOOK_NODE(hook) ((hook)->hk_node) /* only rvalue! */
#define _NG_HOOK_PEER(hook) ((hook)->hk_peer) /* only rvalue! */
#define _NG_HOOK_FORCE_WRITER(hook) \
do { hook->hk_flags |= HK_FORCE_WRITER; } while (0)
#define _NG_HOOK_FORCE_QUEUE(hook) do { hook->hk_flags |= HK_QUEUE; } while (0)
#define _NG_HOOK_HI_STACK(hook) do { hook->hk_flags |= HK_HI_STACK; } while (0)
/* Some shortcuts */
#define NG_PEER_NODE(hook) NG_HOOK_NODE(NG_HOOK_PEER(hook))
#define NG_PEER_HOOK_NAME(hook) NG_HOOK_NAME(NG_HOOK_PEER(hook))
#define NG_PEER_NODE_NAME(hook) NG_NODE_NAME(NG_PEER_NODE(hook))
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
#define _NN_ __FILE__,__LINE__
void dumphook (hook_p hook, char *file, int line);
static __inline void _chkhook(hook_p hook, char *file, int line);
static __inline void _ng_hook_ref(hook_p hook, char * file, int line);
static __inline char * _ng_hook_name(hook_p hook, char * file, int line);
static __inline void _ng_hook_unref(hook_p hook, char * file, int line);
static __inline void _ng_hook_set_private(hook_p hook,
void * val, char * file, int line);
static __inline void _ng_hook_set_rcvmsg(hook_p hook,
ng_rcvmsg_t *val, char * file, int line);
static __inline void _ng_hook_set_rcvdata(hook_p hook,
ng_rcvdata_t *val, char * file, int line);
static __inline void * _ng_hook_private(hook_p hook, char * file, int line);
static __inline int _ng_hook_not_valid(hook_p hook, char * file, int line);
static __inline int _ng_hook_is_valid(hook_p hook, char * file, int line);
static __inline node_p _ng_hook_node(hook_p hook, char * file, int line);
static __inline hook_p _ng_hook_peer(hook_p hook, char * file, int line);
static __inline void _ng_hook_force_writer(hook_p hook, char * file,
int line);
static __inline void _ng_hook_force_queue(hook_p hook, char * file, int line);
static __inline void
_chkhook(hook_p hook, char *file, int line)
{
if (hook->hk_magic != HK_MAGIC) {
printf("Accessing freed hook ");
dumphook(hook, file, line);
}
hook->lastline = line;
hook->lastfile = file;
}
static __inline void
_ng_hook_ref(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_REF(hook);
}
static __inline char *
_ng_hook_name(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
return (_NG_HOOK_NAME(hook));
}
static __inline void
_ng_hook_unref(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_UNREF(hook);
}
static __inline void
_ng_hook_set_private(hook_p hook, void *val, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_SET_PRIVATE(hook, val);
}
static __inline void
_ng_hook_set_rcvmsg(hook_p hook, ng_rcvmsg_t *val, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_SET_RCVMSG(hook, val);
}
static __inline void
_ng_hook_set_rcvdata(hook_p hook, ng_rcvdata_t *val, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_SET_RCVDATA(hook, val);
}
static __inline void *
_ng_hook_private(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
return (_NG_HOOK_PRIVATE(hook));
}
static __inline int
_ng_hook_not_valid(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
return (_NG_HOOK_NOT_VALID(hook));
}
static __inline int
_ng_hook_is_valid(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
return (_NG_HOOK_IS_VALID(hook));
}
static __inline node_p
_ng_hook_node(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
return (_NG_HOOK_NODE(hook));
}
static __inline hook_p
_ng_hook_peer(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
return (_NG_HOOK_PEER(hook));
}
static __inline void
_ng_hook_force_writer(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_FORCE_WRITER(hook);
}
static __inline void
_ng_hook_force_queue(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_FORCE_QUEUE(hook);
}
static __inline void
_ng_hook_hi_stack(hook_p hook, char * file, int line)
{
_chkhook(hook, file, line);
_NG_HOOK_HI_STACK(hook);
}
#define NG_HOOK_REF(hook) _ng_hook_ref(hook, _NN_)
#define NG_HOOK_NAME(hook) _ng_hook_name(hook, _NN_)
#define NG_HOOK_UNREF(hook) _ng_hook_unref(hook, _NN_)
#define NG_HOOK_SET_PRIVATE(hook, val) _ng_hook_set_private(hook, val, _NN_)
#define NG_HOOK_SET_RCVMSG(hook, val) _ng_hook_set_rcvmsg(hook, val, _NN_)
#define NG_HOOK_SET_RCVDATA(hook, val) _ng_hook_set_rcvdata(hook, val, _NN_)
#define NG_HOOK_PRIVATE(hook) _ng_hook_private(hook, _NN_)
#define NG_HOOK_NOT_VALID(hook) _ng_hook_not_valid(hook, _NN_)
#define NG_HOOK_IS_VALID(hook) _ng_hook_is_valid(hook, _NN_)
#define NG_HOOK_NODE(hook) _ng_hook_node(hook, _NN_)
#define NG_HOOK_PEER(hook) _ng_hook_peer(hook, _NN_)
#define NG_HOOK_FORCE_WRITER(hook) _ng_hook_force_writer(hook, _NN_)
#define NG_HOOK_FORCE_QUEUE(hook) _ng_hook_force_queue(hook, _NN_)
#define NG_HOOK_HI_STACK(hook) _ng_hook_hi_stack(hook, _NN_)
#else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NG_HOOK_REF(hook) _NG_HOOK_REF(hook)
#define NG_HOOK_NAME(hook) _NG_HOOK_NAME(hook)
#define NG_HOOK_UNREF(hook) _NG_HOOK_UNREF(hook)
#define NG_HOOK_SET_PRIVATE(hook, val) _NG_HOOK_SET_PRIVATE(hook, val)
#define NG_HOOK_SET_RCVMSG(hook, val) _NG_HOOK_SET_RCVMSG(hook, val)
#define NG_HOOK_SET_RCVDATA(hook, val) _NG_HOOK_SET_RCVDATA(hook, val)
#define NG_HOOK_PRIVATE(hook) _NG_HOOK_PRIVATE(hook)
#define NG_HOOK_NOT_VALID(hook) _NG_HOOK_NOT_VALID(hook)
#define NG_HOOK_IS_VALID(hook) _NG_HOOK_IS_VALID(hook)
#define NG_HOOK_NODE(hook) _NG_HOOK_NODE(hook)
#define NG_HOOK_PEER(hook) _NG_HOOK_PEER(hook)
#define NG_HOOK_FORCE_WRITER(hook) _NG_HOOK_FORCE_WRITER(hook)
#define NG_HOOK_FORCE_QUEUE(hook) _NG_HOOK_FORCE_QUEUE(hook)
#define NG_HOOK_HI_STACK(hook) _NG_HOOK_HI_STACK(hook)
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
/***********************************************************************
***************** Node Structure and Methods **************************
***********************************************************************
* Structure of a node
* including the embedded queue structure.
*
* The structure for queueing Netgraph request items
* embedded in the node structure
*/
struct ng_queue {
u_int q_flags; /* Current r/w/q lock flags */
u_int q_flags2; /* Other queue flags */
struct mtx q_mtx;
STAILQ_ENTRY(ng_node) q_work; /* nodes with work to do */
STAILQ_HEAD(, ng_item) queue; /* the actual item queue */
};
struct ng_node {
char nd_name[NG_NODESIZ]; /* optional globally unique name */
struct ng_type *nd_type; /* the installed 'type' */
int nd_flags; /* see below for bit definitions */
int nd_numhooks; /* number of hooks */
void *nd_private; /* node-type-dependent node ID */
ng_ID_t nd_ID; /* Unique per node */
LIST_HEAD(hooks, ng_hook) nd_hooks; /* linked list of node hooks */
LIST_ENTRY(ng_node) nd_nodes; /* linked list of all nodes */
LIST_ENTRY(ng_node) nd_idnodes; /* ID hash collision list */
struct ng_queue nd_input_queue; /* input queue for locking */
int nd_refs; /* # of references to this node */
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
#define ND_MAGIC 0x59264837
int nd_magic;
char *lastfile;
int lastline;
SLIST_ENTRY(ng_node) nd_all; /* all existing nodes */
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
};
/* Flags for a node */
#define NGF_INVALID 0x00000001 /* free when refs go to 0 */
#define NG_INVALID NGF_INVALID /* compat for old code */
#define NGF_FORCE_WRITER 0x00000004 /* Never multithread this node */
#define NG_FORCE_WRITER NGF_FORCE_WRITER /* compat for old code */
#define NGF_CLOSING 0x00000008 /* ng_rmnode() at work */
#define NG_CLOSING NGF_CLOSING /* compat for old code */
#define NGF_REALLY_DIE 0x00000010 /* "persistent" node is unloading */
#define NG_REALLY_DIE NGF_REALLY_DIE /* compat for old code */
#define NGF_HI_STACK 0x00000020 /* node has hi stack usage */
#define NGF_TYPE1 0x10000000 /* reserved for type specific storage */
#define NGF_TYPE2 0x20000000 /* reserved for type specific storage */
#define NGF_TYPE3 0x40000000 /* reserved for type specific storage */
#define NGF_TYPE4 0x80000000 /* reserved for type specific storage */
/*
* Public methods for nodes.
* If you can't do it with these, you probably shouldn't be doing it.
*/
int ng_unref_node(node_p node); /* don't move this */
#define _NG_NODE_NAME(node) ((node)->nd_name + 0)
#define _NG_NODE_HAS_NAME(node) ((node)->nd_name[0] + 0)
#define _NG_NODE_ID(node) ((node)->nd_ID + 0)
#define _NG_NODE_REF(node) atomic_add_int(&(node)->nd_refs, 1)
#define _NG_NODE_UNREF(node) ng_unref_node(node)
#define _NG_NODE_SET_PRIVATE(node, val) do {(node)->nd_private = val;} while (0)
#define _NG_NODE_PRIVATE(node) ((node)->nd_private)
#define _NG_NODE_IS_VALID(node) (!((node)->nd_flags & NGF_INVALID))
#define _NG_NODE_NOT_VALID(node) ((node)->nd_flags & NGF_INVALID)
#define _NG_NODE_NUMHOOKS(node) ((node)->nd_numhooks + 0) /* rvalue */
#define _NG_NODE_FORCE_WRITER(node) \
do{ node->nd_flags |= NGF_FORCE_WRITER; }while (0)
#define _NG_NODE_HI_STACK(node) \
do{ node->nd_flags |= NGF_HI_STACK; }while (0)
#define _NG_NODE_REALLY_DIE(node) \
do{ node->nd_flags |= (NGF_REALLY_DIE|NGF_INVALID); }while (0)
#define _NG_NODE_REVIVE(node) \
do { node->nd_flags &= ~NGF_INVALID; } while (0)
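/*
* Illustrative sketch only (not part of this header): a node constructor
* typically allocates per-node state and attaches it with
* NG_NODE_SET_PRIVATE(). "priv_p" and "ng_example_constructor" are
* hypothetical names.
*
*	static int
*	ng_example_constructor(node_p node)
*	{
*		priv_p priv;
*
*		MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH,
*		    M_NOWAIT | M_ZERO);
*		if (priv == NULL)
*			return (ENOMEM);
*		NG_NODE_SET_PRIVATE(node, priv);
*		return (0);
*	}
*/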
/*
* The hook iterator.
* This macro will call a function of type ng_fn_eachhook for each
* hook attached to the node. If the function returns 0, then the
* iterator will stop and return a pointer to the hook that returned 0.
*/
typedef int ng_fn_eachhook(hook_p hook, void* arg);
#define _NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) \
do { \
hook_p _hook; \
(rethook) = NULL; \
LIST_FOREACH(_hook, &((node)->nd_hooks), hk_hooks) { \
if ((fn)(_hook, arg) == 0) { \
(rethook) = _hook; \
break; \
} \
} \
} while (0)
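/*
* Usage sketch for the iterator (hedged; ng_findhook() declared below
* already implements this particular search, it is shown only to
* illustrate the callback convention):
*
*	static int
*	match_name(hook_p hook, void *arg)
*	{
*		return (strcmp(NG_HOOK_NAME(hook), (const char *)arg) != 0);
*	}
*	...
*	hook_p found;
*	NG_NODE_FOREACH_HOOK(node, match_name, "uplink", found);
*
* The callback returns 0 on a match, so 'found' is now the hook named
* "uplink", or NULL if there is none.
*/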
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
void dumpnode(node_p node, char *file, int line);
static __inline void _chknode(node_p node, char *file, int line);
static __inline char * _ng_node_name(node_p node, char *file, int line);
static __inline int _ng_node_has_name(node_p node, char *file, int line);
static __inline ng_ID_t _ng_node_id(node_p node, char *file, int line);
static __inline void _ng_node_ref(node_p node, char *file, int line);
static __inline int _ng_node_unref(node_p node, char *file, int line);
static __inline void _ng_node_set_private(node_p node, void * val,
char *file, int line);
static __inline void * _ng_node_private(node_p node, char *file, int line);
static __inline int _ng_node_is_valid(node_p node, char *file, int line);
static __inline int _ng_node_not_valid(node_p node, char *file, int line);
static __inline int _ng_node_numhooks(node_p node, char *file, int line);
static __inline void _ng_node_force_writer(node_p node, char *file, int line);
static __inline hook_p _ng_node_foreach_hook(node_p node,
ng_fn_eachhook *fn, void *arg, char *file, int line);
static __inline void _ng_node_revive(node_p node, char *file, int line);
static __inline void
_chknode(node_p node, char *file, int line)
{
if (node->nd_magic != ND_MAGIC) {
printf("Accessing freed node ");
dumpnode(node, file, line);
}
node->lastline = line;
node->lastfile = file;
}
static __inline char *
_ng_node_name(node_p node, char *file, int line)
{
_chknode(node, file, line);
return(_NG_NODE_NAME(node));
}
static __inline int
_ng_node_has_name(node_p node, char *file, int line)
{
_chknode(node, file, line);
return(_NG_NODE_HAS_NAME(node));
}
static __inline ng_ID_t
_ng_node_id(node_p node, char *file, int line)
{
_chknode(node, file, line);
return(_NG_NODE_ID(node));
}
static __inline void
_ng_node_ref(node_p node, char *file, int line)
{
_chknode(node, file, line);
_NG_NODE_REF(node);
}
static __inline int
_ng_node_unref(node_p node, char *file, int line)
{
_chknode(node, file, line);
return (_NG_NODE_UNREF(node));
}
static __inline void
_ng_node_set_private(node_p node, void * val, char *file, int line)
{
_chknode(node, file, line);
_NG_NODE_SET_PRIVATE(node, val);
}
static __inline void *
_ng_node_private(node_p node, char *file, int line)
{
_chknode(node, file, line);
return (_NG_NODE_PRIVATE(node));
}
static __inline int
_ng_node_is_valid(node_p node, char *file, int line)
{
_chknode(node, file, line);
return(_NG_NODE_IS_VALID(node));
}
static __inline int
_ng_node_not_valid(node_p node, char *file, int line)
{
_chknode(node, file, line);
return(_NG_NODE_NOT_VALID(node));
}
static __inline int
_ng_node_numhooks(node_p node, char *file, int line)
{
_chknode(node, file, line);
return(_NG_NODE_NUMHOOKS(node));
}
static __inline void
_ng_node_force_writer(node_p node, char *file, int line)
{
_chknode(node, file, line);
_NG_NODE_FORCE_WRITER(node);
}
static __inline void
_ng_node_hi_stack(node_p node, char *file, int line)
{
_chknode(node, file, line);
_NG_NODE_HI_STACK(node);
}
static __inline void
_ng_node_really_die(node_p node, char *file, int line)
{
_chknode(node, file, line);
_NG_NODE_REALLY_DIE(node);
}
static __inline void
_ng_node_revive(node_p node, char *file, int line)
{
_chknode(node, file, line);
_NG_NODE_REVIVE(node);
}
static __inline hook_p
_ng_node_foreach_hook(node_p node, ng_fn_eachhook *fn, void *arg,
char *file, int line)
{
hook_p hook;
_chknode(node, file, line);
_NG_NODE_FOREACH_HOOK(node, fn, arg, hook);
return (hook);
}
#define NG_NODE_NAME(node) _ng_node_name(node, _NN_)
#define NG_NODE_HAS_NAME(node) _ng_node_has_name(node, _NN_)
#define NG_NODE_ID(node) _ng_node_id(node, _NN_)
#define NG_NODE_REF(node) _ng_node_ref(node, _NN_)
#define NG_NODE_UNREF(node) _ng_node_unref(node, _NN_)
#define NG_NODE_SET_PRIVATE(node, val) _ng_node_set_private(node, val, _NN_)
#define NG_NODE_PRIVATE(node) _ng_node_private(node, _NN_)
#define NG_NODE_IS_VALID(node) _ng_node_is_valid(node, _NN_)
#define NG_NODE_NOT_VALID(node) _ng_node_not_valid(node, _NN_)
#define NG_NODE_FORCE_WRITER(node) _ng_node_force_writer(node, _NN_)
#define NG_NODE_HI_STACK(node) _ng_node_hi_stack(node, _NN_)
#define NG_NODE_REALLY_DIE(node) _ng_node_really_die(node, _NN_)
#define NG_NODE_NUMHOOKS(node) _ng_node_numhooks(node, _NN_)
#define NG_NODE_REVIVE(node) _ng_node_revive(node, _NN_)
#define NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) \
do { \
rethook = _ng_node_foreach_hook(node, fn, (void *)arg, _NN_); \
} while (0)
#else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NG_NODE_NAME(node) _NG_NODE_NAME(node)
#define NG_NODE_HAS_NAME(node) _NG_NODE_HAS_NAME(node)
#define NG_NODE_ID(node) _NG_NODE_ID(node)
#define NG_NODE_REF(node) _NG_NODE_REF(node)
#define NG_NODE_UNREF(node) _NG_NODE_UNREF(node)
#define NG_NODE_SET_PRIVATE(node, val) _NG_NODE_SET_PRIVATE(node, val)
#define NG_NODE_PRIVATE(node) _NG_NODE_PRIVATE(node)
#define NG_NODE_IS_VALID(node) _NG_NODE_IS_VALID(node)
#define NG_NODE_NOT_VALID(node) _NG_NODE_NOT_VALID(node)
#define NG_NODE_FORCE_WRITER(node) _NG_NODE_FORCE_WRITER(node)
#define NG_NODE_HI_STACK(node) _NG_NODE_HI_STACK(node)
#define NG_NODE_REALLY_DIE(node) _NG_NODE_REALLY_DIE(node)
#define NG_NODE_NUMHOOKS(node) _NG_NODE_NUMHOOKS(node)
#define NG_NODE_REVIVE(node) _NG_NODE_REVIVE(node)
#define NG_NODE_FOREACH_HOOK(node, fn, arg, rethook) \
_NG_NODE_FOREACH_HOOK(node, fn, arg, rethook)
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
/***********************************************************************
************* Node Queue and Item Structures and Methods **************
***********************************************************************
*
*/
typedef void ng_item_fn(node_p node, hook_p hook, void *arg1, int arg2);
typedef int ng_item_fn2(node_p node, struct ng_item *item, hook_p hook);
typedef void ng_apply_t(void *context, int error);
struct ng_apply_info {
ng_apply_t *apply;
void *context;
int refs;
int error;
};
struct ng_item {
u_long el_flags;
STAILQ_ENTRY(ng_item) el_next;
node_p el_dest; /* The node it will be applied against (or NULL) */
hook_p el_hook; /* Entering hook. Optional in Control messages */
union {
struct mbuf *da_m;
struct {
struct ng_mesg *msg_msg;
ng_ID_t msg_retaddr;
} msg;
struct {
union {
ng_item_fn *fn_fn;
ng_item_fn2 *fn_fn2;
} fn_fn;
void *fn_arg1;
int fn_arg2;
} fn;
} body;
/*
* Optional callback called when item is being applied,
* and its context.
*/
struct ng_apply_info *apply;
u_int depth;
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
char *lastfile;
int lastline;
TAILQ_ENTRY(ng_item) all; /* all existing items */
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
};
#define NGQF_TYPE 0x03 /* MASK of content definition */
#define NGQF_MESG 0x00 /* the queue element is a message */
#define NGQF_DATA 0x01 /* the queue element is data */
#define NGQF_FN 0x02 /* the queue element is a function */
#define NGQF_FN2 0x03 /* the queue element is a new function */
#define NGQF_RW 0x04 /* MASK for wanted queue mode */
#define NGQF_READER 0x04 /* wants to be a reader */
#define NGQF_WRITER 0x00 /* wants to be a writer */
#define NGQF_QMODE 0x08 /* MASK for how it was queued */
#define NGQF_QREADER 0x08 /* was queued as a reader */
#define NGQF_QWRITER 0x00 /* was queued as a writer */
/*
* Get the mbuf (etc.) out of an item.
* The NGI_GET_*() macros below also set the value in the item to NULL, so
* that a later NG_FREE_ITEM() cannot free the same thing twice.
* If you don't zero out the item, realise that the item still owns
* whatever it references.
* Retaddr is different: there are no references on it, it's just a number.
* The debug versions must be used either everywhere or not at all.
*/
#define _NGI_M(i) ((i)->body.da_m)
#define _NGI_MSG(i) ((i)->body.msg.msg_msg)
#define _NGI_RETADDR(i) ((i)->body.msg.msg_retaddr)
#define _NGI_FN(i) ((i)->body.fn.fn_fn.fn_fn)
#define _NGI_FN2(i) ((i)->body.fn.fn_fn.fn_fn2)
#define _NGI_ARG1(i) ((i)->body.fn.fn_arg1)
#define _NGI_ARG2(i) ((i)->body.fn.fn_arg2)
#define _NGI_NODE(i) ((i)->el_dest)
#define _NGI_HOOK(i) ((i)->el_hook)
#define _NGI_SET_HOOK(i,h) do { _NGI_HOOK(i) = h; h = NULL;} while (0)
#define _NGI_CLR_HOOK(i) do { \
hook_p _hook = _NGI_HOOK(i); \
if (_hook) { \
_NG_HOOK_UNREF(_hook); \
_NGI_HOOK(i) = NULL; \
} \
} while (0)
#define _NGI_SET_NODE(i,n) do { _NGI_NODE(i) = n; n = NULL;} while (0)
#define _NGI_CLR_NODE(i) do { \
node_p _node = _NGI_NODE(i); \
if (_node) { \
_NG_NODE_UNREF(_node); \
_NGI_NODE(i) = NULL; \
} \
} while (0)
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
void dumpitem(item_p item, char *file, int line);
static __inline void _ngi_check(item_p item, char *file, int line) ;
static __inline struct mbuf ** _ngi_m(item_p item, char *file, int line) ;
static __inline ng_ID_t * _ngi_retaddr(item_p item, char *file, int line);
static __inline struct ng_mesg ** _ngi_msg(item_p item, char *file, int line) ;
static __inline ng_item_fn ** _ngi_fn(item_p item, char *file, int line) ;
static __inline ng_item_fn2 ** _ngi_fn2(item_p item, char *file, int line) ;
static __inline void ** _ngi_arg1(item_p item, char *file, int line) ;
static __inline int * _ngi_arg2(item_p item, char *file, int line) ;
static __inline node_p _ngi_node(item_p item, char *file, int line);
static __inline hook_p _ngi_hook(item_p item, char *file, int line);
static __inline void
_ngi_check(item_p item, char *file, int line)
{
(item)->lastline = line;
(item)->lastfile = file;
}
static __inline struct mbuf **
_ngi_m(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_M(item));
}
static __inline struct ng_mesg **
_ngi_msg(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_MSG(item));
}
static __inline ng_ID_t *
_ngi_retaddr(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_RETADDR(item));
}
static __inline ng_item_fn **
_ngi_fn(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_FN(item));
}
static __inline ng_item_fn2 **
_ngi_fn2(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_FN2(item));
}
static __inline void **
_ngi_arg1(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_ARG1(item));
}
static __inline int *
_ngi_arg2(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (&_NGI_ARG2(item));
}
static __inline node_p
_ngi_node(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (_NGI_NODE(item));
}
static __inline hook_p
_ngi_hook(item_p item, char *file, int line)
{
_ngi_check(item, file, line);
return (_NGI_HOOK(item));
}
#define NGI_M(i) (*_ngi_m(i, _NN_))
#define NGI_MSG(i) (*_ngi_msg(i, _NN_))
#define NGI_RETADDR(i) (*_ngi_retaddr(i, _NN_))
#define NGI_FN(i) (*_ngi_fn(i, _NN_))
#define NGI_FN2(i) (*_ngi_fn2(i, _NN_))
#define NGI_ARG1(i) (*_ngi_arg1(i, _NN_))
#define NGI_ARG2(i) (*_ngi_arg2(i, _NN_))
#define NGI_HOOK(i) _ngi_hook(i, _NN_)
#define NGI_NODE(i) _ngi_node(i, _NN_)
#define NGI_SET_HOOK(i,h) \
do { _ngi_check(i, _NN_); _NGI_SET_HOOK(i, h); } while (0)
#define NGI_CLR_HOOK(i) \
do { _ngi_check(i, _NN_); _NGI_CLR_HOOK(i); } while (0)
#define NGI_SET_NODE(i,n) \
do { _ngi_check(i, _NN_); _NGI_SET_NODE(i, n); } while (0)
#define NGI_CLR_NODE(i) \
do { _ngi_check(i, _NN_); _NGI_CLR_NODE(i); } while (0)
#define NG_FREE_ITEM(item) \
do { \
_ngi_check(item, _NN_); \
ng_free_item((item)); \
} while (0)
#define SAVE_LINE(item) \
do { \
(item)->lastline = __LINE__; \
(item)->lastfile = __FILE__; \
} while (0)
#else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NGI_M(i) _NGI_M(i)
#define NGI_MSG(i) _NGI_MSG(i)
#define NGI_RETADDR(i) _NGI_RETADDR(i)
#define NGI_FN(i) _NGI_FN(i)
#define NGI_FN2(i) _NGI_FN2(i)
#define NGI_ARG1(i) _NGI_ARG1(i)
#define NGI_ARG2(i) _NGI_ARG2(i)
#define NGI_NODE(i) _NGI_NODE(i)
#define NGI_HOOK(i) _NGI_HOOK(i)
#define NGI_SET_HOOK(i,h) _NGI_SET_HOOK(i,h)
#define NGI_CLR_HOOK(i) _NGI_CLR_HOOK(i)
#define NGI_SET_NODE(i,n) _NGI_SET_NODE(i,n)
#define NGI_CLR_NODE(i) _NGI_CLR_NODE(i)
#define NG_FREE_ITEM(item) ng_free_item((item))
#define SAVE_LINE(item) do {} while (0)
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NGI_GET_M(i,m) \
do { \
(m) = NGI_M(i); \
_NGI_M(i) = NULL; \
} while (0)
#define NGI_GET_MSG(i,m) \
do { \
(m) = NGI_MSG(i); \
_NGI_MSG(i) = NULL; \
} while (0)
#define NGI_GET_NODE(i,n) /* YOU NOW HAVE THE REFERENCE */ \
do { \
(n) = NGI_NODE(i); \
_NGI_NODE(i) = NULL; \
} while (0)
#define NGI_GET_HOOK(i,h) \
do { \
(h) = NGI_HOOK(i); \
_NGI_HOOK(i) = NULL; \
} while (0)
#define NGI_SET_WRITER(i) ((i)->el_flags &= ~NGQF_QMODE)
#define NGI_SET_READER(i) ((i)->el_flags |= NGQF_QREADER)
#define NGI_QUEUED_READER(i) ((i)->el_flags & NGQF_QREADER)
#define NGI_QUEUED_WRITER(i) (((i)->el_flags & NGQF_QMODE) == NGQF_QWRITER)
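/*
* Ownership sketch (illustrative): once NGI_GET_M() has been used, the
* item no longer references the mbuf, so NG_FREE_ITEM() cannot free it a
* second time and the caller must dispose of it itself.
*
*	struct mbuf *m;
*
*	NGI_GET_M(item, m);	(item no longer owns the mbuf)
*	NG_FREE_ITEM(item);	(safe: the mbuf cannot be freed twice)
*	... use m, then m_freem(m) when finished ...
*/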
/**********************************************************************
* Data macros. Send, manipulate and free.
**********************************************************************/
/*
* Assuming the data is already ok, just set the new address and send.
*/
#define NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, flags) \
do { \
(error) = \
ng_address_hook(NULL, (item), (hook), NG_NOFLAGS); \
if (error == 0) { \
SAVE_LINE(item); \
(error) = ng_snd_item((item), (flags)); \
} \
(item) = NULL; \
} while (0)
#define NG_FWD_ITEM_HOOK(error, item, hook) \
NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, NG_NOFLAGS)
/*
* Forward a data packet. The mbuf pointer is updated to the new value. We
* presume you dealt with the old one when you updated it to the new one
* (or it may be the old one). We got a packet and possibly had to modify
* the mbuf. You should probably use NGI_GET_M() if you are going to use
* this too.
*/
#define NG_FWD_NEW_DATA_FLAGS(error, item, hook, m, flags) \
do { \
NGI_M(item) = (m); \
(m) = NULL; \
NG_FWD_ITEM_HOOK_FLAGS(error, item, hook, flags); \
} while (0)
#define NG_FWD_NEW_DATA(error, item, hook, m) \
NG_FWD_NEW_DATA_FLAGS(error, item, hook, m, NG_NOFLAGS)
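/*
* Hedged example of a typical rcvdata method using the above: take the
* mbuf out, prepend a (hypothetical) header, and forward it out another
* hook. "priv_p", "priv->other_hook" and "struct myhdr" are assumptions
* for illustration only.
*
*	static int
*	ng_example_rcvdata(hook_p hook, item_p item)
*	{
*		const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
*		struct mbuf *m;
*		int error;
*
*		NGI_GET_M(item, m);
*		M_PREPEND(m, sizeof(struct myhdr), M_DONTWAIT);
*		if (m == NULL) {
*			NG_FREE_ITEM(item);
*			return (ENOBUFS);
*		}
*		NG_FWD_NEW_DATA(error, item, priv->other_hook, m);
*		return (error);
*	}
*/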
/* Send a previously unpackaged mbuf. XXX: This should be called
* NG_SEND_DATA in the future, but this name is kept for compatibility
* reasons.
*/
#define NG_SEND_DATA_FLAGS(error, hook, m, flags) \
do { \
item_p _item; \
if ((_item = ng_package_data((m), flags))) { \
NG_FWD_ITEM_HOOK_FLAGS(error, _item, hook, flags);\
} else { \
(error) = ENOMEM; \
} \
(m) = NULL; \
} while (0)
#define NG_SEND_DATA_ONLY(error, hook, m) \
NG_SEND_DATA_FLAGS(error, hook, m, NG_NOFLAGS)
/* NG_SEND_DATA() compat for meta-data times */
#define NG_SEND_DATA(error, hook, m, x) \
NG_SEND_DATA_FLAGS(error, hook, m, NG_NOFLAGS)
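/*
* Sketch: originating data where no item exists yet (e.g. an interface
* node's output path). The macro consumes the mbuf in all cases, so (m)
* is NULL afterwards whether or not the send succeeded.
*
*	int error;
*
*	NG_SEND_DATA_ONLY(error, hook, m);
*/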
#define NG_FREE_MSG(msg) \
do { \
if ((msg)) { \
FREE((msg), M_NETGRAPH_MSG); \
(msg) = NULL; \
} \
} while (0)
#define NG_FREE_M(m) \
do { \
if ((m)) { \
m_freem((m)); \
(m) = NULL; \
} \
} while (0)
/*****************************************
* Message macros
*****************************************/
#define NG_SEND_MSG_HOOK(error, here, msg, hook, retaddr) \
do { \
item_p _item; \
if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\
(msg) = NULL; \
(error) = ENOMEM; \
break; \
} \
if (((error) = ng_address_hook((here), (_item), \
(hook), (retaddr))) == 0) { \
SAVE_LINE(_item); \
(error) = ng_snd_item((_item), 0); \
} \
(msg) = NULL; \
} while (0)
#define NG_SEND_MSG_PATH(error, here, msg, path, retaddr) \
do { \
item_p _item; \
if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\
(msg) = NULL; \
(error) = ENOMEM; \
break; \
} \
if (((error) = ng_address_path((here), (_item), \
(path), (retaddr))) == 0) { \
SAVE_LINE(_item); \
(error) = ng_snd_item((_item), 0); \
} \
(msg) = NULL; \
} while (0)
#define NG_SEND_MSG_ID(error, here, msg, ID, retaddr) \
do { \
item_p _item; \
if ((_item = ng_package_msg(msg, NG_NOFLAGS)) == NULL) {\
(msg) = NULL; \
(error) = ENOMEM; \
break; \
} \
if (((error) = ng_address_ID((here), (_item), \
(ID), (retaddr))) == 0) { \
SAVE_LINE(_item); \
(error) = ng_snd_item((_item), 0); \
} \
(msg) = NULL; \
} while (0)
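/*
* Sketch of sending a control message out a hook, using NG_MKMESSAGE()
* from ng_message.h to allocate it. The cookie and command values are
* hypothetical; the send macro consumes (msg) whether or not it succeeds.
*
*	struct ng_mesg *msg;
*	int error;
*
*	NG_MKMESSAGE(msg, NGM_EXAMPLE_COOKIE, NGM_EXAMPLE_GET_STATUS,
*	    0, M_NOWAIT);
*	if (msg == NULL)
*		return (ENOMEM);
*	NG_SEND_MSG_HOOK(error, node, msg, hook, 0);
*/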
/*
* Redirect the message to the next hop using the given hook.
* This macro frees the item if there is an error and sets (error) to an
* error code; (error) is 0 on success. (item) is cleared in either case.
*/
#define NG_FWD_MSG_HOOK(error, here, item, hook, retaddr) \
do { \
if (((error) = ng_address_hook((here), (item), \
(hook), (retaddr))) == 0) { \
SAVE_LINE(item); \
(error) = ng_snd_item((item), 0); \
} \
(item) = NULL; \
} while (0)
/*
* Send a queue item back to its originator with a response message.
* Assume the original message was removed and freed separately.
*/
#define NG_RESPOND_MSG(error, here, item, resp) \
do { \
if (resp) { \
ng_ID_t _dest = NGI_RETADDR(item); \
NGI_RETADDR(item) = 0; \
NGI_MSG(item) = resp; \
if ((error = ng_address_ID((here), (item), \
_dest, 0)) == 0) { \
SAVE_LINE(item); \
(error) = ng_snd_item((item), NG_QUEUE);\
} \
} else \
NG_FREE_ITEM(item); \
(item) = NULL; \
} while (0)
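/*
* Sketch of a rcvmsg method replying via NG_RESPOND_MSG(), using
* NG_MKRESPONSE() from ng_message.h. The names are hypothetical and
* filling in the response payload is left out for brevity.
*
*	static int
*	ng_example_rcvmsg(node_p node, item_p item, hook_p lasthook)
*	{
*		struct ng_mesg *resp = NULL;
*		struct ng_mesg *msg;
*		int error = 0;
*
*		NGI_GET_MSG(item, msg);
*		NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
*		if (resp == NULL)
*			error = ENOMEM;
*		NG_RESPOND_MSG(error, node, item, resp);
*		NG_FREE_MSG(msg);	(original freed separately)
*		return (error);
*	}
*/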
/***********************************************************************
******** Structures Definitions and Macros for defining a node *******
***********************************************************************
*
* Here we define the structures needed to actually define a new node
* type.
*/
/*
* Command list -- each node type specifies the command that it knows
* how to convert between ASCII and binary using an array of these.
* The last element in the array must be a terminator with cookie=0.
*/
struct ng_cmdlist {
u_int32_t cookie; /* command typecookie */
int cmd; /* command number */
const char *name; /* command name */
const struct ng_parse_type *mesgType; /* args if !NGF_RESP */
const struct ng_parse_type *respType; /* args if NGF_RESP */
};
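/*
* Sketch of a type's command list (the cookie, command and parse types
* are illustrative); note the mandatory all-zero terminator entry:
*
*	static const struct ng_cmdlist ng_example_cmds[] = {
*		{
*		  NGM_EXAMPLE_COOKIE,
*		  NGM_EXAMPLE_GET_STATUS,
*		  "getstatus",
*		  NULL,			(no arguments)
*		  &ng_parse_uint32_type	(response is a uint32)
*		},
*		{ 0 }			(terminator, cookie == 0)
*	};
*/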
/*
* Structure of a node type.
* If data is sent to the "rcvdata()" entrypoint then the system
* may decide to defer it until later by queueing it with the normal netgraph
* input queueing system. This is decided by the HK_QUEUE flag being set in
* the flags word of the peer (receiving) hook. The dequeueing mechanism will
* ensure it is not requeued again.
* Note the input queueing system exists to allow modules
* to 'release the stack' or to pass data across spl layers.
* The data will be redelivered as soon as the NETISR code runs,
* which may be almost immediately. A node may also do its own queueing
* for other reasons (e.g. device output queuing).
*/
struct ng_type {
u_int32_t version; /* must equal NG_API_VERSION */
const char *name; /* Unique type name */
modeventhand_t mod_event; /* Module event handler (optional) */
ng_constructor_t *constructor; /* Node constructor */
ng_rcvmsg_t *rcvmsg; /* control messages come here */
ng_close_t *close; /* warn about forthcoming shutdown */
ng_shutdown_t *shutdown; /* reset, and free resources */
ng_newhook_t *newhook; /* first notification of new hook */
ng_findhook_t *findhook; /* only if you have lots of hooks */
ng_connect_t *connect; /* final notification of new hook */
ng_rcvdata_t *rcvdata; /* data comes here */
ng_disconnect_t *disconnect; /* notify on disconnect */
const struct ng_cmdlist *cmdlist; /* commands we can convert */
/* R/W data private to the base netgraph code DON'T TOUCH! */
LIST_ENTRY(ng_type) types; /* linked list of all types */
int refs; /* number of instances */
};
/*
* Use the NETGRAPH_INIT() macro to link a node type into the
* netgraph system. This works for types compiled into the kernel
* as well as KLD modules. The first argument should be the type
* name (eg, echo) and the second a pointer to the type struct.
*
* If a different link time is desired, e.g., a device driver that
* needs to install its netgraph type before probing, use the
* NETGRAPH_INIT_ORDERED() macro instead. Device drivers probably
* want to use SI_SUB_DRIVERS/SI_ORDER_FIRST.
*/
#define NETGRAPH_INIT_ORDERED(typename, typestructp, sub, order) \
static moduledata_t ng_##typename##_mod = { \
"ng_" #typename, \
ng_mod_event, \
(typestructp) \
}; \
DECLARE_MODULE(ng_##typename, ng_##typename##_mod, sub, order); \
MODULE_DEPEND(ng_##typename, netgraph, NG_ABI_VERSION, \
NG_ABI_VERSION, \
NG_ABI_VERSION)
#define NETGRAPH_INIT(tn, tp) \
NETGRAPH_INIT_ORDERED(tn, tp, SI_SUB_PSEUDO, SI_ORDER_ANY)
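/*
* Sketch of declaring and registering a type (all ng_example_* symbols
* are hypothetical; each method must match the prototypes in struct
* ng_type above):
*
*	static struct ng_type ng_example_typestruct = {
*		.version =	NG_ABI_VERSION,
*		.name =		"example",
*		.constructor =	ng_example_constructor,
*		.rcvmsg =	ng_example_rcvmsg,
*		.shutdown =	ng_example_shutdown,
*		.newhook =	ng_example_newhook,
*		.rcvdata =	ng_example_rcvdata,
*		.disconnect =	ng_example_disconnect,
*		.cmdlist =	ng_example_cmds,
*	};
*	NETGRAPH_INIT(example, &ng_example_typestruct);
*/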
/* Special malloc() type for netgraph structs and ctrl messages */
/* Only these two types should be visible to nodes */
MALLOC_DECLARE(M_NETGRAPH);
MALLOC_DECLARE(M_NETGRAPH_MSG);
/* Declare the base of the netgraph sysctl hierarchy, */
/* but only if this file cares about sysctls. */
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_graph);
#endif
/*
* Methods that the nodes can use.
* Many of these methods should usually NOT be used directly, but via
* the macros above.
*/
int ng_address_ID(node_p here, item_p item, ng_ID_t ID, ng_ID_t retaddr);
int ng_address_hook(node_p here, item_p item, hook_p hook, ng_ID_t retaddr);
int ng_address_path(node_p here, item_p item, char *address, ng_ID_t raddr);
int ng_bypass(hook_p hook1, hook_p hook2);
hook_p ng_findhook(node_p node, const char *name);
struct ng_type *ng_findtype(const char *type);
int ng_make_node_common(struct ng_type *typep, node_p *nodep);
int ng_name_node(node_p node, const char *name);
int ng_newtype(struct ng_type *tp);
ng_ID_t ng_node2ID(node_p node);
item_p ng_package_data(struct mbuf *m, int flags);
item_p ng_package_msg(struct ng_mesg *msg, int flags);
item_p ng_package_msg_self(node_p here, hook_p hook, struct ng_mesg *msg);
void ng_replace_retaddr(node_p here, item_p item, ng_ID_t retaddr);
int ng_rmhook_self(hook_p hook); /* if a node wants to kill a hook */
int ng_rmnode_flags(node_p here, int flags);
int ng_rmnode_self(node_p here); /* if a node wants to suicide */
int ng_rmtype(struct ng_type *tp);
int ng_snd_item(item_p item, int queue);
int ng_send_fn(node_p node, hook_p hook, ng_item_fn *fn, void *arg1,
int arg2);
int ng_send_fn1(node_p node, hook_p hook, ng_item_fn *fn, void *arg1,
int arg2, int flags);
int ng_send_fn2(node_p node, hook_p hook, item_p pitem, ng_item_fn2 *fn,
void *arg1, int arg2, int flags);
int ng_uncallout(struct callout *c, node_p node);
int ng_callout(struct callout *c, node_p node, hook_p hook, int ticks,
ng_item_fn *fn, void * arg1, int arg2);
#define ng_callout_init(c) callout_init(c, CALLOUT_MPSAFE)
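/*
* Sketch of the node-safe callout wrappers (the callout and the
* ng_example_tick handler, an ng_item_fn, are hypothetical and would
* typically live in the node's private data):
*
*	ng_callout_init(&priv->timer);
*	ng_callout(&priv->timer, node, NULL, hz, ng_example_tick, NULL, 0);
*	...
*	ng_uncallout(&priv->timer, node);	(cancel, e.g. at shutdown)
*/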
/* Flags for netgraph functions. */
#define NG_NOFLAGS 0x00000000 /* no special options */
#define NG_QUEUE 0x00000001 /* enqueue item, don't dispatch */
#define NG_WAITOK 0x00000002 /* use M_WAITOK, etc. */
/* XXXGL: NG_PROGRESS unused since ng_base.c rev. 1.136. Should be deleted? */
#define NG_PROGRESS 0x00000004 /* return EINPROGRESS if queued */
#define NG_REUSE_ITEM 0x00000008 /* supplied item should be reused */
/*
* prototypes the user should DEFINITELY not use directly
*/
void ng_free_item(item_p item); /* Use NG_FREE_ITEM instead */
int ng_mod_event(module_t mod, int what, void *arg);
/*
* Tag definitions and constants
*/
#define NG_TAG_PRIO 1
struct ng_tag_prio {
struct m_tag tag;
char priority;
char discardability;
};
#define NG_PRIO_CUTOFF 32
#define NG_PRIO_LINKSTATE 64
/* Macros and declarations to keep compatibility with metadata, which
* is now obsolete. To be deleted.
*/
typedef void *meta_p;
#define _NGI_META(i) NULL
#define NGI_META(i) NULL
#define NG_FREE_META(meta)
#define NGI_GET_META(i,m)
#define ng_copy_meta(meta) NULL
+/* Hash related definitions */
+#define NG_ID_HASH_SIZE 128 /* most systems won't need even this many */
+
+/* Virtualization macros */
+#define INIT_VNET_NETGRAPH(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_NETGRAPH, \
+ struct vnet_netgraph, vnet_netgraph)
+
+#define VNET_NETGRAPH(sym) VSYM(vnet_netgraph, sym)
+
+/* Symbol translation macros */
+#define V_nextID VNET_NETGRAPH(nextID)
+#define V_ng_ID_hash VNET_NETGRAPH(ng_ID_hash)
+#define V_ng_eiface_unit VNET_NETGRAPH(ng_eiface_unit)
+#define V_ng_iface_unit VNET_NETGRAPH(ng_iface_unit)
+#define V_ng_name_hash VNET_NETGRAPH(ng_name_hash)
+#define V_ng_nodelist VNET_NETGRAPH(ng_nodelist)
+#define V_ng_wormhole_unit VNET_NETGRAPH(ng_wormhole_unit)
+
#endif /* _NETGRAPH_NETGRAPH_H_ */
Index: head/sys/netgraph/ng_base.c
===================================================================
--- head/sys/netgraph/ng_base.c (revision 183549)
+++ head/sys/netgraph/ng_base.c (revision 183550)
@@ -1,3725 +1,3729 @@
/*
* ng_base.c
*/
/*-
* Copyright (c) 1996-1999 Whistle Communications, Inc.
* All rights reserved.
*
* Subject to the following obligations and disclaimer of warranty, use and
* redistribution of this software, in source or object code forms, with or
* without modifications are expressly permitted by Whistle Communications;
* provided, however, that:
* 1. Any and all reproductions of the source or object code must include the
* copyright notice above and the following disclaimer of warranties; and
* 2. No rights are granted, in any manner or form, to use Whistle
* Communications, Inc. trademarks, including the mark "WHISTLE
* COMMUNICATIONS" on advertising, endorsements, or otherwise except as
* such appears in the above copyright notice or in the software.
*
* THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
* TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
* REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
* INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
* REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
* SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
* IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
* RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
* WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
* OF SUCH DAMAGE.
*
* Authors: Julian Elischer <julian@freebsd.org>
* Archie Cobbs <archie@freebsd.org>
*
* $FreeBSD$
* $Whistle: ng_base.c,v 1.39 1999/01/28 23:54:53 julian Exp $
*/
/*
* This file implements the base netgraph code.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ctype.h>
#include <sys/errno.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/refcount.h>
#include <sys/proc.h>
#include <sys/vimage.h>
#include <machine/cpu.h>
#include <net/netisr.h>
#include <netgraph/ng_message.h>
#include <netgraph/netgraph.h>
#include <netgraph/ng_parse.h>
MODULE_VERSION(netgraph, NG_ABI_VERSION);
/* Mutex to protect topology events. */
static struct mtx ng_topo_mtx;
#ifdef NETGRAPH_DEBUG
static struct mtx ng_nodelist_mtx; /* protects global node/hook lists */
static struct mtx ngq_mtx; /* protects the queue item list */
static SLIST_HEAD(, ng_node) ng_allnodes;
static LIST_HEAD(, ng_node) ng_freenodes; /* in debug, we never free() them */
static SLIST_HEAD(, ng_hook) ng_allhooks;
static LIST_HEAD(, ng_hook) ng_freehooks; /* in debug, we never free() them */
static void ng_dumpitems(void);
static void ng_dumpnodes(void);
static void ng_dumphooks(void);
#endif /* NETGRAPH_DEBUG */
/*
* DEAD versions of the structures.
* In order to avoid races, it is sometimes necessary to point
* at SOMETHING even though, theoretically, the current entity is
* INVALID. Use these to avoid those races.
*/
struct ng_type ng_deadtype = {
NG_ABI_VERSION,
"dead",
NULL, /* modevent */
NULL, /* constructor */
NULL, /* rcvmsg */
NULL, /* shutdown */
NULL, /* newhook */
NULL, /* findhook */
NULL, /* connect */
NULL, /* rcvdata */
NULL, /* disconnect */
NULL, /* cmdlist */
};
struct ng_node ng_deadnode = {
"dead",
&ng_deadtype,
NGF_INVALID,
0, /* numhooks */
NULL, /* private */
0, /* ID */
LIST_HEAD_INITIALIZER(ng_deadnode.hooks),
{}, /* all_nodes list entry */
{}, /* id hashtable list entry */
{ 0,
0,
{}, /* should never use! (should hang) */
{}, /* workqueue entry */
STAILQ_HEAD_INITIALIZER(ng_deadnode.nd_input_queue.queue),
},
1, /* refs */
#ifdef NETGRAPH_DEBUG
ND_MAGIC,
__FILE__,
__LINE__,
{NULL}
#endif /* NETGRAPH_DEBUG */
};
struct ng_hook ng_deadhook = {
"dead",
NULL, /* private */
HK_INVALID | HK_DEAD,
0, /* undefined data link type */
&ng_deadhook, /* Peer is self */
&ng_deadnode, /* attached to deadnode */
{}, /* hooks list */
NULL, /* override rcvmsg() */
NULL, /* override rcvdata() */
1, /* refs always >= 1 */
#ifdef NETGRAPH_DEBUG
HK_MAGIC,
__FILE__,
__LINE__,
{NULL}
#endif /* NETGRAPH_DEBUG */
};
/*
* END DEAD STRUCTURES
*/
/* List nodes with unallocated work */
static STAILQ_HEAD(, ng_node) ng_worklist = STAILQ_HEAD_INITIALIZER(ng_worklist);
static struct mtx ng_worklist_mtx; /* MUST LOCK NODE FIRST */
/* List of installed types */
static LIST_HEAD(, ng_type) ng_typelist;
static struct mtx ng_typelist_mtx;
/* Hash related definitions */
/* XXX Don't need to initialise them because it's a LIST */
-#define NG_ID_HASH_SIZE 128 /* most systems wont need even this many */
static LIST_HEAD(, ng_node) ng_ID_hash[NG_ID_HASH_SIZE];
static struct mtx ng_idhash_mtx;
/* Method to find a node... used twice, so do it here */
#define NG_IDHASH_FN(ID) ((ID) % (NG_ID_HASH_SIZE))
#define NG_IDHASH_FIND(ID, node) \
do { \
mtx_assert(&ng_idhash_mtx, MA_OWNED); \
LIST_FOREACH(node, &V_ng_ID_hash[NG_IDHASH_FN(ID)], \
nd_idnodes) { \
if (NG_NODE_IS_VALID(node) \
&& (NG_NODE_ID(node) == ID)) { \
break; \
} \
} \
} while (0)
#define NG_NAME_HASH_SIZE 128 /* most systems won't need even this many */
static LIST_HEAD(, ng_node) ng_name_hash[NG_NAME_HASH_SIZE];
static struct mtx ng_namehash_mtx;
#define NG_NAMEHASH(NAME, HASH) \
do { \
u_char h = 0; \
const u_char *c; \
for (c = (const u_char*)(NAME); *c; c++)\
h += *c; \
(HASH) = h % (NG_NAME_HASH_SIZE); \
} while (0)
/* Internal functions */
static int ng_add_hook(node_p node, const char *name, hook_p * hookp);
static int ng_generic_msg(node_p here, item_p item, hook_p lasthook);
static ng_ID_t ng_decodeidname(const char *name);
static int ngb_mod_event(module_t mod, int event, void *data);
static void ng_worklist_add(node_p node);
static void ngintr(void);
static int ng_apply_item(node_p node, item_p item, int rw);
static void ng_flush_input_queue(node_p node);
static node_p ng_ID2noderef(ng_ID_t ID);
static int ng_con_nodes(item_p item, node_p node, const char *name,
node_p node2, const char *name2);
static int ng_con_part2(node_p node, item_p item, hook_p hook);
static int ng_con_part3(node_p node, item_p item, hook_p hook);
static int ng_mkpeer(node_p node, const char *name,
const char *name2, char *type);
/* Imported, these used to be externally visible, some may go back. */
void ng_destroy_hook(hook_p hook);
node_p ng_name2noderef(node_p node, const char *name);
int ng_path2noderef(node_p here, const char *path,
node_p *dest, hook_p *lasthook);
int ng_make_node(const char *type, node_p *nodepp);
int ng_path_parse(char *addr, char **node, char **path, char **hook);
void ng_rmnode(node_p node, hook_p dummy1, void *dummy2, int dummy3);
void ng_unname(node_p node);
/* Our own netgraph malloc type */
MALLOC_DEFINE(M_NETGRAPH, "netgraph", "netgraph structures and ctrl messages");
MALLOC_DEFINE(M_NETGRAPH_HOOK, "netgraph_hook", "netgraph hook structures");
MALLOC_DEFINE(M_NETGRAPH_NODE, "netgraph_node", "netgraph node structures");
MALLOC_DEFINE(M_NETGRAPH_ITEM, "netgraph_item", "netgraph item structures");
MALLOC_DEFINE(M_NETGRAPH_MSG, "netgraph_msg", "netgraph name storage");
/* Should not be visible outside this file */
#define _NG_ALLOC_HOOK(hook) \
MALLOC(hook, hook_p, sizeof(*hook), M_NETGRAPH_HOOK, M_NOWAIT | M_ZERO)
#define _NG_ALLOC_NODE(node) \
MALLOC(node, node_p, sizeof(*node), M_NETGRAPH_NODE, M_NOWAIT | M_ZERO)
#define NG_QUEUE_LOCK_INIT(n) \
mtx_init(&(n)->q_mtx, "ng_node", NULL, MTX_DEF)
#define NG_QUEUE_LOCK(n) \
mtx_lock(&(n)->q_mtx)
#define NG_QUEUE_UNLOCK(n) \
mtx_unlock(&(n)->q_mtx)
#define NG_WORKLIST_LOCK_INIT() \
mtx_init(&ng_worklist_mtx, "ng_worklist", NULL, MTX_DEF)
#define NG_WORKLIST_LOCK() \
mtx_lock(&ng_worklist_mtx)
#define NG_WORKLIST_UNLOCK() \
mtx_unlock(&ng_worklist_mtx)
#ifdef NETGRAPH_DEBUG /*----------------------------------------------*/
/*
* In debug mode:
* In an attempt to help track reference count screwups
* we do not free objects back to the malloc system, but keep them
* in a local cache where we can examine them and keep information safely
* after they have been freed.
* We use this scheme for nodes and hooks, and to some extent for items.
*/
static __inline hook_p
ng_alloc_hook(void)
{
hook_p hook;
SLIST_ENTRY(ng_hook) temp;
mtx_lock(&ng_nodelist_mtx);
hook = LIST_FIRST(&ng_freehooks);
if (hook) {
LIST_REMOVE(hook, hk_hooks);
bcopy(&hook->hk_all, &temp, sizeof(temp));
bzero(hook, sizeof(struct ng_hook));
bcopy(&temp, &hook->hk_all, sizeof(temp));
mtx_unlock(&ng_nodelist_mtx);
hook->hk_magic = HK_MAGIC;
} else {
mtx_unlock(&ng_nodelist_mtx);
_NG_ALLOC_HOOK(hook);
if (hook) {
hook->hk_magic = HK_MAGIC;
mtx_lock(&ng_nodelist_mtx);
SLIST_INSERT_HEAD(&ng_allhooks, hook, hk_all);
mtx_unlock(&ng_nodelist_mtx);
}
}
return (hook);
}
static __inline node_p
ng_alloc_node(void)
{
node_p node;
SLIST_ENTRY(ng_node) temp;
mtx_lock(&ng_nodelist_mtx);
node = LIST_FIRST(&ng_freenodes);
if (node) {
LIST_REMOVE(node, nd_nodes);
bcopy(&node->nd_all, &temp, sizeof(temp));
bzero(node, sizeof(struct ng_node));
bcopy(&temp, &node->nd_all, sizeof(temp));
mtx_unlock(&ng_nodelist_mtx);
node->nd_magic = ND_MAGIC;
} else {
mtx_unlock(&ng_nodelist_mtx);
_NG_ALLOC_NODE(node);
if (node) {
node->nd_magic = ND_MAGIC;
mtx_lock(&ng_nodelist_mtx);
SLIST_INSERT_HEAD(&ng_allnodes, node, nd_all);
mtx_unlock(&ng_nodelist_mtx);
}
}
return (node);
}
#define NG_ALLOC_HOOK(hook) do { (hook) = ng_alloc_hook(); } while (0)
#define NG_ALLOC_NODE(node) do { (node) = ng_alloc_node(); } while (0)
#define NG_FREE_HOOK(hook) \
do { \
mtx_lock(&ng_nodelist_mtx); \
LIST_INSERT_HEAD(&ng_freehooks, hook, hk_hooks); \
hook->hk_magic = 0; \
mtx_unlock(&ng_nodelist_mtx); \
} while (0)
#define NG_FREE_NODE(node) \
do { \
mtx_lock(&ng_nodelist_mtx); \
LIST_INSERT_HEAD(&ng_freenodes, node, nd_nodes); \
node->nd_magic = 0; \
mtx_unlock(&ng_nodelist_mtx); \
} while (0)
#else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
#define NG_ALLOC_HOOK(hook) _NG_ALLOC_HOOK(hook)
#define NG_ALLOC_NODE(node) _NG_ALLOC_NODE(node)
#define NG_FREE_HOOK(hook) do { FREE((hook), M_NETGRAPH_HOOK); } while (0)
#define NG_FREE_NODE(node) do { FREE((node), M_NETGRAPH_NODE); } while (0)
#endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/
/* Set this to kdb_enter("X") to catch all errors as they occur */
#ifndef TRAP_ERROR
#define TRAP_ERROR()
#endif
static ng_ID_t nextID = 1;
#ifdef INVARIANTS
#define CHECK_DATA_MBUF(m) do { \
struct mbuf *n; \
int total; \
\
M_ASSERTPKTHDR(m); \
for (total = 0, n = (m); n != NULL; n = n->m_next) { \
total += n->m_len; \
if (n->m_nextpkt != NULL) \
panic("%s: m_nextpkt", __func__); \
} \
\
if ((m)->m_pkthdr.len != total) { \
panic("%s: %d != %d", \
__func__, (m)->m_pkthdr.len, total); \
} \
} while (0)
#else
#define CHECK_DATA_MBUF(m)
#endif
#define ERROUT(x) do { error = (x); goto done; } while (0)
/************************************************************************
Parse type definitions for generic messages
************************************************************************/
/* Handy structure parse type defining macro */
#define DEFINE_PARSE_STRUCT_TYPE(lo, up, args) \
static const struct ng_parse_struct_field \
ng_ ## lo ## _type_fields[] = NG_GENERIC_ ## up ## _INFO args; \
static const struct ng_parse_type ng_generic_ ## lo ## _type = { \
&ng_parse_struct_type, \
&ng_ ## lo ## _type_fields \
}
DEFINE_PARSE_STRUCT_TYPE(mkpeer, MKPEER, ());
DEFINE_PARSE_STRUCT_TYPE(connect, CONNECT, ());
DEFINE_PARSE_STRUCT_TYPE(name, NAME, ());
DEFINE_PARSE_STRUCT_TYPE(rmhook, RMHOOK, ());
DEFINE_PARSE_STRUCT_TYPE(nodeinfo, NODEINFO, ());
DEFINE_PARSE_STRUCT_TYPE(typeinfo, TYPEINFO, ());
DEFINE_PARSE_STRUCT_TYPE(linkinfo, LINKINFO, (&ng_generic_nodeinfo_type));
/* Get length of an array when the length is stored as a 32 bit
value immediately preceding the array -- as with struct namelist
and struct typelist. */
static int
ng_generic_list_getLength(const struct ng_parse_type *type,
const u_char *start, const u_char *buf)
{
return *((const u_int32_t *)(buf - 4));
}
/* Get length of the array of struct linkinfo inside a struct hooklist */
static int
ng_generic_linkinfo_getLength(const struct ng_parse_type *type,
const u_char *start, const u_char *buf)
{
const struct hooklist *hl = (const struct hooklist *)start;
return hl->nodeinfo.hooks;
}
/* Array type for a variable length array of struct namelist */
static const struct ng_parse_array_info ng_nodeinfoarray_type_info = {
&ng_generic_nodeinfo_type,
&ng_generic_list_getLength
};
static const struct ng_parse_type ng_generic_nodeinfoarray_type = {
&ng_parse_array_type,
&ng_nodeinfoarray_type_info
};
/* Array type for a variable length array of struct typelist */
static const struct ng_parse_array_info ng_typeinfoarray_type_info = {
&ng_generic_typeinfo_type,
&ng_generic_list_getLength
};
static const struct ng_parse_type ng_generic_typeinfoarray_type = {
&ng_parse_array_type,
&ng_typeinfoarray_type_info
};
/* Array type for array of struct linkinfo in struct hooklist */
static const struct ng_parse_array_info ng_generic_linkinfo_array_type_info = {
&ng_generic_linkinfo_type,
&ng_generic_linkinfo_getLength
};
static const struct ng_parse_type ng_generic_linkinfo_array_type = {
&ng_parse_array_type,
&ng_generic_linkinfo_array_type_info
};
DEFINE_PARSE_STRUCT_TYPE(typelist, TYPELIST, (&ng_generic_nodeinfoarray_type));
DEFINE_PARSE_STRUCT_TYPE(hooklist, HOOKLIST,
(&ng_generic_nodeinfo_type, &ng_generic_linkinfo_array_type));
DEFINE_PARSE_STRUCT_TYPE(listnodes, LISTNODES,
(&ng_generic_nodeinfoarray_type));
/* List of commands and how to convert arguments to/from ASCII */
static const struct ng_cmdlist ng_generic_cmds[] = {
{
NGM_GENERIC_COOKIE,
NGM_SHUTDOWN,
"shutdown",
NULL,
NULL
},
{
NGM_GENERIC_COOKIE,
NGM_MKPEER,
"mkpeer",
&ng_generic_mkpeer_type,
NULL
},
{
NGM_GENERIC_COOKIE,
NGM_CONNECT,
"connect",
&ng_generic_connect_type,
NULL
},
{
NGM_GENERIC_COOKIE,
NGM_NAME,
"name",
&ng_generic_name_type,
NULL
},
{
NGM_GENERIC_COOKIE,
NGM_RMHOOK,
"rmhook",
&ng_generic_rmhook_type,
NULL
},
{
NGM_GENERIC_COOKIE,
NGM_NODEINFO,
"nodeinfo",
NULL,
&ng_generic_nodeinfo_type
},
{
NGM_GENERIC_COOKIE,
NGM_LISTHOOKS,
"listhooks",
NULL,
&ng_generic_hooklist_type
},
{
NGM_GENERIC_COOKIE,
NGM_LISTNAMES,
"listnames",
NULL,
&ng_generic_listnodes_type /* same as NGM_LISTNODES */
},
{
NGM_GENERIC_COOKIE,
NGM_LISTNODES,
"listnodes",
NULL,
&ng_generic_listnodes_type
},
{
NGM_GENERIC_COOKIE,
NGM_LISTTYPES,
"listtypes",
NULL,
&ng_generic_typeinfo_type
},
{
NGM_GENERIC_COOKIE,
NGM_TEXT_CONFIG,
"textconfig",
NULL,
&ng_parse_string_type
},
{
NGM_GENERIC_COOKIE,
NGM_TEXT_STATUS,
"textstatus",
NULL,
&ng_parse_string_type
},
{
NGM_GENERIC_COOKIE,
NGM_ASCII2BINARY,
"ascii2binary",
&ng_parse_ng_mesg_type,
&ng_parse_ng_mesg_type
},
{
NGM_GENERIC_COOKIE,
NGM_BINARY2ASCII,
"binary2ascii",
&ng_parse_ng_mesg_type,
&ng_parse_ng_mesg_type
},
{ 0 }
};
/************************************************************************
Node routines
************************************************************************/
/*
* Instantiate a node of the requested type
*/
int
ng_make_node(const char *typename, node_p *nodepp)
{
struct ng_type *type;
int error;
/* Check that the type makes sense */
if (typename == NULL) {
TRAP_ERROR();
return (EINVAL);
}
/* Locate the node type. If we fail, we return. Do not try to load
* the module.
*/
if ((type = ng_findtype(typename)) == NULL)
return (ENXIO);
/*
* If we have a constructor, then make the node and
* call the constructor to do type specific initialisation.
*/
if (type->constructor != NULL) {
if ((error = ng_make_node_common(type, nodepp)) == 0) {
if ((error = ((*type->constructor)(*nodepp)) != 0)) {
NG_NODE_UNREF(*nodepp);
}
}
} else {
/*
* Node has no constructor. We cannot ask for one
* to be made. It must be brought into existence by
* some external agency. The external agency should
* call ng_make_node_common() directly to get the
* netgraph part initialised.
*/
TRAP_ERROR();
error = EINVAL;
}
return (error);
}
/*
* Generic node creation. Called by node initialisation for externally
* instantiated nodes (e.g. hardware, sockets, etc.).
* The returned node has a reference count of 1.
*/
int
ng_make_node_common(struct ng_type *type, node_p *nodepp)
{
+ INIT_VNET_NETGRAPH(curvnet);
node_p node;
/* Require the node type to have been already installed */
if (ng_findtype(type->name) == NULL) {
TRAP_ERROR();
return (EINVAL);
}
/* Make a node and try attach it to the type */
NG_ALLOC_NODE(node);
if (node == NULL) {
TRAP_ERROR();
return (ENOMEM);
}
node->nd_type = type;
NG_NODE_REF(node); /* note reference */
type->refs++;
NG_QUEUE_LOCK_INIT(&node->nd_input_queue);
STAILQ_INIT(&node->nd_input_queue.queue);
node->nd_input_queue.q_flags = 0;
/* Initialize hook list for new node */
LIST_INIT(&node->nd_hooks);
/* Link us into the name hash. */
mtx_lock(&ng_namehash_mtx);
LIST_INSERT_HEAD(&V_ng_name_hash[0], node, nd_nodes);
mtx_unlock(&ng_namehash_mtx);
/* get an ID and put us in the hash chain */
mtx_lock(&ng_idhash_mtx);
for (;;) { /* wrap protection, even if silly */
node_p node2 = NULL;
node->nd_ID = V_nextID++; /* 137/sec for 1 year before wrap */
/* Is there a problem with the new number? */
NG_IDHASH_FIND(node->nd_ID, node2); /* already taken? */
if ((node->nd_ID != 0) && (node2 == NULL)) {
break;
}
}
LIST_INSERT_HEAD(&V_ng_ID_hash[NG_IDHASH_FN(node->nd_ID)],
node, nd_idnodes);
mtx_unlock(&ng_idhash_mtx);
/* Done */
*nodepp = node;
return (0);
}
/*
* Forcibly start the shutdown process on a node. Either call
* its shutdown method, or do the default shutdown if there is
* no type-specific method.
*
* We can only be called from a shutdown message, so we know we have
* a writer lock, and therefore exclusive access. It also means
* that we should not be on the work queue, but we check anyhow.
*
* Persistent node types must have a type-specific method which either
* allocates a new node (in which case this one is irretrievably going away)
* or cleans up anything it needs to and just makes the node valid again
* (in which case we allow the node to survive).
*
* XXX We need to think of how to tell a persistent node that we
* REALLY need to go away because the hardware has gone or we
* are rebooting.... etc.
*/
void
ng_rmnode(node_p node, hook_p dummy1, void *dummy2, int dummy3)
{
hook_p hook;
/* Check if it's already shutting down */
if ((node->nd_flags & NGF_CLOSING) != 0)
return;
if (node == &ng_deadnode) {
printf ("shutdown called on deadnode\n");
return;
}
/* Add an extra reference so it doesn't go away during this */
NG_NODE_REF(node);
/*
* Mark it invalid so any newcomers know not to try to use it.
* Also add our own mark so we can't recurse;
* note that NGF_INVALID alone does not do this, as it's also set during
* creation.
*/
node->nd_flags |= NGF_INVALID|NGF_CLOSING;
/* If node has its pre-shutdown method, then call it first*/
if (node->nd_type && node->nd_type->close)
(*node->nd_type->close)(node);
/* Notify all remaining connected nodes to disconnect */
while ((hook = LIST_FIRST(&node->nd_hooks)) != NULL)
ng_destroy_hook(hook);
/*
* Drain the input queue forcibly.
* It has no hooks, so what's it going to do, bleed on someone?
* Theoretically we came here from a queue entry that was added
* just before the queue was closed, so it should be empty anyway.
* This also removes us from the worklist if needed.
*/
ng_flush_input_queue(node);
/* Ask the type if it has anything to do in this case */
if (node->nd_type && node->nd_type->shutdown) {
(*node->nd_type->shutdown)(node);
if (NG_NODE_IS_VALID(node)) {
/*
* Well, blow me down if the node code hasn't declared
* that it doesn't want to die.
* Presumably it is a persistent node.
* If we REALLY want it to go away,
* e.g. the hardware is going away,
* our caller should set NGF_REALLY_DIE in nd_flags.
*/
node->nd_flags &= ~(NGF_INVALID|NGF_CLOSING);
NG_NODE_UNREF(node); /* Assume they still have theirs */
return;
}
} else { /* do the default thing */
NG_NODE_UNREF(node);
}
ng_unname(node); /* basically a NOP these days */
/*
* Remove extra reference, possibly the last
* Possible other holders of references may include
* timeout callouts, but theoretically the node's supposed to
* have cancelled them. Possibly hardware dependencies may
* force a driver to 'linger' with a reference.
*/
NG_NODE_UNREF(node);
}
/*
* Remove a reference to the node, possibly the last.
* deadnode always acts as if it were the last.
*/
int
ng_unref_node(node_p node)
{
int v;
if (node == &ng_deadnode) {
return (0);
}
v = atomic_fetchadd_int(&node->nd_refs, -1);
if (v == 1) { /* we were the last */
mtx_lock(&ng_namehash_mtx);
node->nd_type->refs--; /* XXX maybe should get types lock? */
LIST_REMOVE(node, nd_nodes);
mtx_unlock(&ng_namehash_mtx);
mtx_lock(&ng_idhash_mtx);
LIST_REMOVE(node, nd_idnodes);
mtx_unlock(&ng_idhash_mtx);
mtx_destroy(&node->nd_input_queue.q_mtx);
NG_FREE_NODE(node);
}
return (v - 1);
}
/************************************************************************
Node ID handling
************************************************************************/
static node_p
ng_ID2noderef(ng_ID_t ID)
{
+ INIT_VNET_NETGRAPH(curvnet);
node_p node;
mtx_lock(&ng_idhash_mtx);
NG_IDHASH_FIND(ID, node);
if(node)
NG_NODE_REF(node);
mtx_unlock(&ng_idhash_mtx);
return(node);
}
ng_ID_t
ng_node2ID(node_p node)
{
return (node ? NG_NODE_ID(node) : 0);
}
/************************************************************************
Node name handling
************************************************************************/
/*
* Assign a node a name. Once assigned, the name cannot be changed.
*/
int
ng_name_node(node_p node, const char *name)
{
+ INIT_VNET_NETGRAPH(curvnet);
int i, hash;
node_p node2;
/* Check the name is valid */
for (i = 0; i < NG_NODESIZ; i++) {
if (name[i] == '\0' || name[i] == '.' || name[i] == ':')
break;
}
if (i == 0 || name[i] != '\0') {
TRAP_ERROR();
return (EINVAL);
}
if (ng_decodeidname(name) != 0) { /* valid IDs not allowed here */
TRAP_ERROR();
return (EINVAL);
}
/* Check the name isn't already being used */
if ((node2 = ng_name2noderef(node, name)) != NULL) {
NG_NODE_UNREF(node2);
TRAP_ERROR();
return (EADDRINUSE);
}
/* copy it */
strlcpy(NG_NODE_NAME(node), name, NG_NODESIZ);
/* Update name hash. */
NG_NAMEHASH(name, hash);
mtx_lock(&ng_namehash_mtx);
LIST_REMOVE(node, nd_nodes);
LIST_INSERT_HEAD(&V_ng_name_hash[hash], node, nd_nodes);
mtx_unlock(&ng_namehash_mtx);
return (0);
}
/*
* Find a node by absolute name. The name should NOT end with ':'
* The name "." means "this node" and "[xxx]" means "the node
* with ID (ie, at address) xxx".
*
* Returns the node if found, else NULL.
* Eventually should add something faster than a sequential search.
* Note it acquires a reference on the node so you can be sure it's still
* there.
*/
node_p
ng_name2noderef(node_p here, const char *name)
{
+ INIT_VNET_NETGRAPH(curvnet);
node_p node;
ng_ID_t temp;
int hash;
/* "." means "this node" */
if (strcmp(name, ".") == 0) {
NG_NODE_REF(here);
return(here);
}
/* Check for name-by-ID */
if ((temp = ng_decodeidname(name)) != 0) {
return (ng_ID2noderef(temp));
}
/* Find node by name */
NG_NAMEHASH(name, hash);
mtx_lock(&ng_namehash_mtx);
LIST_FOREACH(node, &V_ng_name_hash[hash], nd_nodes) {
if (NG_NODE_IS_VALID(node) &&
(strcmp(NG_NODE_NAME(node), name) == 0)) {
break;
}
}
if (node)
NG_NODE_REF(node);
mtx_unlock(&ng_namehash_mtx);
return (node);
}
/*
* Decode an ID name, eg. "[f03034de]". Returns 0 if the
* string is not valid, otherwise returns the value.
*/
static ng_ID_t
ng_decodeidname(const char *name)
{
const int len = strlen(name);
char *eptr;
u_long val;
/* Check for proper length, brackets, no leading junk */
if ((len < 3)
|| (name[0] != '[')
|| (name[len - 1] != ']')
|| (!isxdigit(name[1]))) {
return ((ng_ID_t)0);
}
/* Decode number */
val = strtoul(name + 1, &eptr, 16);
if ((eptr - name != len - 1)
|| (val == ULONG_MAX)
|| (val == 0)) {
return ((ng_ID_t)0);
}
return (ng_ID_t)val;
}
/*
* Remove a name from a node. This should only be called
* when shutting down and removing the node.
* If we ever allow name changing, this may be resurrected.
*/
void
ng_unname(node_p node)
{
}
/************************************************************************
Hook routines
Names are not optional. Hooks are always connected, except for a
brief moment within these routines. On invalidation or during creation
they are connected to the 'dead' hook.
************************************************************************/
/*
* Remove a hook reference
*/
void
ng_unref_hook(hook_p hook)
{
int v;
if (hook == &ng_deadhook) {
return;
}
v = atomic_fetchadd_int(&hook->hk_refs, -1);
if (v == 1) { /* we were the last */
if (_NG_HOOK_NODE(hook)) /* it'll probably be ng_deadnode */
_NG_NODE_UNREF((_NG_HOOK_NODE(hook)));
NG_FREE_HOOK(hook);
}
}
/*
* Add an unconnected hook to a node. Only used internally.
* Assumes the node is locked. (XXX not yet true)
*/
static int
ng_add_hook(node_p node, const char *name, hook_p *hookp)
{
hook_p hook;
int error = 0;
/* Check that the given name is good */
if (name == NULL) {
TRAP_ERROR();
return (EINVAL);
}
if (ng_findhook(node, name) != NULL) {
TRAP_ERROR();
return (EEXIST);
}
/* Allocate the hook and link it up */
NG_ALLOC_HOOK(hook);
if (hook == NULL) {
TRAP_ERROR();
return (ENOMEM);
}
hook->hk_refs = 1; /* add a reference for us to return */
hook->hk_flags = HK_INVALID;
hook->hk_peer = &ng_deadhook; /* start off this way */
hook->hk_node = node;
NG_NODE_REF(node); /* each hook counts as a reference */
/* Set hook name */
strlcpy(NG_HOOK_NAME(hook), name, NG_HOOKSIZ);
/*
* Check if the node type code has something to say about it.
* If it fails, the unref of the hook will also unref the node.
*/
if (node->nd_type->newhook != NULL) {
if ((error = (*node->nd_type->newhook)(node, hook, name))) {
NG_HOOK_UNREF(hook); /* this frees the hook */
return (error);
}
}
/*
* The 'type' agrees so far, so go ahead and link it in.
* We'll ask again later when we actually connect the hooks.
*/
LIST_INSERT_HEAD(&node->nd_hooks, hook, hk_hooks);
node->nd_numhooks++;
NG_HOOK_REF(hook); /* one for the node */
if (hookp)
*hookp = hook;
return (0);
}
/*
* Find a hook
*
* Node types may supply their own optimized routines for finding
* hooks. If none is supplied, we just do a linear search.
* XXX Possibly we should add a reference to the hook?
*/
hook_p
ng_findhook(node_p node, const char *name)
{
hook_p hook;
if (node->nd_type->findhook != NULL)
return (*node->nd_type->findhook)(node, name);
LIST_FOREACH(hook, &node->nd_hooks, hk_hooks) {
if (NG_HOOK_IS_VALID(hook)
&& (strcmp(NG_HOOK_NAME(hook), name) == 0))
return (hook);
}
return (NULL);
}
/*
* Destroy a hook
*
* As hooks are always attached, this really destroys two hooks.
* The one given, and the one attached to it. Disconnect the hooks
* from each other first. We reconnect the peer hook to the 'dead'
* hook so that it can still exist after we depart. We then
* send the peer its own destroy message. This ensures that we only
* interact with the peer's structures when it is locked processing that
* message. We hold a reference to the peer hook so we are guaranteed that
* the peer hook and node are still going to exist until
* we are finished there as the hook holds a ref on the node.
* We run this same code again on the peer hook, but that time it is already
* attached to the 'dead' hook.
*
* This routine is called at all stages of hook creation
* on error detection and must be able to handle any such stage.
*/
void
ng_destroy_hook(hook_p hook)
{
hook_p peer;
node_p node;
if (hook == &ng_deadhook) { /* better safe than sorry */
printf("ng_destroy_hook called on deadhook\n");
return;
}
/*
* Protect divorce process with mutex, to avoid races on
* simultaneous disconnect.
*/
mtx_lock(&ng_topo_mtx);
hook->hk_flags |= HK_INVALID;
peer = NG_HOOK_PEER(hook);
node = NG_HOOK_NODE(hook);
if (peer && (peer != &ng_deadhook)) {
/*
* Set the peer to point to ng_deadhook;
* from this moment on we are effectively independent of it.
* Send it an rmhook message of its own.
*/
peer->hk_peer = &ng_deadhook; /* They no longer know us */
hook->hk_peer = &ng_deadhook; /* Nor us, them */
if (NG_HOOK_NODE(peer) == &ng_deadnode) {
/*
* If it's already divorced from a node,
* just free it.
*/
mtx_unlock(&ng_topo_mtx);
} else {
mtx_unlock(&ng_topo_mtx);
ng_rmhook_self(peer); /* Send it a surprise */
}
NG_HOOK_UNREF(peer); /* account for peer link */
NG_HOOK_UNREF(hook); /* account for peer link */
} else
mtx_unlock(&ng_topo_mtx);
mtx_assert(&ng_topo_mtx, MA_NOTOWNED);
/*
* Remove the hook from the node's list to avoid possible recursion
* in case the disconnection results in node shutdown.
*/
if (node == &ng_deadnode) { /* happens if called from ng_con_nodes() */
return;
}
LIST_REMOVE(hook, hk_hooks);
node->nd_numhooks--;
if (node->nd_type->disconnect) {
/*
* The type handler may elect to destroy the node, so don't
* trust its existence after this point (except that we still
* hold a reference on it, which we inherited from the hook
* we are destroying).
*/
(*node->nd_type->disconnect) (hook);
}
/*
* Note that because we will point to ng_deadnode, the original node
* is not decremented automatically so we do that manually.
*/
_NG_HOOK_NODE(hook) = &ng_deadnode;
NG_NODE_UNREF(node); /* We no longer point to it so adjust count */
NG_HOOK_UNREF(hook); /* Account for linkage (in list) to node */
}
/*
* Take two hooks on a node and merge the connection so that the given node
* is effectively bypassed.
*/
int
ng_bypass(hook_p hook1, hook_p hook2)
{
if (hook1->hk_node != hook2->hk_node) {
TRAP_ERROR();
return (EINVAL);
}
hook1->hk_peer->hk_peer = hook2->hk_peer;
hook2->hk_peer->hk_peer = hook1->hk_peer;
hook1->hk_peer = &ng_deadhook;
hook2->hk_peer = &ng_deadhook;
NG_HOOK_UNREF(hook1);
NG_HOOK_UNREF(hook2);
/* XXX If we ever cache methods on hooks update them as well */
ng_destroy_hook(hook1);
ng_destroy_hook(hook2);
return (0);
}
/*
* Install a new netgraph type
*/
int
ng_newtype(struct ng_type *tp)
{
const size_t namelen = strlen(tp->name);
/* Check version and type name fields */
if ((tp->version != NG_ABI_VERSION)
|| (namelen == 0)
|| (namelen >= NG_TYPESIZ)) {
TRAP_ERROR();
if (tp->version != NG_ABI_VERSION) {
printf("Netgraph: Node type rejected. ABI mismatch. Suggest recompile\n");
}
return (EINVAL);
}
/* Check for name collision */
if (ng_findtype(tp->name) != NULL) {
TRAP_ERROR();
return (EEXIST);
}
/* Link in new type */
mtx_lock(&ng_typelist_mtx);
LIST_INSERT_HEAD(&ng_typelist, tp, types);
tp->refs = 1; /* first ref is linked list */
mtx_unlock(&ng_typelist_mtx);
return (0);
}
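/*
* Illustrative sketch of how a node type is typically registered: the
* module declares a filled-in struct ng_type and passes it through
* NETGRAPH_INIT(), which routes module events to ng_mod_event() and thus
* to ng_newtype() at load time. The "example" type and its methods are
* hypothetical:
*
*	static struct ng_type ng_example_typestruct = {
*		.version =	NG_ABI_VERSION,
*		.name =		"example",
*		.constructor =	ng_example_constructor,
*		.rcvmsg =	ng_example_rcvmsg,
*		.shutdown =	ng_example_shutdown,
*		.rcvdata =	ng_example_rcvdata,
*	};
*	NETGRAPH_INIT(example, &ng_example_typestruct);
*/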
/*
* Unlink a netgraph type, provided no instances of it exist.
*/
int
ng_rmtype(struct ng_type *tp)
{
/* Check for outstanding references (i.e. existing instances) */
if (tp->refs != 1) {
TRAP_ERROR();
return (EBUSY);
}
/* Unlink type */
mtx_lock(&ng_typelist_mtx);
LIST_REMOVE(tp, types);
mtx_unlock(&ng_typelist_mtx);
return (0);
}
/*
* Look for a type of the name given
*/
struct ng_type *
ng_findtype(const char *typename)
{
struct ng_type *type;
mtx_lock(&ng_typelist_mtx);
LIST_FOREACH(type, &ng_typelist, types) {
if (strcmp(type->name, typename) == 0)
break;
}
mtx_unlock(&ng_typelist_mtx);
return (type);
}
/************************************************************************
Composite routines
************************************************************************/
/*
* Connect two nodes using the specified hooks, using queued functions.
*/
static int
ng_con_part3(node_p node, item_p item, hook_p hook)
{
int error = 0;
/*
* When we run, we know that the node 'node' is locked for us.
* Our caller has a reference on the hook.
* Our caller has a reference on the node.
* (In this case our caller is ng_apply_item() ).
* The peer hook has a reference on the hook.
* We are all set up except for the final call to the node, and
* the clearing of the INVALID flag.
*/
if (NG_HOOK_NODE(hook) == &ng_deadnode) {
/*
* The node must have been freed again since we last visited
* here. ng_destroy_hook() has this effect but nothing else does.
* We should just release our references and
* free anything we can think of.
* Since we know it's been destroyed, and it's our caller
* that holds the references, just return.
*/
ERROUT(ENOENT);
}
if (hook->hk_node->nd_type->connect) {
if ((error = (*hook->hk_node->nd_type->connect) (hook))) {
ng_destroy_hook(hook); /* also zaps peer */
printf("failed in ng_con_part3()\n");
ERROUT(error);
}
}
/*
* XXX this is wrong for SMP. Possibly we need
* to separate out 'create' and 'invalid' flags.
* We should only set flags on hooks we have locked under our node.
*/
hook->hk_flags &= ~HK_INVALID;
done:
NG_FREE_ITEM(item);
return (error);
}
static int
ng_con_part2(node_p node, item_p item, hook_p hook)
{
hook_p peer;
int error = 0;
/*
* When we run, we know that the node 'node' is locked for us.
* Our caller has a reference on the hook.
* Our caller has a reference on the node.
* (In this case our caller is ng_apply_item() ).
* The peer hook has a reference on the hook.
* Our node pointer points to the 'dead' node.
* First check that the hook name is unique.
* A collision should not happen, because we checked before queueing this.
*/
if (ng_findhook(node, NG_HOOK_NAME(hook)) != NULL) {
TRAP_ERROR();
ng_destroy_hook(hook); /* should destroy peer too */
printf("failed in ng_con_part2()\n");
ERROUT(EEXIST);
}
/*
* Check if the node type code has something to say about it.
* If it fails, the unref of the hook will also unref the attached node;
* however, since that node is 'ng_deadnode' this will do nothing.
* The peer hook will also be destroyed.
*/
if (node->nd_type->newhook != NULL) {
if ((error = (*node->nd_type->newhook)(node, hook,
hook->hk_name))) {
ng_destroy_hook(hook); /* should destroy peer too */
printf("failed in ng_con_part2()\n");
ERROUT(error);
}
}
/*
* The 'type' agrees so far, so go ahead and link it in.
* We'll ask again later when we actually connect the hooks.
*/
hook->hk_node = node; /* just overwrite ng_deadnode */
NG_NODE_REF(node); /* each hook counts as a reference */
LIST_INSERT_HEAD(&node->nd_hooks, hook, hk_hooks);
node->nd_numhooks++;
NG_HOOK_REF(hook); /* one for the node */
/*
* We now have a symmetrical situation, where both hooks have been
* linked to their nodes, the newhook methods have been called,
* and the references are all correct. The hooks are still marked
* as invalid, as we have not called the 'connect' methods
* yet.
* We can call the local one immediately as we have the
* node locked, but we need to queue the remote one.
*/
if (hook->hk_node->nd_type->connect) {
if ((error = (*hook->hk_node->nd_type->connect) (hook))) {
ng_destroy_hook(hook); /* also zaps peer */
printf("failed in ng_con_part2(A)\n");
ERROUT(error);
}
}
/*
* Acquire topo mutex to avoid race with ng_destroy_hook().
*/
mtx_lock(&ng_topo_mtx);
peer = hook->hk_peer;
if (peer == &ng_deadhook) {
mtx_unlock(&ng_topo_mtx);
printf("failed in ng_con_part2(B)\n");
ng_destroy_hook(hook);
ERROUT(ENOENT);
}
mtx_unlock(&ng_topo_mtx);
if ((error = ng_send_fn2(peer->hk_node, peer, item, &ng_con_part3,
NULL, 0, NG_REUSE_ITEM))) {
printf("failed in ng_con_part2(C)\n");
ng_destroy_hook(hook); /* also zaps peer */
return (error); /* item was consumed. */
}
hook->hk_flags &= ~HK_INVALID; /* need both to be able to work */
return (0); /* item was consumed. */
done:
NG_FREE_ITEM(item);
return (error);
}
/*
* Connect this node with another node. We assume that this node is
* currently locked, as we are only called from an NGM_CONNECT message.
*/
static int
ng_con_nodes(item_p item, node_p node, const char *name,
node_p node2, const char *name2)
{
int error;
hook_p hook;
hook_p hook2;
if (ng_findhook(node2, name2) != NULL) {
return(EEXIST);
}
if ((error = ng_add_hook(node, name, &hook))) /* gives us a ref */
return (error);
/* Allocate the other hook and link it up */
NG_ALLOC_HOOK(hook2);
if (hook2 == NULL) {
TRAP_ERROR();
ng_destroy_hook(hook); /* XXX check ref counts so far */
NG_HOOK_UNREF(hook); /* including our ref */
return (ENOMEM);
}
hook2->hk_refs = 1; /* start with a reference for us. */
hook2->hk_flags = HK_INVALID;
hook2->hk_peer = hook; /* Link the two together */
hook->hk_peer = hook2;
NG_HOOK_REF(hook); /* Add a ref for the peer to each*/
NG_HOOK_REF(hook2);
hook2->hk_node = &ng_deadnode;
strlcpy(NG_HOOK_NAME(hook2), name2, NG_HOOKSIZ);
/*
* Queue the function above.
* Processing continues in that function in the lock context of
* the other node.
*/
if ((error = ng_send_fn2(node2, hook2, item, &ng_con_part2, NULL, 0,
NG_NOFLAGS))) {
printf("failed in ng_con_nodes(): %d\n", error);
ng_destroy_hook(hook); /* also zaps peer */
}
NG_HOOK_UNREF(hook); /* Let each hook go if it wants to */
NG_HOOK_UNREF(hook2);
return (error);
}
/*
* Make a peer and connect.
* We assume that the local node is locked.
* The new node probably doesn't need a lock until
* it has a hook, because it cannot really have any work until then,
* but we should think about it a bit more.
*
* The problem may come if the other node also fires up
* some hardware or a timer or some other source of activation;
* also it may already get a command msg via its ID.
*
* We could use the same method as ng_con_nodes() but we'd have
* to add ability to remove the node when failing. (Not hard, just
* make arg1 point to the node to remove).
* Unless of course we just ignore failure to connect and leave
* an unconnected node?
*/
static int
ng_mkpeer(node_p node, const char *name, const char *name2, char *type)
{
node_p node2;
hook_p hook1, hook2;
int error;
if ((error = ng_make_node(type, &node2))) {
return (error);
}
if ((error = ng_add_hook(node, name, &hook1))) { /* gives us a ref */
ng_rmnode(node2, NULL, NULL, 0);
return (error);
}
if ((error = ng_add_hook(node2, name2, &hook2))) {
ng_rmnode(node2, NULL, NULL, 0);
ng_destroy_hook(hook1);
NG_HOOK_UNREF(hook1);
return (error);
}
/*
* Actually link the two hooks together.
*/
hook1->hk_peer = hook2;
hook2->hk_peer = hook1;
/* Each hook is referenced by the other */
NG_HOOK_REF(hook1);
NG_HOOK_REF(hook2);
/* Give each node the opportunity to veto the pending connection */
if (hook1->hk_node->nd_type->connect) {
error = (*hook1->hk_node->nd_type->connect) (hook1);
}
if ((error == 0) && hook2->hk_node->nd_type->connect) {
error = (*hook2->hk_node->nd_type->connect) (hook2);
}
/*
* drop the references we were holding on the two hooks.
*/
if (error) {
ng_destroy_hook(hook2); /* also zaps hook1 */
ng_rmnode(node2, NULL, NULL, 0);
} else {
/* As a last act, allow the hooks to be used */
hook1->hk_flags &= ~HK_INVALID;
hook2->hk_flags &= ~HK_INVALID;
}
NG_HOOK_UNREF(hook1);
NG_HOOK_UNREF(hook2);
return (error);
}
/************************************************************************
Utility routines to send self messages
************************************************************************/
/* Shut this node down as soon as everyone is clear of it */
/* Should add arg "immediately" to jump the queue */
int
ng_rmnode_flags(node_p node, int flags)
{
int error;
if (node == &ng_deadnode)
return (0);
node->nd_flags |= NGF_INVALID;
if (node->nd_flags & NGF_CLOSING)
return (0);
error = ng_send_fn1(node, NULL, &ng_rmnode, NULL, 0, flags);
return (error);
}
int
ng_rmnode_self(node_p node)
{
return (ng_rmnode_flags(node, NG_NOFLAGS));
}
static void
ng_rmhook_part2(node_p node, hook_p hook, void *arg1, int arg2)
{
ng_destroy_hook(hook);
return;
}
int
ng_rmhook_self(hook_p hook)
{
int error;
node_p node = NG_HOOK_NODE(hook);
if (node == &ng_deadnode)
return (0);
error = ng_send_fn(node, hook, &ng_rmhook_part2, NULL, 0);
return (error);
}
/***********************************************************************
* Parse and verify a string of the form: <NODE:><PATH>
*
* Such a string can refer to a specific node or a specific hook
* on a specific node, depending on how you look at it. In the
* latter case, the PATH component must not end in a dot.
*
* Both <NODE:> and <PATH> are optional. The <PATH> is a string
* of hook names separated by dots. This breaks out the original
* string, setting *nodep to "NODE" (or NULL if none) and *pathp
* to "PATH" (or NULL if degenerate). Also, *hookp will point to
* the final hook component of <PATH>, if any, otherwise NULL.
*
* This returns -1 if the path is malformed. The char ** are optional.
***********************************************************************/
int
ng_path_parse(char *addr, char **nodep, char **pathp, char **hookp)
{
char *node, *path, *hook;
int k;
/*
* Extract absolute NODE, if any
*/
for (path = addr; *path && *path != ':'; path++);
if (*path) {
node = addr; /* Here's the NODE */
*path++ = '\0'; /* Here's the PATH */
/* Node name must not be empty */
if (!*node)
return -1;
/* A name of "." is OK; otherwise '.' not allowed */
if (strcmp(node, ".") != 0) {
for (k = 0; node[k]; k++)
if (node[k] == '.')
return -1;
}
} else {
node = NULL; /* No absolute NODE */
path = addr; /* Here's the PATH */
}
/* Snoop for illegal characters in PATH */
for (k = 0; path[k]; k++)
if (path[k] == ':')
return -1;
/* Check for no repeated dots in PATH */
for (k = 0; path[k]; k++)
if (path[k] == '.' && path[k + 1] == '.')
return -1;
/* Remove extra (degenerate) dots from beginning or end of PATH */
if (path[0] == '.')
path++;
if (*path && path[strlen(path) - 1] == '.')
path[strlen(path) - 1] = 0;
/* If PATH has a dot, then we're not talking about a hook */
if (*path) {
for (hook = path, k = 0; path[k]; k++)
if (path[k] == '.') {
hook = NULL;
break;
}
} else
path = hook = NULL;
/* Done */
if (nodep)
*nodep = node;
if (pathp)
*pathp = path;
if (hookp)
*hookp = hook;
return (0);
}
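/*
* Worked examples of the parse above (illustrative, following the rules
* in the comment block):
*
*	"foo:hook1.hook2" -> node "foo", path "hook1.hook2", hook NULL
*	"foo:myhook"      -> node "foo", path "myhook",      hook "myhook"
*	"foo:"            -> node "foo", path NULL,          hook NULL
*	".:myhook"        -> node ".",   path "myhook",      hook "myhook"
*	"foo.bar:baz"     -> returns -1 ('.' not allowed in NODE)
*/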
/*
* Given a path, which may be absolute or relative, and a starting node,
* return the destination node.
*/
int
ng_path2noderef(node_p here, const char *address,
node_p *destp, hook_p *lasthook)
{
char fullpath[NG_PATHSIZ];
char *nodename, *path, pbuf[2];
node_p node, oldnode;
char *cp;
hook_p hook = NULL;
/* Initialize */
if (destp == NULL) {
TRAP_ERROR();
return EINVAL;
}
*destp = NULL;
/* Make a writable copy of address for ng_path_parse() */
strncpy(fullpath, address, sizeof(fullpath) - 1);
fullpath[sizeof(fullpath) - 1] = '\0';
/* Parse out node and sequence of hooks */
if (ng_path_parse(fullpath, &nodename, &path, NULL) < 0) {
TRAP_ERROR();
return EINVAL;
}
if (path == NULL) {
pbuf[0] = '.'; /* Needs to be writable */
pbuf[1] = '\0';
path = pbuf;
}
/*
* For an absolute address, jump to the starting node.
* Note that this holds a reference on the node for us.
* Don't forget to drop the reference if we don't need it.
*/
if (nodename) {
node = ng_name2noderef(here, nodename);
if (node == NULL) {
TRAP_ERROR();
return (ENOENT);
}
} else {
if (here == NULL) {
TRAP_ERROR();
return (EINVAL);
}
node = here;
NG_NODE_REF(node);
}
/*
* Now follow the sequence of hooks.
* XXX
* We cannot actually guarantee that the sequence
* is not being demolished as we crawl along it
* without extraordinary locking etc.,
* so this is a bit dodgy to say the least.
* We could probably hold up some things by holding
* the nodelist mutex for the duration of this
* crawl if we wanted. At least that way we wouldn't have to
* worry about the nodes disappearing, but the hooks would still
* be a problem.
*/
for (cp = path; node != NULL && *cp != '\0'; ) {
char *segment;
/*
* Break out the next path segment. Replace the dot we just
* found with a NUL; "cp" points to the next segment (or the
* NUL at the end).
*/
for (segment = cp; *cp != '\0'; cp++) {
if (*cp == '.') {
*cp++ = '\0';
break;
}
}
/* Empty segment */
if (*segment == '\0')
continue;
/* We have a segment, so look for a hook by that name */
hook = ng_findhook(node, segment);
/* Can't get there from here... */
if (hook == NULL
|| NG_HOOK_PEER(hook) == NULL
|| NG_HOOK_NOT_VALID(hook)
|| NG_HOOK_NOT_VALID(NG_HOOK_PEER(hook))) {
TRAP_ERROR();
NG_NODE_UNREF(node);
#if 0
printf("hooknotvalid %s %s %d %d %d %d ",
path,
segment,
hook == NULL,
NG_HOOK_PEER(hook) == NULL,
NG_HOOK_NOT_VALID(hook),
NG_HOOK_NOT_VALID(NG_HOOK_PEER(hook)));
#endif
return (ENOENT);
}
/*
* Hop on over to the next node
* XXX
* Big race conditions here as hooks and nodes go away
* *** Idea.. store an ng_ID_t in each hook and use that
* instead of the direct hook in this crawl?
*/
oldnode = node;
if ((node = NG_PEER_NODE(hook)))
NG_NODE_REF(node); /* XXX RACE */
NG_NODE_UNREF(oldnode); /* XXX another race */
if (NG_NODE_NOT_VALID(node)) {
NG_NODE_UNREF(node); /* XXX more races */
node = NULL;
}
}
/* If node somehow missing, fail here (probably this is not needed) */
if (node == NULL) {
TRAP_ERROR();
return (ENXIO);
}
/* Done */
*destp = node;
if (lasthook != NULL)
*lasthook = (hook ? NG_HOOK_PEER(hook) : NULL);
return (0);
}
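/*
* Illustrative sketch: resolving the node one hop across a hook named
* "myhook" (hypothetical) relative to 'here'. On success the caller owns
* a reference on the destination and must drop it when done:
*
*	node_p dest;
*	hook_p lasthook;
*
*	if (ng_path2noderef(here, "myhook", &dest, &lasthook) == 0) {
*		... use dest and lasthook ...
*		NG_NODE_UNREF(dest);
*	}
*/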
/***************************************************************\
* Input queue handling.
* All activities are submitted to the node via the input queue
* which implements a multiple-reader/single-writer gate.
* Items which cannot be handled immediately are queued.
*
* read-write queue locking inline functions *
\***************************************************************/
static __inline void ng_queue_rw(node_p node, item_p item, int rw);
static __inline item_p ng_dequeue(node_p node, int *rw);
static __inline item_p ng_acquire_read(node_p node, item_p item);
static __inline item_p ng_acquire_write(node_p node, item_p item);
static __inline void ng_leave_read(node_p node);
static __inline void ng_leave_write(node_p node);
/*
* Definition of the bits fields in the ng_queue flag word.
* Defined here rather than in netgraph.h because no-one should fiddle
* with them.
*
* The ordering here may be important! Don't shuffle these.
*/
/*-
Safety Barrier--------+ (adjustable to suit taste) (not used yet)
|
V
+-------+-------+-------+-------+-------+-------+-------+-------+
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
| |A|c|t|i|v|e| |R|e|a|d|e|r| |C|o|u|n|t| | | | | | | | | |P|A|
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | |O|W|
+-------+-------+-------+-------+-------+-------+-------+-------+
\___________________________ ____________________________/ | |
V | |
[active reader count] | |
| |
Operation Pending -------------------------------+ |
|
Active Writer ---------------------------------------+
Node queue semantics:
- All flag modifications are atomic.
- The reader count can be incremented only if neither the writer nor the
pending flag is set. Where this can't be done in a single operation, it is
implemented with a spin loop and atomic_cmpset().
- The writer flag can be set only if no other bits are set at all.
It is implemented with atomic_cmpset().
- The pending flag can be set at any time, but to avoid collisions during
queue processing all queue fields are protected by the mutex.
- The queue processing thread reads the queue holding the mutex, but
releases it while processing. When the queue is empty, the pending flag
is removed.
*/
#define WRITER_ACTIVE 0x00000001
#define OP_PENDING 0x00000002
#define READER_INCREMENT 0x00000004
#define READER_MASK 0xfffffffc /* Not valid if WRITER_ACTIVE is set */
#define SAFETY_BARRIER 0x00100000 /* 128K items queued should be enough */
/* Defines of more elaborate states on the queue */
/* Mask of bits a new read cares about */
#define NGQ_RMASK (WRITER_ACTIVE|OP_PENDING)
/* Mask of bits a new write cares about */
#define NGQ_WMASK (NGQ_RMASK|READER_MASK)
/* Test to decide if there is something on the queue. */
#define QUEUE_ACTIVE(QP) ((QP)->q_flags & OP_PENDING)
/* How to decide what the next queued item is. */
#define HEAD_IS_READER(QP) NGI_QUEUED_READER(STAILQ_FIRST(&(QP)->queue))
#define HEAD_IS_WRITER(QP) NGI_QUEUED_WRITER(STAILQ_FIRST(&(QP)->queue)) /* notused */
/* Read the status to decide if the next item on the queue can now run. */
#define QUEUED_READER_CAN_PROCEED(QP) \
(((QP)->q_flags & (NGQ_RMASK & ~OP_PENDING)) == 0)
#define QUEUED_WRITER_CAN_PROCEED(QP) \
(((QP)->q_flags & (NGQ_WMASK & ~OP_PENDING)) == 0)
/* Is there a chance of getting ANY work off the queue? */
#define NEXT_QUEUED_ITEM_CAN_PROCEED(QP) \
((HEAD_IS_READER(QP)) ? QUEUED_READER_CAN_PROCEED(QP) : \
QUEUED_WRITER_CAN_PROCEED(QP))
#define NGQRW_R 0
#define NGQRW_W 1
#define NGQ2_WORKQ 0x00000001
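/*
* Worked example of the flag word above (illustrative): with two active
* readers and a non-empty queue,
*
*	q_flags == 2 * READER_INCREMENT + OP_PENDING == 0x0000000a
*
* A new reader may not fast-track in (OP_PENDING is part of NGQ_RMASK)
* and a writer may not start (the reader count is part of NGQ_WMASK);
* both must queue until the readers leave and the queue drains.
*/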
/*
* Taking into account the current state of the queue and node, possibly take
* the next entry off the queue and return it. Return NULL if there was
* nothing we could return, either because there really was nothing there, or
* because the node was in a state where it cannot yet process the next item
* on the queue.
*/
static __inline item_p
ng_dequeue(node_p node, int *rw)
{
item_p item;
struct ng_queue *ngq = &node->nd_input_queue;
/* This MUST be called with the mutex held. */
mtx_assert(&ngq->q_mtx, MA_OWNED);
/* If there is nothing queued, then just return. */
if (!QUEUE_ACTIVE(ngq)) {
CTR4(KTR_NET, "%20s: node [%x] (%p) queue empty; "
"queue flags 0x%lx", __func__,
node->nd_ID, node, ngq->q_flags);
return (NULL);
}
/*
* From here, we can assume there is a head item.
* We need to find out what it is and if it can be dequeued, given
* the current state of the node.
*/
if (HEAD_IS_READER(ngq)) {
while (1) {
long t = ngq->q_flags;
if (t & WRITER_ACTIVE) {
/* There is a writer; the reader can't proceed. */
CTR4(KTR_NET, "%20s: node [%x] (%p) queued reader "
"can't proceed; queue flags 0x%lx", __func__,
node->nd_ID, node, t);
return (NULL);
}
if (atomic_cmpset_acq_int(&ngq->q_flags, t,
t + READER_INCREMENT))
break;
cpu_spinwait();
}
/* We have got reader lock for the node. */
*rw = NGQRW_R;
} else if (atomic_cmpset_acq_int(&ngq->q_flags, OP_PENDING,
OP_PENDING + WRITER_ACTIVE)) {
/* We have got writer lock for the node. */
*rw = NGQRW_W;
} else {
/* Somebody else is active; the writer can't proceed. */
CTR4(KTR_NET, "%20s: node [%x] (%p) queued writer "
"can't proceed; queue flags 0x%lx", __func__,
node->nd_ID, node, ngq->q_flags);
return (NULL);
}
/*
* Now we dequeue the request (whatever it may be) and correct the
* pending flags and the next and last pointers.
*/
item = STAILQ_FIRST(&ngq->queue);
STAILQ_REMOVE_HEAD(&ngq->queue, el_next);
if (STAILQ_EMPTY(&ngq->queue))
atomic_clear_int(&ngq->q_flags, OP_PENDING);
CTR6(KTR_NET, "%20s: node [%x] (%p) returning item %p as %s; "
"queue flags 0x%lx", __func__,
node->nd_ID, node, item, *rw ? "WRITER" : "READER" ,
ngq->q_flags);
return (item);
}
/*
* Queue a packet to be picked up later by someone else.
* If the queue could be run now, add node to the queue handler's worklist.
*/
static __inline void
ng_queue_rw(node_p node, item_p item, int rw)
{
struct ng_queue *ngq = &node->nd_input_queue;
if (rw == NGQRW_W)
NGI_SET_WRITER(item);
else
NGI_SET_READER(item);
NG_QUEUE_LOCK(ngq);
/* Set OP_PENDING flag and enqueue the item. */
atomic_set_int(&ngq->q_flags, OP_PENDING);
STAILQ_INSERT_TAIL(&ngq->queue, item, el_next);
CTR5(KTR_NET, "%20s: node [%x] (%p) queued item %p as %s", __func__,
node->nd_ID, node, item, rw ? "WRITER" : "READER" );
/*
* We can take the worklist lock with the node locked
* BUT NOT THE REVERSE!
*/
if (NEXT_QUEUED_ITEM_CAN_PROCEED(ngq))
ng_worklist_add(node);
NG_QUEUE_UNLOCK(ngq);
}
/* Acquire reader lock on node. If node is busy, queue the packet. */
static __inline item_p
ng_acquire_read(node_p node, item_p item)
{
KASSERT(node != &ng_deadnode,
("%s: working on deadnode", __func__));
/* Reader needs node without writer and pending items. */
while (1) {
long t = node->nd_input_queue.q_flags;
if (t & NGQ_RMASK)
break; /* Node is not ready for reader. */
if (atomic_cmpset_acq_int(&node->nd_input_queue.q_flags,
t, t + READER_INCREMENT)) {
/* Successfully grabbed node */
CTR4(KTR_NET, "%20s: node [%x] (%p) acquired item %p",
__func__, node->nd_ID, node, item);
return (item);
}
cpu_spinwait();
}
/* Queue the request for later. */
ng_queue_rw(node, item, NGQRW_R);
return (NULL);
}
/* Acquire writer lock on node. If node is busy, queue the packet. */
static __inline item_p
ng_acquire_write(node_p node, item_p item)
{
KASSERT(node != &ng_deadnode,
("%s: working on deadnode", __func__));
/* Writer needs completely idle node. */
if (atomic_cmpset_acq_int(&node->nd_input_queue.q_flags,
0, WRITER_ACTIVE)) {
/* Successfully grabbed node */
CTR4(KTR_NET, "%20s: node [%x] (%p) acquired item %p",
__func__, node->nd_ID, node, item);
return (item);
}
/* Queue the request for later. */
ng_queue_rw(node, item, NGQRW_W);
return (NULL);
}
#if 0
static __inline item_p
ng_upgrade_write(node_p node, item_p item)
{
struct ng_queue *ngq = &node->nd_input_queue;
KASSERT(node != &ng_deadnode,
("%s: working on deadnode", __func__));
NGI_SET_WRITER(item);
NG_QUEUE_LOCK(ngq);
/*
* There will never be zero readers, as we are one ourselves.
* Set the WRITER_ACTIVE flag ASAP to block out fast track readers.
* The caller we are running from will call ng_leave_read()
* soon, so we must account for that. We must leave again with the
* READER lock. If we find other readers, then
* queue the request for later. However "later" may be right now
* if there are no other readers. We don't really care if there are
* queued items, as we will bypass them anyhow.
*/
atomic_add_int(&ngq->q_flags, WRITER_ACTIVE - READER_INCREMENT);
if ((ngq->q_flags & (NGQ_WMASK & ~OP_PENDING)) == WRITER_ACTIVE) {
NG_QUEUE_UNLOCK(ngq);
/* It's just us, act on the item. */
/* will NOT drop writer lock when done */
ng_apply_item(node, item, 0);
/*
* Having acted on the item, atomically
* down grade back to READER and finish up
*/
atomic_add_int(&ngq->q_flags,
READER_INCREMENT - WRITER_ACTIVE);
/* Our caller will call ng_leave_read() */
return (NULL); /* item was consumed */
}
/*
* It's not just us active, so queue us AT THE HEAD.
* "Why?" I hear you ask.
* Put us at the head of the queue as we've already been
* through it once. If there is nothing else waiting,
* set the correct flags.
*/
if (STAILQ_EMPTY(&ngq->queue)) {
/* We've gone from 0 to 1 items in the queue */
atomic_set_int(&ngq->q_flags, OP_PENDING);
CTR3(KTR_NET, "%20s: node [%x] (%p) set OP_PENDING", __func__,
node->nd_ID, node);
}
STAILQ_INSERT_HEAD(&ngq->queue, item, el_next);
CTR4(KTR_NET, "%20s: node [%x] (%p) requeued item %p as WRITER",
__func__, node->nd_ID, node, item );
/* Reverse what we did above. That downgrades us back to reader */
atomic_add_int(&ngq->q_flags, READER_INCREMENT - WRITER_ACTIVE);
if (QUEUE_ACTIVE(ngq) && NEXT_QUEUED_ITEM_CAN_PROCEED(ngq))
ng_worklist_add(node);
NG_QUEUE_UNLOCK(ngq);
return (NULL); /* item was queued */
}
#endif
/* Release reader lock. */
static __inline void
ng_leave_read(node_p node)
{
atomic_subtract_rel_int(&node->nd_input_queue.q_flags, READER_INCREMENT);
}
/* Release writer lock. */
static __inline void
ng_leave_write(node_p node)
{
atomic_clear_rel_int(&node->nd_input_queue.q_flags, WRITER_ACTIVE);
}
/* Purge node queue. Called on node shutdown. */
static void
ng_flush_input_queue(node_p node)
{
struct ng_queue *ngq = &node->nd_input_queue;
item_p item;
NG_QUEUE_LOCK(ngq);
while ((item = STAILQ_FIRST(&ngq->queue)) != NULL) {
STAILQ_REMOVE_HEAD(&ngq->queue, el_next);
if (STAILQ_EMPTY(&ngq->queue))
atomic_clear_int(&ngq->q_flags, OP_PENDING);
NG_QUEUE_UNLOCK(ngq);
/* If the item is supplying a callback, call it with an error */
if (item->apply != NULL) {
if (item->depth == 1)
item->apply->error = ENOENT;
if (refcount_release(&item->apply->refs)) {
(*item->apply->apply)(item->apply->context,
item->apply->error);
}
}
NG_FREE_ITEM(item);
NG_QUEUE_LOCK(ngq);
}
NG_QUEUE_UNLOCK(ngq);
}
/***********************************************************************
* Externally visible method for sending or queueing messages or data.
***********************************************************************/
/*
* The module code should have filled out the item correctly by this stage:
* Common:
* reference to destination node.
* Reference to destination rcv hook if relevant.
* The apply pointer must be either NULL or reference a valid struct ng_apply_info.
* Data:
* pointer to mbuf
* Control_Message:
* pointer to msg.
* ID of original sender node. (return address)
* Function:
* Function pointer
* void * argument
* integer argument
*
* The nodes have several routines and macros to help with this task:
*/
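/*
* Illustrative sketch: nodes normally reach this function through the
* helper macros in netgraph.h rather than directly. For example, to send
* an mbuf out of one of our hooks:
*
*	NG_SEND_DATA_ONLY(error, hook, m);
*
* which packages the mbuf into an item, addresses it to the hook's peer,
* and hands it to ng_snd_item().
*/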
int
ng_snd_item(item_p item, int flags)
{
hook_p hook;
node_p node;
int queue, rw;
struct ng_queue *ngq;
int error = 0;
/* We are sending item, so it must be present! */
KASSERT(item != NULL, ("ng_snd_item: item is NULL"));
#ifdef NETGRAPH_DEBUG
_ngi_check(item, __FILE__, __LINE__);
#endif
/* Item was sent once more, postpone apply() call. */
if (item->apply)
refcount_acquire(&item->apply->refs);
node = NGI_NODE(item);
/* Node is never optional. */
KASSERT(node != NULL, ("ng_snd_item: node is NULL"));
hook = NGI_HOOK(item);
/* Valid hook and mbuf are mandatory for data. */
if ((item->el_flags & NGQF_TYPE) == NGQF_DATA) {
KASSERT(hook != NULL, ("ng_snd_item: hook for data is NULL"));
if (NGI_M(item) == NULL)
ERROUT(EINVAL);
CHECK_DATA_MBUF(NGI_M(item));
}
/*
* If the item or the node specifies single threading, force
* writer semantics. Similarly, the node may say one hook always
* produces writers. These are overrides.
*/
if (((item->el_flags & NGQF_RW) == NGQF_WRITER) ||
(node->nd_flags & NGF_FORCE_WRITER) ||
(hook && (hook->hk_flags & HK_FORCE_WRITER))) {
rw = NGQRW_W;
} else {
rw = NGQRW_R;
}
/*
* If the sender or receiver requests queued delivery, or the stack
* usage level is dangerous, enqueue the message.
*/
if ((flags & NG_QUEUE) || (hook && (hook->hk_flags & HK_QUEUE))) {
queue = 1;
} else {
queue = 0;
#ifdef GET_STACK_USAGE
/*
* Most netgraph nodes have small stack consumption and
* for them 25% of free stack space is more than enough.
* Nodes/hooks with higher stack usage should be marked as
* HI_STACK. For them 50% of stack will be guaranteed then.
* XXX: Values 25% and 50% are completely empirical.
*/
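/*
* Worked example (illustrative numbers): with st = 16384 bytes of
* stack and su = 13000 bytes used, sl = 3384 bytes are left. Then
* sl * 4 = 13536 < st, i.e. less than 25% is free, so the item is
* queued even for a normal node.
*/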
size_t st, su, sl;
GET_STACK_USAGE(st, su);
sl = st - su;
if ((sl * 4 < st) ||
((sl * 2 < st) && ((node->nd_flags & NGF_HI_STACK) ||
(hook && (hook->hk_flags & HK_HI_STACK))))) {
queue = 1;
}
#endif
}
if (queue) {
item->depth = 1;
/* Put it on the queue for that node*/
ng_queue_rw(node, item, rw);
return ((flags & NG_PROGRESS) ? EINPROGRESS : 0);
}
/*
* We have already decided how we will be queued or treated.
* Try to get the appropriate operating permission.
*/
*/
if (rw == NGQRW_R)
item = ng_acquire_read(node, item);
else
item = ng_acquire_write(node, item);
/* Item was queued while trying to get permission. */
if (item == NULL)
return ((flags & NG_PROGRESS) ? EINPROGRESS : 0);
NGI_GET_NODE(item, node); /* zaps stored node */
item->depth++;
error = ng_apply_item(node, item, rw); /* drops r/w lock when done */
/* If something is waiting on queue and ready, schedule it. */
ngq = &node->nd_input_queue;
if (QUEUE_ACTIVE(ngq)) {
NG_QUEUE_LOCK(ngq);
if (QUEUE_ACTIVE(ngq) && NEXT_QUEUED_ITEM_CAN_PROCEED(ngq))
ng_worklist_add(node);
NG_QUEUE_UNLOCK(ngq);
}
/*
* Node may go away as soon as we remove the reference.
* Whatever we do, DO NOT access the node again!
*/
NG_NODE_UNREF(node);
return (error);
done:
/* If was not sent, apply callback here. */
if (item->apply != NULL) {
if (item->depth == 0 && error != 0)
item->apply->error = error;
if (refcount_release(&item->apply->refs)) {
(*item->apply->apply)(item->apply->context,
item->apply->error);
}
}
NG_FREE_ITEM(item);
return (error);
}
/*
* We have an item that was possibly queued somewhere.
* It should contain all the information needed
* to run it on the appropriate node/hook.
* If there is apply pointer and we own the last reference, call apply().
*/
static int
ng_apply_item(node_p node, item_p item, int rw)
{
hook_p hook;
ng_rcvdata_t *rcvdata;
ng_rcvmsg_t *rcvmsg;
struct ng_apply_info *apply;
int error = 0, depth;
/* Node and item are never optional. */
KASSERT(node != NULL, ("ng_apply_item: node is NULL"));
KASSERT(item != NULL, ("ng_apply_item: item is NULL"));
NGI_GET_HOOK(item, hook); /* clears stored hook */
#ifdef NETGRAPH_DEBUG
_ngi_check(item, __FILE__, __LINE__);
#endif
apply = item->apply;
depth = item->depth;
switch (item->el_flags & NGQF_TYPE) {
case NGQF_DATA:
/*
* Check things are still ok as when we were queued.
*/
KASSERT(hook != NULL, ("ng_apply_item: hook for data is NULL"));
if (NG_HOOK_NOT_VALID(hook) ||
NG_NODE_NOT_VALID(node)) {
error = EIO;
NG_FREE_ITEM(item);
break;
}
/*
* If no receive method, just silently drop it.
* Give preference to the hook over-ride method
*/
if ((!(rcvdata = hook->hk_rcvdata))
&& (!(rcvdata = NG_HOOK_NODE(hook)->nd_type->rcvdata))) {
error = 0;
NG_FREE_ITEM(item);
break;
}
error = (*rcvdata)(hook, item);
break;
case NGQF_MESG:
if (hook && NG_HOOK_NOT_VALID(hook)) {
/*
* The hook has been zapped, so we can't use it.
* Immediately drop its reference.
* The message may not need it.
*/
NG_HOOK_UNREF(hook);
hook = NULL;
}
/*
* Similarly, if the node is a zombie there is
* nothing we can do with it, drop everything.
*/
if (NG_NODE_NOT_VALID(node)) {
TRAP_ERROR();
error = EINVAL;
NG_FREE_ITEM(item);
break;
}
/*
* Call the appropriate message handler for the object.
* It is up to the message handler to free the message.
* If it's a generic message, handle it generically,
* otherwise call the type's message handler (if it exists).
* XXX (race). Remember that a queued message may
* reference a node or hook that has just been
* invalidated. It will exist as the queue code
* is holding a reference, but..
*/
if ((NGI_MSG(item)->header.typecookie == NGM_GENERIC_COOKIE) &&
((NGI_MSG(item)->header.flags & NGF_RESP) == 0)) {
error = ng_generic_msg(node, item, hook);
break;
}
if (((!hook) || (!(rcvmsg = hook->hk_rcvmsg))) &&
(!(rcvmsg = node->nd_type->rcvmsg))) {
TRAP_ERROR();
error = 0;
NG_FREE_ITEM(item);
break;
}
error = (*rcvmsg)(node, item, hook);
break;
case NGQF_FN:
case NGQF_FN2:
/*
* In the case of the shutdown message we allow it to hit
* even if the node is invalid.
*/
if (NG_NODE_NOT_VALID(node) &&
NGI_FN(item) != &ng_rmnode) {
TRAP_ERROR();
error = EINVAL;
NG_FREE_ITEM(item);
break;
}
/* The same applies to certain internal functions when the hook is invalid. */
if (hook && NG_HOOK_NOT_VALID(hook) &&
NGI_FN2(item) != &ng_con_part2 &&
NGI_FN2(item) != &ng_con_part3 &&
NGI_FN(item) != &ng_rmhook_part2) {
TRAP_ERROR();
error = EINVAL;
NG_FREE_ITEM(item);
break;
}
if ((item->el_flags & NGQF_TYPE) == NGQF_FN) {
(*NGI_FN(item))(node, hook, NGI_ARG1(item),
NGI_ARG2(item));
NG_FREE_ITEM(item);
} else /* it is NGQF_FN2 */
error = (*NGI_FN2(item))(node, item, hook);
break;
}
/*
* We held references on some of the resources
* that we took from the item. Now that we have
* finished doing everything, drop those references.
*/
if (hook)
NG_HOOK_UNREF(hook);
if (rw == NGQRW_R)
ng_leave_read(node);
else
ng_leave_write(node);
/* Apply callback. */
if (apply != NULL) {
if (depth == 1 && error != 0)
apply->error = error;
if (refcount_release(&apply->refs))
(*apply->apply)(apply->context, apply->error);
}
return (error);
}
/***********************************************************************
* Implement the 'generic' control messages
***********************************************************************/
static int
ng_generic_msg(node_p here, item_p item, hook_p lasthook)
{
+ INIT_VNET_NETGRAPH(curvnet);
int error = 0;
struct ng_mesg *msg;
struct ng_mesg *resp = NULL;
NGI_GET_MSG(item, msg);
if (msg->header.typecookie != NGM_GENERIC_COOKIE) {
TRAP_ERROR();
error = EINVAL;
goto out;
}
switch (msg->header.cmd) {
case NGM_SHUTDOWN:
ng_rmnode(here, NULL, NULL, 0);
break;
case NGM_MKPEER:
{
struct ngm_mkpeer *const mkp = (struct ngm_mkpeer *) msg->data;
if (msg->header.arglen != sizeof(*mkp)) {
TRAP_ERROR();
error = EINVAL;
break;
}
mkp->type[sizeof(mkp->type) - 1] = '\0';
mkp->ourhook[sizeof(mkp->ourhook) - 1] = '\0';
mkp->peerhook[sizeof(mkp->peerhook) - 1] = '\0';
error = ng_mkpeer(here, mkp->ourhook, mkp->peerhook, mkp->type);
break;
}
case NGM_CONNECT:
{
struct ngm_connect *const con =
(struct ngm_connect *) msg->data;
node_p node2;
if (msg->header.arglen != sizeof(*con)) {
TRAP_ERROR();
error = EINVAL;
break;
}
con->path[sizeof(con->path) - 1] = '\0';
con->ourhook[sizeof(con->ourhook) - 1] = '\0';
con->peerhook[sizeof(con->peerhook) - 1] = '\0';
/* Don't forget we get a reference.. */
error = ng_path2noderef(here, con->path, &node2, NULL);
if (error)
break;
error = ng_con_nodes(item, here, con->ourhook,
node2, con->peerhook);
NG_NODE_UNREF(node2);
break;
}
case NGM_NAME:
{
struct ngm_name *const nam = (struct ngm_name *) msg->data;
if (msg->header.arglen != sizeof(*nam)) {
TRAP_ERROR();
error = EINVAL;
break;
}
nam->name[sizeof(nam->name) - 1] = '\0';
error = ng_name_node(here, nam->name);
break;
}
case NGM_RMHOOK:
{
struct ngm_rmhook *const rmh = (struct ngm_rmhook *) msg->data;
hook_p hook;
if (msg->header.arglen != sizeof(*rmh)) {
TRAP_ERROR();
error = EINVAL;
break;
}
rmh->ourhook[sizeof(rmh->ourhook) - 1] = '\0';
if ((hook = ng_findhook(here, rmh->ourhook)) != NULL)
ng_destroy_hook(hook);
break;
}
case NGM_NODEINFO:
{
struct nodeinfo *ni;
NG_MKRESPONSE(resp, msg, sizeof(*ni), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
/* Fill in node info */
ni = (struct nodeinfo *) resp->data;
if (NG_NODE_HAS_NAME(here))
strcpy(ni->name, NG_NODE_NAME(here));
strcpy(ni->type, here->nd_type->name);
ni->id = ng_node2ID(here);
ni->hooks = here->nd_numhooks;
break;
}
case NGM_LISTHOOKS:
{
const int nhooks = here->nd_numhooks;
struct hooklist *hl;
struct nodeinfo *ni;
hook_p hook;
/* Get response struct */
NG_MKRESPONSE(resp, msg, sizeof(*hl)
+ (nhooks * sizeof(struct linkinfo)), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
hl = (struct hooklist *) resp->data;
ni = &hl->nodeinfo;
/* Fill in node info */
if (NG_NODE_HAS_NAME(here))
strcpy(ni->name, NG_NODE_NAME(here));
strcpy(ni->type, here->nd_type->name);
ni->id = ng_node2ID(here);
/* Cycle through the linked list of hooks */
ni->hooks = 0;
LIST_FOREACH(hook, &here->nd_hooks, hk_hooks) {
struct linkinfo *const link = &hl->link[ni->hooks];
if (ni->hooks >= nhooks) {
log(LOG_ERR, "%s: number of %s changed\n",
__func__, "hooks");
break;
}
if (NG_HOOK_NOT_VALID(hook))
continue;
strcpy(link->ourhook, NG_HOOK_NAME(hook));
strcpy(link->peerhook, NG_PEER_HOOK_NAME(hook));
if (NG_PEER_NODE_NAME(hook)[0] != '\0')
strcpy(link->nodeinfo.name,
NG_PEER_NODE_NAME(hook));
strcpy(link->nodeinfo.type,
NG_PEER_NODE(hook)->nd_type->name);
link->nodeinfo.id = ng_node2ID(NG_PEER_NODE(hook));
link->nodeinfo.hooks = NG_PEER_NODE(hook)->nd_numhooks;
ni->hooks++;
}
break;
}
case NGM_LISTNAMES:
case NGM_LISTNODES:
{
const int unnamed = (msg->header.cmd == NGM_LISTNODES);
struct namelist *nl;
node_p node;
int num = 0, i;
mtx_lock(&ng_namehash_mtx);
/* Count number of nodes */
for (i = 0; i < NG_NAME_HASH_SIZE; i++) {
LIST_FOREACH(node, &V_ng_name_hash[i], nd_nodes) {
if (NG_NODE_IS_VALID(node) &&
(unnamed || NG_NODE_HAS_NAME(node))) {
num++;
}
}
}
mtx_unlock(&ng_namehash_mtx);
/* Get response struct */
NG_MKRESPONSE(resp, msg, sizeof(*nl)
+ (num * sizeof(struct nodeinfo)), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
nl = (struct namelist *) resp->data;
/* Cycle through the linked list of nodes */
nl->numnames = 0;
mtx_lock(&ng_namehash_mtx);
for (i = 0; i < NG_NAME_HASH_SIZE; i++) {
LIST_FOREACH(node, &V_ng_name_hash[i], nd_nodes) {
struct nodeinfo *const np =
&nl->nodeinfo[nl->numnames];
if (NG_NODE_NOT_VALID(node))
continue;
if (!unnamed && (! NG_NODE_HAS_NAME(node)))
continue;
if (nl->numnames >= num) {
log(LOG_ERR, "%s: number of nodes changed\n",
__func__);
break;
}
if (NG_NODE_HAS_NAME(node))
strcpy(np->name, NG_NODE_NAME(node));
strcpy(np->type, node->nd_type->name);
np->id = ng_node2ID(node);
np->hooks = node->nd_numhooks;
nl->numnames++;
}
}
mtx_unlock(&ng_namehash_mtx);
break;
}
case NGM_LISTTYPES:
{
struct typelist *tl;
struct ng_type *type;
int num = 0;
mtx_lock(&ng_typelist_mtx);
/* Count number of types */
LIST_FOREACH(type, &ng_typelist, types) {
num++;
}
mtx_unlock(&ng_typelist_mtx);
/* Get response struct */
NG_MKRESPONSE(resp, msg, sizeof(*tl)
+ (num * sizeof(struct typeinfo)), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
tl = (struct typelist *) resp->data;
/* Cycle through the linked list of types */
tl->numtypes = 0;
mtx_lock(&ng_typelist_mtx);
LIST_FOREACH(type, &ng_typelist, types) {
struct typeinfo *const tp = &tl->typeinfo[tl->numtypes];
if (tl->numtypes >= num) {
log(LOG_ERR, "%s: number of %s changed\n",
__func__, "types");
break;
}
strcpy(tp->type_name, type->name);
tp->numnodes = type->refs - 1; /* don't count list */
tl->numtypes++;
}
mtx_unlock(&ng_typelist_mtx);
break;
}
case NGM_BINARY2ASCII:
{
int bufSize = 20 * 1024; /* XXX hard coded constant */
const struct ng_parse_type *argstype;
const struct ng_cmdlist *c;
struct ng_mesg *binary, *ascii;
/* Data area must contain a valid netgraph message */
binary = (struct ng_mesg *)msg->data;
if (msg->header.arglen < sizeof(struct ng_mesg) ||
(msg->header.arglen - sizeof(struct ng_mesg) <
binary->header.arglen)) {
TRAP_ERROR();
error = EINVAL;
break;
}
/* Get a response message with lots of room */
NG_MKRESPONSE(resp, msg, sizeof(*ascii) + bufSize, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
ascii = (struct ng_mesg *)resp->data;
/* Copy binary message header to response message payload */
bcopy(binary, ascii, sizeof(*binary));
/* Find command by matching typecookie and command number */
for (c = here->nd_type->cmdlist;
c != NULL && c->name != NULL; c++) {
if (binary->header.typecookie == c->cookie
&& binary->header.cmd == c->cmd)
break;
}
if (c == NULL || c->name == NULL) {
for (c = ng_generic_cmds; c->name != NULL; c++) {
if (binary->header.typecookie == c->cookie
&& binary->header.cmd == c->cmd)
break;
}
if (c->name == NULL) {
NG_FREE_MSG(resp);
error = ENOSYS;
break;
}
}
/* Convert command name to ASCII */
snprintf(ascii->header.cmdstr, sizeof(ascii->header.cmdstr),
"%s", c->name);
/* Convert command arguments to ASCII */
argstype = (binary->header.flags & NGF_RESP) ?
c->respType : c->mesgType;
if (argstype == NULL) {
*ascii->data = '\0';
} else {
if ((error = ng_unparse(argstype,
(u_char *)binary->data,
ascii->data, bufSize)) != 0) {
NG_FREE_MSG(resp);
break;
}
}
/* Return the result as struct ng_mesg plus ASCII string */
bufSize = strlen(ascii->data) + 1;
ascii->header.arglen = bufSize;
resp->header.arglen = sizeof(*ascii) + bufSize;
break;
}
case NGM_ASCII2BINARY:
{
int bufSize = 2000; /* XXX hard coded constant */
const struct ng_cmdlist *c;
const struct ng_parse_type *argstype;
struct ng_mesg *ascii, *binary;
int off = 0;
/* Data area must contain at least a struct ng_mesg + '\0' */
ascii = (struct ng_mesg *)msg->data;
if ((msg->header.arglen < sizeof(*ascii) + 1) ||
(ascii->header.arglen < 1) ||
(msg->header.arglen < sizeof(*ascii) +
ascii->header.arglen)) {
TRAP_ERROR();
error = EINVAL;
break;
}
ascii->data[ascii->header.arglen - 1] = '\0';
/* Get a response message with lots of room */
NG_MKRESPONSE(resp, msg, sizeof(*binary) + bufSize, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
binary = (struct ng_mesg *)resp->data;
/* Copy ASCII message header to response message payload */
bcopy(ascii, binary, sizeof(*ascii));
/* Find command by matching ASCII command string */
for (c = here->nd_type->cmdlist;
c != NULL && c->name != NULL; c++) {
if (strcmp(ascii->header.cmdstr, c->name) == 0)
break;
}
if (c == NULL || c->name == NULL) {
for (c = ng_generic_cmds; c->name != NULL; c++) {
if (strcmp(ascii->header.cmdstr, c->name) == 0)
break;
}
if (c->name == NULL) {
NG_FREE_MSG(resp);
error = ENOSYS;
break;
}
}
/* Convert command name to binary */
binary->header.cmd = c->cmd;
binary->header.typecookie = c->cookie;
/* Convert command arguments to binary */
argstype = (binary->header.flags & NGF_RESP) ?
c->respType : c->mesgType;
if (argstype == NULL) {
bufSize = 0;
} else {
if ((error = ng_parse(argstype, ascii->data,
&off, (u_char *)binary->data, &bufSize)) != 0) {
NG_FREE_MSG(resp);
break;
}
}
/* Return the result */
binary->header.arglen = bufSize;
resp->header.arglen = sizeof(*binary) + bufSize;
break;
}
case NGM_TEXT_CONFIG:
case NGM_TEXT_STATUS:
/*
* This one is tricky as it passes the command down to the
* actual node, even though it is a generic type command.
* This means we must assume that the item/msg is already freed
* when control passes back to us.
*/
if (here->nd_type->rcvmsg != NULL) {
NGI_MSG(item) = msg; /* put it back as we found it */
return((*here->nd_type->rcvmsg)(here, item, lasthook));
}
/* Fall through if rcvmsg not supported */
default:
TRAP_ERROR();
error = EINVAL;
}
/*
* Sometimes a generic message may be statically allocated
* to avoid problems with allocation in tight memory situations.
* Don't free it if that is the case.
* The respond and free steps are broken apart here, because errors
* may cause a free of the item, in which case we'd be doing it twice.
* They are kept together above, to simplify freeing.
*/
out:
NG_RESPOND_MSG(error, here, item, resp);
if (msg)
NG_FREE_MSG(msg);
return (error);
}
/************************************************************************
Queue element get/free routines
************************************************************************/
uma_zone_t ng_qzone;
uma_zone_t ng_qdzone;
static int maxalloc = 4096; /* limit the damage of a leak */
static int maxdata = 512; /* limit the damage of a DoS */
TUNABLE_INT("net.graph.maxalloc", &maxalloc);
SYSCTL_INT(_net_graph, OID_AUTO, maxalloc, CTLFLAG_RDTUN, &maxalloc,
0, "Maximum number of non-data queue items to allocate");
TUNABLE_INT("net.graph.maxdata", &maxdata);
SYSCTL_INT(_net_graph, OID_AUTO, maxdata, CTLFLAG_RDTUN, &maxdata,
0, "Maximum number of data queue items to allocate");
#ifdef NETGRAPH_DEBUG
static TAILQ_HEAD(, ng_item) ng_itemlist = TAILQ_HEAD_INITIALIZER(ng_itemlist);
static int allocated; /* number of items malloc'd */
#endif
/*
* Get a queue entry.
* This is usually called when a packet first enters netgraph.
* By definition, this is usually from an interrupt, or from a user.
* Users are not so important, but try to be quick for the times that it's
* an interrupt.
*/
static __inline item_p
ng_alloc_item(int type, int flags)
{
item_p item;
KASSERT(((type & ~NGQF_TYPE) == 0),
("%s: incorrect item type: %d", __func__, type));
item = uma_zalloc((type == NGQF_DATA)?ng_qdzone:ng_qzone,
((flags & NG_WAITOK) ? M_WAITOK : M_NOWAIT) | M_ZERO);
if (item) {
item->el_flags = type;
#ifdef NETGRAPH_DEBUG
mtx_lock(&ngq_mtx);
TAILQ_INSERT_TAIL(&ng_itemlist, item, all);
allocated++;
mtx_unlock(&ngq_mtx);
#endif
}
return (item);
}
/*
* Release a queue entry
*/
void
ng_free_item(item_p item)
{
/*
* The item may hold resources of its own. We need to free
* these before we can free the item. What they are depends upon
* what kind of item it is. It is important that nodes zero
* out pointers to resources that they remove from the item,
* or we will release them again here.
*/
switch (item->el_flags & NGQF_TYPE) {
case NGQF_DATA:
/* If we have an mbuf still attached.. */
NG_FREE_M(_NGI_M(item));
break;
case NGQF_MESG:
_NGI_RETADDR(item) = 0;
NG_FREE_MSG(_NGI_MSG(item));
break;
case NGQF_FN:
case NGQF_FN2:
/* nothing to free, really */
_NGI_FN(item) = NULL;
_NGI_ARG1(item) = NULL;
_NGI_ARG2(item) = 0;
break;
}
/* If we still have a node or hook referenced... */
_NGI_CLR_NODE(item);
_NGI_CLR_HOOK(item);
#ifdef NETGRAPH_DEBUG
mtx_lock(&ngq_mtx);
TAILQ_REMOVE(&ng_itemlist, item, all);
allocated--;
mtx_unlock(&ngq_mtx);
#endif
uma_zfree(((item->el_flags & NGQF_TYPE) == NGQF_DATA)?
ng_qdzone:ng_qzone, item);
}
/*
* Change type of the queue entry.
* Possibly reallocates it from another UMA zone.
*/
static __inline item_p
ng_realloc_item(item_p pitem, int type, int flags)
{
item_p item;
int from, to;
KASSERT((pitem != NULL), ("%s: can't reallocate NULL", __func__));
KASSERT(((type & ~NGQF_TYPE) == 0),
("%s: incorrect item type: %d", __func__, type));
from = ((pitem->el_flags & NGQF_TYPE) == NGQF_DATA);
to = (type == NGQF_DATA);
if (from != to) {
/* If reallocation is required do it and copy item. */
if ((item = ng_alloc_item(type, flags)) == NULL) {
ng_free_item(pitem);
return (NULL);
}
*item = *pitem;
ng_free_item(pitem);
} else
item = pitem;
item->el_flags = (item->el_flags & ~NGQF_TYPE) | type;
return (item);
}
/************************************************************************
Module routines
************************************************************************/
/*
* Handle the loading/unloading of a netgraph node type module
*/
int
ng_mod_event(module_t mod, int event, void *data)
{
struct ng_type *const type = data;
int s, error = 0;
switch (event) {
case MOD_LOAD:
/* Register new netgraph node type */
s = splnet();
if ((error = ng_newtype(type)) != 0) {
splx(s);
break;
}
/* Call type specific code */
if (type->mod_event != NULL)
if ((error = (*type->mod_event)(mod, event, data))) {
mtx_lock(&ng_typelist_mtx);
type->refs--; /* undo it */
LIST_REMOVE(type, types);
mtx_unlock(&ng_typelist_mtx);
}
splx(s);
break;
case MOD_UNLOAD:
s = splnet();
if (type->refs > 1) { /* make sure no nodes exist! */
error = EBUSY;
} else {
if (type->refs == 0) {
/* failed load, nothing to undo */
splx(s);
break;
}
if (type->mod_event != NULL) { /* check with type */
error = (*type->mod_event)(mod, event, data);
if (error != 0) { /* type refuses.. */
splx(s);
break;
}
}
mtx_lock(&ng_typelist_mtx);
LIST_REMOVE(type, types);
mtx_unlock(&ng_typelist_mtx);
}
splx(s);
break;
default:
if (type->mod_event != NULL)
error = (*type->mod_event)(mod, event, data);
else
error = EOPNOTSUPP; /* XXX ? */
break;
}
return (error);
}
/*
* Handle loading and unloading for this code.
* The only thing we need to link into is the NETISR structure.
*/
static int
ngb_mod_event(module_t mod, int event, void *data)
{
int error = 0;
switch (event) {
case MOD_LOAD:
/* Initialize everything. */
NG_WORKLIST_LOCK_INIT();
mtx_init(&ng_typelist_mtx, "netgraph types mutex", NULL,
MTX_DEF);
mtx_init(&ng_idhash_mtx, "netgraph idhash mutex", NULL,
MTX_DEF);
mtx_init(&ng_namehash_mtx, "netgraph namehash mutex", NULL,
MTX_DEF);
mtx_init(&ng_topo_mtx, "netgraph topology mutex", NULL,
MTX_DEF);
#ifdef NETGRAPH_DEBUG
mtx_init(&ng_nodelist_mtx, "netgraph nodelist mutex", NULL,
MTX_DEF);
mtx_init(&ngq_mtx, "netgraph item list mutex", NULL,
MTX_DEF);
#endif
ng_qzone = uma_zcreate("NetGraph items", sizeof(struct ng_item),
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
uma_zone_set_max(ng_qzone, maxalloc);
ng_qdzone = uma_zcreate("NetGraph data items", sizeof(struct ng_item),
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
uma_zone_set_max(ng_qdzone, maxdata);
netisr_register(NETISR_NETGRAPH, (netisr_t *)ngintr, NULL, 0);
break;
case MOD_UNLOAD:
/* You can't unload it because an interface may be using it. */
error = EBUSY;
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
static moduledata_t netgraph_mod = {
"netgraph",
ngb_mod_event,
(NULL)
};
DECLARE_MODULE(netgraph, netgraph_mod, SI_SUB_NETGRAPH, SI_ORDER_MIDDLE);
SYSCTL_NODE(_net, OID_AUTO, graph, CTLFLAG_RW, 0, "netgraph Family");
SYSCTL_INT(_net_graph, OID_AUTO, abi_version, CTLFLAG_RD, 0, NG_ABI_VERSION,"");
SYSCTL_INT(_net_graph, OID_AUTO, msg_version, CTLFLAG_RD, 0, NG_VERSION, "");
#ifdef NETGRAPH_DEBUG
void
dumphook (hook_p hook, char *file, int line)
{
printf("hook: name %s, %d refs, Last touched:\n",
_NG_HOOK_NAME(hook), hook->hk_refs);
printf(" Last active @ %s, line %d\n",
hook->lastfile, hook->lastline);
if (line) {
printf(" problem discovered at file %s, line %d\n", file, line);
}
}
void
dumpnode(node_p node, char *file, int line)
{
printf("node: ID [%x]: type '%s', %d hooks, flags 0x%x, %d refs, %s:\n",
_NG_NODE_ID(node), node->nd_type->name,
node->nd_numhooks, node->nd_flags,
node->nd_refs, node->nd_name);
printf(" Last active @ %s, line %d\n",
node->lastfile, node->lastline);
if (line) {
printf(" problem discovered at file %s, line %d\n", file, line);
}
}
void
dumpitem(item_p item, char *file, int line)
{
printf(" ACTIVE item, last used at %s, line %d",
item->lastfile, item->lastline);
switch(item->el_flags & NGQF_TYPE) {
case NGQF_DATA:
printf(" - [data]\n");
break;
case NGQF_MESG:
printf(" - retaddr[%d]:\n", _NGI_RETADDR(item));
break;
case NGQF_FN:
printf(" - fn@%p (%p, %p, %p, %d (%x))\n",
_NGI_FN(item),
_NGI_NODE(item),
_NGI_HOOK(item),
item->body.fn.fn_arg1,
item->body.fn.fn_arg2,
item->body.fn.fn_arg2);
break;
case NGQF_FN2:
printf(" - fn2@%p (%p, %p, %p, %d (%x))\n",
_NGI_FN2(item),
_NGI_NODE(item),
_NGI_HOOK(item),
item->body.fn.fn_arg1,
item->body.fn.fn_arg2,
item->body.fn.fn_arg2);
break;
}
if (line) {
printf(" problem discovered at file %s, line %d\n", file, line);
if (_NGI_NODE(item)) {
printf("node %p ([%x])\n",
_NGI_NODE(item), ng_node2ID(_NGI_NODE(item)));
}
}
}
static void
ng_dumpitems(void)
{
item_p item;
int i = 1;
TAILQ_FOREACH(item, &ng_itemlist, all) {
printf("[%d] ", i++);
dumpitem(item, NULL, 0);
}
}
static void
ng_dumpnodes(void)
{
node_p node;
int i = 1;
mtx_lock(&ng_nodelist_mtx);
SLIST_FOREACH(node, &ng_allnodes, nd_all) {
printf("[%d] ", i++);
dumpnode(node, NULL, 0);
}
mtx_unlock(&ng_nodelist_mtx);
}
static void
ng_dumphooks(void)
{
hook_p hook;
int i = 1;
mtx_lock(&ng_nodelist_mtx);
SLIST_FOREACH(hook, &ng_allhooks, hk_all) {
printf("[%d] ", i++);
dumphook(hook, NULL, 0);
}
mtx_unlock(&ng_nodelist_mtx);
}
static int
sysctl_debug_ng_dump_items(SYSCTL_HANDLER_ARGS)
{
int error;
int val;
val = allocated;
error = sysctl_handle_int(oidp, &val, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (val == 42) {
ng_dumpitems();
ng_dumpnodes();
ng_dumphooks();
}
return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, ng_dump_items, CTLTYPE_INT | CTLFLAG_RW,
0, sizeof(int), sysctl_debug_ng_dump_items, "I", "Number of allocated items");
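/*
* Illustrative usage: writing the magic value 42 to this sysctl from
* userland dumps all allocated items, nodes and hooks to the console:
*
*	sysctl debug.ng_dump_items=42
*/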
#endif /* NETGRAPH_DEBUG */
/***********************************************************************
* Worklist routines
**********************************************************************/
/* NETISR thread enters here */
/*
* Pick a node off the list of nodes with work,
* and try to get an item to process off it.
* If there are no more, remove the node from the list.
*/
static void
ngintr(void)
{
for (;;) {
node_p node;
/* Get node from the worklist. */
NG_WORKLIST_LOCK();
node = STAILQ_FIRST(&ng_worklist);
if (!node) {
NG_WORKLIST_UNLOCK();
break;
}
STAILQ_REMOVE_HEAD(&ng_worklist, nd_input_queue.q_work);
NG_WORKLIST_UNLOCK();
CTR3(KTR_NET, "%20s: node [%x] (%p) taken off worklist",
__func__, node->nd_ID, node);
/*
* We have the node. We also take over the reference
* that the list had on it.
* Now process as much as you can, until it won't
* let you have another item off the queue.
* All this time, keep the reference
* that lets us be sure that the node still exists.
* Let the reference go at the last minute.
*/
for (;;) {
item_p item;
int rw;
NG_QUEUE_LOCK(&node->nd_input_queue);
item = ng_dequeue(node, &rw);
if (item == NULL) {
node->nd_input_queue.q_flags2 &= ~NGQ2_WORKQ;
NG_QUEUE_UNLOCK(&node->nd_input_queue);
break; /* go look for another node */
} else {
NG_QUEUE_UNLOCK(&node->nd_input_queue);
NGI_GET_NODE(item, node); /* zaps stored node */
ng_apply_item(node, item, rw);
NG_NODE_UNREF(node);
}
}
NG_NODE_UNREF(node);
}
}
/*
* XXX
* It's possible that a debugging NG_NODE_REF may need
* to be outside the mutex zone.
*/
static void
ng_worklist_add(node_p node)
{
mtx_assert(&node->nd_input_queue.q_mtx, MA_OWNED);
if ((node->nd_input_queue.q_flags2 & NGQ2_WORKQ) == 0) {
/*
* If we are not already on the work queue,
* then put us on.
*/
node->nd_input_queue.q_flags2 |= NGQ2_WORKQ;
NG_NODE_REF(node); /* XXX safe in mutex? */
NG_WORKLIST_LOCK();
STAILQ_INSERT_TAIL(&ng_worklist, node, nd_input_queue.q_work);
NG_WORKLIST_UNLOCK();
schednetisr(NETISR_NETGRAPH);
CTR3(KTR_NET, "%20s: node [%x] (%p) put on worklist", __func__,
node->nd_ID, node);
} else {
CTR3(KTR_NET, "%20s: node [%x] (%p) already on worklist",
__func__, node->nd_ID, node);
}
}
/***********************************************************************
* Externally usable functions to set up a queue item ready for sending
***********************************************************************/
#ifdef NETGRAPH_DEBUG
#define ITEM_DEBUG_CHECKS \
do { \
if (NGI_NODE(item) ) { \
printf("item already has node"); \
kdb_enter(KDB_WHY_NETGRAPH, "has node"); \
NGI_CLR_NODE(item); \
} \
if (NGI_HOOK(item) ) { \
printf("item already has hook"); \
kdb_enter(KDB_WHY_NETGRAPH, "has hook"); \
NGI_CLR_HOOK(item); \
} \
} while (0)
#else
#define ITEM_DEBUG_CHECKS
#endif
/*
* Put an mbuf into the item.
* Hook and node references will be removed when the item is dequeued.
* (or equivalent)
* (XXX) Unsafe because no reference is held by the peer on the remote node.
* The remote node might go away in this timescale.
* We know the hooks can't go away because that would require getting
* a writer item on both nodes and we must have at least a reader
* here to be able to do this.
* Note that the hook loaded is the REMOTE hook.
*
* This is possibly in the critical path for new data.
*/
item_p
ng_package_data(struct mbuf *m, int flags)
{
item_p item;
if ((item = ng_alloc_item(NGQF_DATA, flags)) == NULL) {
NG_FREE_M(m);
return (NULL);
}
ITEM_DEBUG_CHECKS;
item->el_flags |= NGQF_READER;
NGI_M(item) = m;
return (item);
}
/*
* Allocate a queue item and put the message into it.
* Evaluate the address as this will be needed to queue it and
* to work out what some of the fields should be.
* Hook and node references will be removed when the item is dequeued.
* (or equivalent)
*/
item_p
ng_package_msg(struct ng_mesg *msg, int flags)
{
item_p item;
if ((item = ng_alloc_item(NGQF_MESG, flags)) == NULL) {
NG_FREE_MSG(msg);
return (NULL);
}
ITEM_DEBUG_CHECKS;
/* Messages items count as writers unless explicitly exempted. */
if (msg->header.cmd & NGM_READONLY)
item->el_flags |= NGQF_READER;
else
item->el_flags |= NGQF_WRITER;
/*
* Set the current lasthook into the queue item
*/
NGI_MSG(item) = msg;
NGI_RETADDR(item) = 0;
return (item);
}
#define SET_RETADDR(item, here, retaddr) \
do { /* Data or fn items don't have retaddrs */ \
if ((item->el_flags & NGQF_TYPE) == NGQF_MESG) { \
if (retaddr) { \
NGI_RETADDR(item) = retaddr; \
} else { \
/* \
* The old return address should be ok. \
* If there isn't one, use the address \
* here. \
*/ \
if (NGI_RETADDR(item) == 0) { \
NGI_RETADDR(item) \
= ng_node2ID(here); \
} \
} \
} \
} while (0)
int
ng_address_hook(node_p here, item_p item, hook_p hook, ng_ID_t retaddr)
{
hook_p peer;
node_p peernode;
ITEM_DEBUG_CHECKS;
/*
* Quick sanity check..
* Since a hook holds a reference on its node, once we know
* that the peer is still connected (even if invalid,) we know
* that the peer node is present, though maybe invalid.
*/
if ((hook == NULL) ||
NG_HOOK_NOT_VALID(hook) ||
NG_HOOK_NOT_VALID(peer = NG_HOOK_PEER(hook)) ||
NG_NODE_NOT_VALID(peernode = NG_PEER_NODE(hook))) {
NG_FREE_ITEM(item);
TRAP_ERROR();
return (ENETDOWN);
}
/*
* Transfer our interest to the other (peer) end.
*/
NG_HOOK_REF(peer);
NG_NODE_REF(peernode);
NGI_SET_HOOK(item, peer);
NGI_SET_NODE(item, peernode);
SET_RETADDR(item, here, retaddr);
return (0);
}
int
ng_address_path(node_p here, item_p item, char *address, ng_ID_t retaddr)
{
node_p dest = NULL;
hook_p hook = NULL;
int error;
ITEM_DEBUG_CHECKS;
/*
* Note that ng_path2noderef increments the reference count
* on the node for us if it finds one. So we don't have to.
*/
error = ng_path2noderef(here, address, &dest, &hook);
if (error) {
NG_FREE_ITEM(item);
return (error);
}
NGI_SET_NODE(item, dest);
if (hook) {
NG_HOOK_REF(hook); /* don't let it go while on the queue */
NGI_SET_HOOK(item, hook);
}
SET_RETADDR(item, here, retaddr);
return (0);
}
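/*
 * Hypothetical sketch: sending a control message by ASCII path rather
 * than by hook, roughly what the NG_SEND_MSG_PATH() macro does.  The
 * caller supplies an already-built ng_mesg.
 *
 * static int
 * example_msg_by_path(node_p here, struct ng_mesg *msg, char *path)
 * {
 * 	item_p item;
 * 	int error;
 *
 * 	if ((item = ng_package_msg(msg, NG_NOFLAGS)) == NULL)
 * 		return (ENOMEM);	(the msg was freed for us)
 * 	if ((error = ng_address_path(here, item, path, 0)) != 0)
 * 		return (error);		(the item was freed for us)
 * 	return (ng_snd_item(item, 0));
 * }
 */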
int
ng_address_ID(node_p here, item_p item, ng_ID_t ID, ng_ID_t retaddr)
{
node_p dest;
ITEM_DEBUG_CHECKS;
/*
* Find the target node.
*/
dest = ng_ID2noderef(ID); /* GETS REFERENCE! */
if (dest == NULL) {
NG_FREE_ITEM(item);
TRAP_ERROR();
return (EINVAL);
}
/* Fill out the contents */
NGI_SET_NODE(item, dest);
NGI_CLR_HOOK(item);
SET_RETADDR(item, here, retaddr);
return (0);
}
/*
* Special case to send a message to self (e.g. destroy node).
* Possibly indicate an arrival hook too.
* Useful for removing that hook :-)
*/
item_p
ng_package_msg_self(node_p here, hook_p hook, struct ng_mesg *msg)
{
item_p item;
/*
* The target node is the local node ("here").
* If there is a HOOK argument, record it as the
* arrival hook for the message.
*/
if ((item = ng_alloc_item(NGQF_MESG, NG_NOFLAGS)) == NULL) {
NG_FREE_MSG(msg);
return (NULL);
}
/* Fill out the contents */
item->el_flags |= NGQF_WRITER;
NG_NODE_REF(here);
NGI_SET_NODE(item, here);
if (hook) {
NG_HOOK_REF(hook);
NGI_SET_HOOK(item, hook);
}
NGI_MSG(item) = msg;
NGI_RETADDR(item) = ng_node2ID(here);
return (item);
}
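/*
 * Hypothetical sketch: queueing a control message to ourselves, e.g.
 * to have it processed under the usual netgraph locking rather than
 * in the current context.  "msg" is a previously built ng_mesg; it is
 * freed for us on allocation failure.
 *
 * item_p item;
 *
 * if ((item = ng_package_msg_self(here, hook, msg)) == NULL)
 * 	return (ENOMEM);
 * return (ng_snd_item(item, 0));
 */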
/*
* Send ng_item_fn function call to the specified node.
*/
int
ng_send_fn(node_p node, hook_p hook, ng_item_fn *fn, void * arg1, int arg2)
{
return ng_send_fn1(node, hook, fn, arg1, arg2, NG_NOFLAGS);
}
int
ng_send_fn1(node_p node, hook_p hook, ng_item_fn *fn, void * arg1, int arg2,
int flags)
{
item_p item;
if ((item = ng_alloc_item(NGQF_FN, flags)) == NULL) {
return (ENOMEM);
}
item->el_flags |= NGQF_WRITER;
NG_NODE_REF(node); /* and one for the item */
NGI_SET_NODE(item, node);
if (hook) {
NG_HOOK_REF(hook);
NGI_SET_HOOK(item, hook);
}
NGI_FN(item) = fn;
NGI_ARG1(item) = arg1;
NGI_ARG2(item) = arg2;
return (ng_snd_item(item, flags));
}
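/*
 * Hypothetical sketch (names are illustrative): deferring work into a
 * node's writer context.  ng_eiface_start() in ng_eiface.c, later in
 * this change, uses this pattern to run its transmit routine under
 * netgraph locking.
 *
 * static void
 * example_deferred(node_p node, hook_p hook, void *arg1, int arg2)
 * {
 * 	... runs exclusively (as a writer) in node's context ...
 * }
 *
 * error = ng_send_fn(node, NULL, &example_deferred, softc, 0);
 */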
/*
* Send ng_item_fn2 function call to the specified node.
*
* If an optional pitem parameter is supplied, its apply
* callback will be copied to the new item.  If the NG_REUSE_ITEM
* flag is also set, no new item will be allocated; pitem itself
* will be reused.
*/
int
ng_send_fn2(node_p node, hook_p hook, item_p pitem, ng_item_fn2 *fn, void *arg1,
int arg2, int flags)
{
item_p item;
KASSERT((pitem != NULL || (flags & NG_REUSE_ITEM) == 0),
("%s: NG_REUSE_ITEM but no pitem", __func__));
/*
* Allocate a new item if none was supplied, or
* if we can't reuse the supplied one.
*/
if (pitem == NULL || (flags & NG_REUSE_ITEM) == 0) {
if ((item = ng_alloc_item(NGQF_FN2, flags)) == NULL)
return (ENOMEM);
if (pitem != NULL)
item->apply = pitem->apply;
} else {
if ((item = ng_realloc_item(pitem, NGQF_FN2, flags)) == NULL)
return (ENOMEM);
}
item->el_flags = (item->el_flags & ~NGQF_RW) | NGQF_WRITER;
NG_NODE_REF(node); /* and one for the item */
NGI_SET_NODE(item, node);
if (hook) {
NG_HOOK_REF(hook);
NGI_SET_HOOK(item, hook);
}
NGI_FN2(item) = fn;
NGI_ARG1(item) = arg1;
NGI_ARG2(item) = arg2;
return (ng_snd_item(item, flags));
}
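/*
 * Hypothetical sketch (assuming the ng_item_fn2 callback receives the
 * item being processed): such a callback can re-target its own item
 * at another node without a second allocation by passing it back with
 * NG_REUSE_ITEM.  "other_node" is an assumed node pointer.
 *
 * static void
 * example_fn2(node_p node, hook_p hook, item_p item, int arg2)
 * {
 * 	...
 * 	ng_send_fn2(other_node, NULL, item, &example_fn2, NULL, 0,
 * 	    NG_REUSE_ITEM);
 * }
 */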
/*
* Official timeout routines for Netgraph nodes.
*/
static void
ng_callout_trampoline(void *arg)
{
item_p item = arg;
ng_snd_item(item, 0);
}
int
ng_callout(struct callout *c, node_p node, hook_p hook, int ticks,
ng_item_fn *fn, void * arg1, int arg2)
{
item_p item, oitem;
if ((item = ng_alloc_item(NGQF_FN, NG_NOFLAGS)) == NULL)
return (ENOMEM);
item->el_flags |= NGQF_WRITER;
NG_NODE_REF(node); /* and one for the item */
NGI_SET_NODE(item, node);
if (hook) {
NG_HOOK_REF(hook);
NGI_SET_HOOK(item, hook);
}
NGI_FN(item) = fn;
NGI_ARG1(item) = arg1;
NGI_ARG2(item) = arg2;
oitem = c->c_arg;
if (callout_reset(c, ticks, &ng_callout_trampoline, item) == 1 &&
oitem != NULL)
NG_FREE_ITEM(oitem);
return (0);
}
/* A special modified version of untimeout() */
int
ng_uncallout(struct callout *c, node_p node)
{
item_p item;
int rval;
KASSERT(c != NULL, ("ng_uncallout: NULL callout"));
KASSERT(node != NULL, ("ng_uncallout: NULL node"));
rval = callout_stop(c);
item = c->c_arg;
/* Do an extra check */
if ((rval > 0) && (c->c_func == &ng_callout_trampoline) &&
(NGI_NODE(item) == node)) {
/*
* We successfully removed it from the queue before it ran,
* so now we need to unreference everything that was
* given extra references (NG_FREE_ITEM does this).
*/
NG_FREE_ITEM(item);
}
c->c_arg = NULL;
return (rval);
}
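/*
 * Hypothetical sketch: a node keeps a struct callout in its private
 * data, arms it with ng_callout() and disarms it with ng_uncallout();
 * the callback then runs as a writer in the node's context.  "sc" and
 * "example_timeout" are illustrative names; ng_callout_init() is
 * assumed to be the usual callout_init() wrapper.
 *
 * ng_callout_init(&sc->timer);
 * error = ng_callout(&sc->timer, node, NULL, hz, example_timeout, sc, 0);
 * ...
 * (void)ng_uncallout(&sc->timer, node);
 */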
/*
* Set the return address; if none is given, use the node here.
*/
void
ng_replace_retaddr(node_p here, item_p item, ng_ID_t retaddr)
{
if (retaddr) {
NGI_RETADDR(item) = retaddr;
} else {
/*
* The old return address should be ok.
* If there isn't one, use the address here.
*/
NGI_RETADDR(item) = ng_node2ID(here);
}
}
#define TESTING
#ifdef TESTING
/* just test all the macros */
void
ng_macro_test(item_p item);
void
ng_macro_test(item_p item)
{
node_p node = NULL;
hook_p hook = NULL;
struct mbuf *m;
struct ng_mesg *msg;
ng_ID_t retaddr;
int error;
NGI_GET_M(item, m);
NGI_GET_MSG(item, msg);
retaddr = NGI_RETADDR(item);
NG_SEND_DATA(error, hook, m, NULL);
NG_SEND_DATA_ONLY(error, hook, m);
NG_FWD_NEW_DATA(error, item, hook, m);
NG_FWD_ITEM_HOOK(error, item, hook);
NG_SEND_MSG_HOOK(error, node, msg, hook, retaddr);
NG_SEND_MSG_ID(error, node, msg, retaddr, retaddr);
NG_SEND_MSG_PATH(error, node, msg, ".:", retaddr);
NG_FWD_MSG_HOOK(error, node, item, hook, retaddr);
}
#endif /* TESTING */
Index: head/sys/netgraph/ng_eiface.c
===================================================================
--- head/sys/netgraph/ng_eiface.c (revision 183549)
+++ head/sys/netgraph/ng_eiface.c (revision 183550)
@@ -1,592 +1,600 @@
/*-
*
* Copyright (c) 1999-2001, Vitaly V Belekhov
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <netgraph/ng_message.h>
#include <netgraph/netgraph.h>
#include <netgraph/ng_parse.h>
#include <netgraph/ng_eiface.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if_arp.h>
static const struct ng_cmdlist ng_eiface_cmdlist[] = {
{
NGM_EIFACE_COOKIE,
NGM_EIFACE_GET_IFNAME,
"getifname",
NULL,
&ng_parse_string_type
},
{
NGM_EIFACE_COOKIE,
NGM_EIFACE_SET,
"set",
&ng_parse_enaddr_type,
NULL
},
{ 0 }
};
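/*
 * Example (hypothetical ngctl(8) session): with the ASCII forms above,
 * and assuming the node has been given the name "ngeth0", the commands
 * can be exercised from userland as:
 *
 * ngctl msg ngeth0: getifname
 * ngctl msg ngeth0: set 00:1c:42:00:00:01
 *
 * The MAC address shown is an arbitrary example.
 */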
/* Node private data */
struct ng_eiface_private {
struct ifnet *ifp; /* per-interface network data */
int unit; /* Interface unit number */
node_p node; /* Our netgraph node */
hook_p ether; /* Hook for ethernet stream */
};
typedef struct ng_eiface_private *priv_p;
/* Interface methods */
static void ng_eiface_init(void *xsc);
static void ng_eiface_start(struct ifnet *ifp);
static int ng_eiface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
#ifdef DEBUG
static void ng_eiface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data);
#endif
/* Netgraph methods */
static int ng_eiface_mod_event(module_t, int, void *);
static ng_constructor_t ng_eiface_constructor;
static ng_rcvmsg_t ng_eiface_rcvmsg;
static ng_shutdown_t ng_eiface_rmnode;
static ng_newhook_t ng_eiface_newhook;
static ng_rcvdata_t ng_eiface_rcvdata;
static ng_disconnect_t ng_eiface_disconnect;
/* Node type descriptor */
static struct ng_type typestruct = {
.version = NG_ABI_VERSION,
.name = NG_EIFACE_NODE_TYPE,
.mod_event = ng_eiface_mod_event,
.constructor = ng_eiface_constructor,
.rcvmsg = ng_eiface_rcvmsg,
.shutdown = ng_eiface_rmnode,
.newhook = ng_eiface_newhook,
.rcvdata = ng_eiface_rcvdata,
.disconnect = ng_eiface_disconnect,
.cmdlist = ng_eiface_cmdlist
};
NETGRAPH_INIT(eiface, &typestruct);
static struct unrhdr *ng_eiface_unit;
/************************************************************************
INTERFACE STUFF
************************************************************************/
/*
* Process an ioctl for the virtual interface
*/
static int
ng_eiface_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct ifreq *const ifr = (struct ifreq *)data;
int s, error = 0;
#ifdef DEBUG
ng_eiface_print_ioctl(ifp, command, data);
#endif
s = splimp();
switch (command) {
/* These two are mostly handled at a higher layer */
case SIOCSIFADDR:
error = ether_ioctl(ifp, command, data);
break;
case SIOCGIFADDR:
break;
/* Set flags */
case SIOCSIFFLAGS:
/*
* If the interface is marked up and stopped, then start it.
* If it is marked down and running, then stop it.
*/
if (ifp->if_flags & IFF_UP) {
if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE);
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
} else {
if (ifp->if_drv_flags & IFF_DRV_RUNNING)
ifp->if_drv_flags &= ~(IFF_DRV_RUNNING |
IFF_DRV_OACTIVE);
}
break;
/* Set the interface MTU */
case SIOCSIFMTU:
if (ifr->ifr_mtu > NG_EIFACE_MTU_MAX ||
ifr->ifr_mtu < NG_EIFACE_MTU_MIN)
error = EINVAL;
else
ifp->if_mtu = ifr->ifr_mtu;
break;
/* Stuff that's not supported */
case SIOCADDMULTI:
case SIOCDELMULTI:
error = 0;
break;
case SIOCSIFPHYS:
error = EOPNOTSUPP;
break;
default:
error = EINVAL;
break;
}
splx(s);
return (error);
}
static void
ng_eiface_init(void *xsc)
{
priv_p sc = xsc;
struct ifnet *ifp = sc->ifp;
int s;
s = splimp();
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
splx(s);
}
/*
* We simply relay the packet to the "ether" hook, if it is connected.
* We have been through the netgraph locking and are guaranteed to
* be the only code running in this node at this time.
*/
static void
ng_eiface_start2(node_p node, hook_p hook, void *arg1, int arg2)
{
struct ifnet *ifp = arg1;
const priv_p priv = (priv_p)ifp->if_softc;
int error = 0;
struct mbuf *m;
/* Check interface flags */
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)))
return;
for (;;) {
/*
* Grab a packet to transmit.
*/
IF_DEQUEUE(&ifp->if_snd, m);
/* If there's nothing to send, break. */
if (m == NULL)
break;
/*
* Berkeley packet filter.
* Pass packet to bpf if there is a listener.
* XXX is this safe? locking?
*/
BPF_MTAP(ifp, m);
if (ifp->if_flags & IFF_MONITOR) {
ifp->if_ipackets++;
m_freem(m);
continue;
}
/*
* Send packet; if hook is not connected, mbuf will get
* freed.
*/
NG_SEND_DATA_ONLY(error, priv->ether, m);
/* Update stats */
if (error == 0)
ifp->if_opackets++;
else
ifp->if_oerrors++;
}
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
return;
}
/*
* This routine is called to deliver a packet out the interface.
* We simply queue the netgraph version (ng_eiface_start2) to be
* called when netgraph locking allows it to happen.
* Until we know what the rest of the networking code is doing for
* locking, we don't know how we will interact with it.
* Take comfort from the fact that the ifnet struct is part of our
* private info and can't go away while we are queued.
* [Though we don't know it is still there now....]
* It is possible we gain nothing from this, because we would
* like to get the mbuf and queue it as data somehow, but we
* can't, and it is not clear that doing so would solve anything.
*/
static void
ng_eiface_start(struct ifnet *ifp)
{
const priv_p priv = (priv_p)ifp->if_softc;
/* Don't do anything if output is active */
if (ifp->if_drv_flags & IFF_DRV_OACTIVE)
return;
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
if (ng_send_fn(priv->node, NULL, &ng_eiface_start2, ifp, 0) != 0)
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
}
#ifdef DEBUG
/*
* Display an ioctl to the virtual interface
*/
static void
ng_eiface_print_ioctl(struct ifnet *ifp, int command, caddr_t data)
{
char *str;
switch (command & IOC_DIRMASK) {
case IOC_VOID:
str = "IO";
break;
case IOC_OUT:
str = "IOR";
break;
case IOC_IN:
str = "IOW";
break;
case IOC_INOUT:
str = "IORW";
break;
default:
str = "IO??";
}
log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n",
ifp->if_xname,
str,
IOCGROUP(command),
command & 0xff,
IOCPARM_LEN(command));
}
#endif /* DEBUG */
/************************************************************************
NETGRAPH NODE STUFF
************************************************************************/
/*
* Constructor for a node
*/
static int
ng_eiface_constructor(node_p node)
{
+ INIT_VNET_NETGRAPH(curvnet);
struct ifnet *ifp;
priv_p priv;
u_char eaddr[6] = {0,0,0,0,0,0};
/* Allocate node and interface private structures */
MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH, M_NOWAIT | M_ZERO);
if (priv == NULL)
return (ENOMEM);
ifp = priv->ifp = if_alloc(IFT_ETHER);
if (ifp == NULL) {
free(priv, M_NETGRAPH);
return (ENOSPC);
}
/* Link them together */
ifp->if_softc = priv;
/* Get an interface unit number */
priv->unit = alloc_unr(V_ng_eiface_unit);
/* Link together node and private info */
NG_NODE_SET_PRIVATE(node, priv);
priv->node = node;
/* Initialize interface structure */
if_initname(ifp, NG_EIFACE_EIFACE_NAME, priv->unit);
ifp->if_init = ng_eiface_init;
ifp->if_output = ether_output;
ifp->if_start = ng_eiface_start;
ifp->if_ioctl = ng_eiface_ioctl;
ifp->if_watchdog = NULL;
ifp->if_snd.ifq_maxlen = IFQ_MAXLEN;
ifp->if_flags = (IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST);
#if 0
/* Give this node name */
bzero(ifname, sizeof(ifname));
sprintf(ifname, "if%s", ifp->if_xname);
(void)ng_name_node(node, ifname);
#endif
/* Attach the interface */
ether_ifattach(ifp, eaddr);
/* Done */
return (0);
}
/*
* Give our ok for a hook to be added
*/
static int
ng_eiface_newhook(node_p node, hook_p hook, const char *name)
{
priv_p priv = NG_NODE_PRIVATE(node);
struct ifnet *ifp = priv->ifp;
if (strcmp(name, NG_EIFACE_HOOK_ETHER))
return (EPFNOSUPPORT);
if (priv->ether != NULL)
return (EISCONN);
priv->ether = hook;
NG_HOOK_SET_PRIVATE(hook, &priv->ether);
if_link_state_change(ifp, LINK_STATE_UP);
return (0);
}
/*
* Receive a control message
*/
static int
ng_eiface_rcvmsg(node_p node, item_p item, hook_p lasthook)
{
const priv_p priv = NG_NODE_PRIVATE(node);
struct ifnet *const ifp = priv->ifp;
struct ng_mesg *resp = NULL;
int error = 0;
struct ng_mesg *msg;
NGI_GET_MSG(item, msg);
switch (msg->header.typecookie) {
case NGM_EIFACE_COOKIE:
switch (msg->header.cmd) {
case NGM_EIFACE_SET:
{
if (msg->header.arglen != ETHER_ADDR_LEN) {
error = EINVAL;
break;
}
error = if_setlladdr(priv->ifp,
(u_char *)msg->data, ETHER_ADDR_LEN);
break;
}
case NGM_EIFACE_GET_IFNAME:
NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
strlcpy(resp->data, ifp->if_xname, IFNAMSIZ);
break;
case NGM_EIFACE_GET_IFADDRS:
{
struct ifaddr *ifa;
caddr_t ptr;
int buflen;
#define SA_SIZE(s) ((s)->sa_len < sizeof(*(s)) ? sizeof(*(s)) : (s)->sa_len)
/* Determine size of response and allocate it */
buflen = 0;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
buflen += SA_SIZE(ifa->ifa_addr);
NG_MKRESPONSE(resp, msg, buflen, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
/* Add addresses */
ptr = resp->data;
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
const int len = SA_SIZE(ifa->ifa_addr);
if (buflen < len) {
log(LOG_ERR, "%s: len changed?\n",
ifp->if_xname);
break;
}
bcopy(ifa->ifa_addr, ptr, len);
ptr += len;
buflen -= len;
}
break;
#undef SA_SIZE
}
default:
error = EINVAL;
break;
} /* end of inner switch() */
break;
case NGM_FLOW_COOKIE:
switch (msg->header.cmd) {
case NGM_LINK_IS_UP:
if_link_state_change(ifp, LINK_STATE_UP);
break;
case NGM_LINK_IS_DOWN:
if_link_state_change(ifp, LINK_STATE_DOWN);
break;
default:
break;
}
break;
default:
error = EINVAL;
break;
}
NG_RESPOND_MSG(error, node, item, resp);
NG_FREE_MSG(msg);
return (error);
}
/*
* Receive data from a hook. Pass the packet to the ether_input routine.
*/
static int
ng_eiface_rcvdata(hook_p hook, item_p item)
{
const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
struct ifnet *const ifp = priv->ifp;
struct mbuf *m;
NGI_GET_M(item, m);
NG_FREE_ITEM(item);
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING))) {
NG_FREE_M(m);
return (ENETDOWN);
}
if (m->m_len < ETHER_HDR_LEN) {
m = m_pullup(m, ETHER_HDR_LEN);
if (m == NULL)
return (EINVAL);
}
/* Note receiving interface */
m->m_pkthdr.rcvif = ifp;
/* Update interface stats */
ifp->if_ipackets++;
(*ifp->if_input)(ifp, m);
/* Done */
return (0);
}
/*
* Shutdown processing.
*/
static int
ng_eiface_rmnode(node_p node)
{
+ INIT_VNET_NETGRAPH(curvnet);
const priv_p priv = NG_NODE_PRIVATE(node);
struct ifnet *const ifp = priv->ifp;
+ /*
+ * the ifnet may be in a different vnet than the netgraph node,
+ * hence we have to change the current vnet context here.
+ */
+ CURVNET_SET_QUIET(ifp->if_vnet);
ether_ifdetach(ifp);
if_free(ifp);
+ CURVNET_RESTORE();
free_unr(V_ng_eiface_unit, priv->unit);
FREE(priv, M_NETGRAPH);
NG_NODE_SET_PRIVATE(node, NULL);
NG_NODE_UNREF(node);
return (0);
}
/*
* Hook disconnection
*/
static int
ng_eiface_disconnect(hook_p hook)
{
const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
priv->ether = NULL;
return (0);
}
/*
* Handle loading and unloading for this node type.
*/
static int
ng_eiface_mod_event(module_t mod, int event, void *data)
{
int error = 0;
switch (event) {
case MOD_LOAD:
V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL);
break;
case MOD_UNLOAD:
delete_unrhdr(V_ng_eiface_unit);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
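/*
 * Example (hypothetical userland session): a new eiface node, and with
 * it a new ngethN interface, is typically created with ngctl(8):
 *
 * ngctl mkpeer eiface ether ether
 *
 * Disconnecting the hook later does not destroy the node (see
 * ng_eiface_disconnect() above); an explicit shutdown message does.
 */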
Index: head/sys/netgraph/ng_gif.c
===================================================================
--- head/sys/netgraph/ng_gif.c (revision 183549)
+++ head/sys/netgraph/ng_gif.c (revision 183550)
@@ -1,597 +1,605 @@
/*
* ng_gif.c
*/
/*-
* Copyright 2001 The Aerospace Corporation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of The Aerospace Corporation may not be used to endorse or
* promote products derived from this software.
*
* THIS SOFTWARE IS PROVIDED BY THE AEROSPACE CORPORATION ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AEROSPACE CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* Copyright (c) 1996-2000 Whistle Communications, Inc.
* All rights reserved.
*
* Subject to the following obligations and disclaimer of warranty, use and
* redistribution of this software, in source or object code forms, with or
* without modifications are expressly permitted by Whistle Communications;
* provided, however, that:
* 1. Any and all reproductions of the source or object code must include the
* copyright notice above and the following disclaimer of warranties; and
* 2. No rights are granted, in any manner or form, to use Whistle
* Communications, Inc. trademarks, including the mark "WHISTLE
* COMMUNICATIONS" on advertising, endorsements, or otherwise except as
* such appears in the above copyright notice or in the software.
*
* THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
* TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
* REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
* INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
* REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
* SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
* IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
* RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
* WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
* OF SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* ng_gif(4) netgraph node type
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/socket.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_gif.h>
#include <netgraph/ng_message.h>
#include <netgraph/netgraph.h>
#include <netgraph/ng_parse.h>
#include <netgraph/ng_gif.h>
#define IFP2NG(ifp) ((struct ng_node *)((struct gif_softc *)(ifp->if_softc))->gif_netgraph)
#define IFP2NG_SET(ifp, val) (((struct gif_softc *)(ifp->if_softc))->gif_netgraph = (val))
/* Per-node private data */
struct private {
struct ifnet *ifp; /* associated interface */
hook_p lower; /* lower OR orphan hook connection */
u_char lowerOrphan; /* whether lower is lower or orphan */
};
typedef struct private *priv_p;
/* Functional hooks called from if_gif.c */
static void ng_gif_input(struct ifnet *ifp, struct mbuf **mp, int af);
static void ng_gif_input_orphan(struct ifnet *ifp, struct mbuf *m, int af);
static void ng_gif_attach(struct ifnet *ifp);
static void ng_gif_detach(struct ifnet *ifp);
/* Other functions */
static void ng_gif_input2(node_p node, struct mbuf **mp, int af);
static int ng_gif_glue_af(struct mbuf **mp, int af);
static int ng_gif_rcv_lower(node_p node, struct mbuf *m);
/* Netgraph node methods */
static ng_constructor_t ng_gif_constructor;
static ng_rcvmsg_t ng_gif_rcvmsg;
static ng_shutdown_t ng_gif_shutdown;
static ng_newhook_t ng_gif_newhook;
static ng_connect_t ng_gif_connect;
static ng_rcvdata_t ng_gif_rcvdata;
static ng_disconnect_t ng_gif_disconnect;
static int ng_gif_mod_event(module_t mod, int event, void *data);
/* List of commands and how to convert arguments to/from ASCII */
static const struct ng_cmdlist ng_gif_cmdlist[] = {
{
NGM_GIF_COOKIE,
NGM_GIF_GET_IFNAME,
"getifname",
NULL,
&ng_parse_string_type
},
{
NGM_GIF_COOKIE,
NGM_GIF_GET_IFINDEX,
"getifindex",
NULL,
&ng_parse_int32_type
},
{ 0 }
};
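/*
 * Example (hypothetical ngctl(8) session): ng_gif nodes are named
 * after their interfaces (see ng_gif_attach() below), so the commands
 * above can be exercised as, e.g.:
 *
 * ngctl msg gif0: getifname
 * ngctl msg gif0: getifindex
 */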
static struct ng_type ng_gif_typestruct = {
.version = NG_ABI_VERSION,
.name = NG_GIF_NODE_TYPE,
.mod_event = ng_gif_mod_event,
.constructor = ng_gif_constructor,
.rcvmsg = ng_gif_rcvmsg,
.shutdown = ng_gif_shutdown,
.newhook = ng_gif_newhook,
.connect = ng_gif_connect,
.rcvdata = ng_gif_rcvdata,
.disconnect = ng_gif_disconnect,
.cmdlist = ng_gif_cmdlist,
};
MODULE_DEPEND(ng_gif, if_gif, 1,1,1);
NETGRAPH_INIT(gif, &ng_gif_typestruct);
/******************************************************************
GIF FUNCTION HOOKS
******************************************************************/
/*
* Handle a packet that has come in on an interface. We get to
* look at it here before any upper layer protocols do.
*
* NOTE: this function will get called at splimp()
*/
static void
ng_gif_input(struct ifnet *ifp, struct mbuf **mp, int af)
{
const node_p node = IFP2NG(ifp);
const priv_p priv = NG_NODE_PRIVATE(node);
/* If "lower" hook not connected, let packet continue */
if (priv->lower == NULL || priv->lowerOrphan)
return;
ng_gif_input2(node, mp, af);
}
/*
* Handle a packet that has come in on an interface, and which
* does not match any of our known protocols (an ``orphan'').
*
* NOTE: this function will get called at splimp()
*/
static void
ng_gif_input_orphan(struct ifnet *ifp, struct mbuf *m, int af)
{
const node_p node = IFP2NG(ifp);
const priv_p priv = NG_NODE_PRIVATE(node);
/* If "orphan" hook not connected, let packet continue */
if (priv->lower == NULL || !priv->lowerOrphan) {
m_freem(m);
return;
}
ng_gif_input2(node, &m, af);
if (m != NULL)
m_freem(m);
}
/*
* Handle a packet that has come in on a gif interface.
* Attach the address family to the mbuf for later use.
*
* NOTE: this function will get called at splimp()
*/
static void
ng_gif_input2(node_p node, struct mbuf **mp, int af)
{
const priv_p priv = NG_NODE_PRIVATE(node);
int error;
/* Glue address family on */
if ((error = ng_gif_glue_af(mp, af)) != 0)
return;
/* Send out lower/orphan hook */
NG_SEND_DATA_ONLY(error, priv->lower, *mp);
*mp = NULL;
}
/*
* A new gif interface has been attached.
* Create a new node for it, etc.
*/
static void
ng_gif_attach(struct ifnet *ifp)
{
priv_p priv;
node_p node;
/* Create node */
KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__));
if (ng_make_node_common(&ng_gif_typestruct, &node) != 0) {
log(LOG_ERR, "%s: can't %s for %s\n",
__func__, "create node", ifp->if_xname);
return;
}
/* Allocate private data */
MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH, M_NOWAIT | M_ZERO);
if (priv == NULL) {
log(LOG_ERR, "%s: can't %s for %s\n",
__func__, "allocate memory", ifp->if_xname);
NG_NODE_UNREF(node);
return;
}
NG_NODE_SET_PRIVATE(node, priv);
priv->ifp = ifp;
IFP2NG_SET(ifp, node);
/* Try to give the node the same name as the interface */
if (ng_name_node(node, ifp->if_xname) != 0) {
log(LOG_WARNING, "%s: can't name node %s\n",
__func__, ifp->if_xname);
}
}
/*
* An interface is being detached.
* REALLY Destroy its node.
*/
static void
ng_gif_detach(struct ifnet *ifp)
{
const node_p node = IFP2NG(ifp);
priv_p priv;
if (node == NULL) /* no node (why not?), ignore */
return;
priv = NG_NODE_PRIVATE(node);
NG_NODE_REALLY_DIE(node); /* Force real removal of node */
/*
* We can't assume the ifnet is still around when we run shutdown,
* so zap it now.  XXX We HOPE that anything running at this time
* handles it (as it should in the non-netgraph case).
*/
IFP2NG_SET(ifp, NULL);
priv->ifp = NULL; /* XXX race if we interrupted an output packet */
ng_rmnode_self(node); /* remove all netgraph parts */
}
/*
* Optimization for gluing the address family onto
* the front of an incoming packet.
*/
static int
ng_gif_glue_af(struct mbuf **mp, int af)
{
struct mbuf *m = *mp;
int error = 0;
sa_family_t tmp_af;
tmp_af = (sa_family_t) af;
/*
* XXX: should try to bring back some of the optimizations from
* ng_ether.c
*/
/*
* Doing anything more is likely to cost more
* than it's worth; it's probable that everything
* else is in one big lump.  The next node will do
* an m_pullup() for exactly the amount of data it
* needs, and hopefully everything after that will
* not need one.  So let's just use M_PREPEND.
*/
M_PREPEND(m, sizeof (tmp_af), M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto done;
}
#if 0
copy:
#endif
/* Copy header and return (possibly new) mbuf */
*mtod(m, sa_family_t *) = tmp_af;
#if 0
bcopy((caddr_t)&tmp_af, mtod(m, sa_family_t *), sizeof(tmp_af));
#endif
done:
*mp = m;
return error;
}
/******************************************************************
NETGRAPH NODE METHODS
******************************************************************/
/*
* It is not possible or allowable to create a node of this type.
* Nodes get created when the interface is attached (or, when
* this node type's KLD is loaded).
*/
static int
ng_gif_constructor(node_p node)
{
return (EINVAL);
}
/*
* Check for attaching a new hook.
*/
static int
ng_gif_newhook(node_p node, hook_p hook, const char *name)
{
const priv_p priv = NG_NODE_PRIVATE(node);
u_char orphan = priv->lowerOrphan;
hook_p *hookptr;
/* Divert hook is an alias for lower */
if (strcmp(name, NG_GIF_HOOK_DIVERT) == 0)
name = NG_GIF_HOOK_LOWER;
/* Which hook? */
if (strcmp(name, NG_GIF_HOOK_LOWER) == 0) {
hookptr = &priv->lower;
orphan = 0;
} else if (strcmp(name, NG_GIF_HOOK_ORPHAN) == 0) {
hookptr = &priv->lower;
orphan = 1;
} else
return (EINVAL);
/* Check if already connected (shouldn't be, but doesn't hurt) */
if (*hookptr != NULL)
return (EISCONN);
/* OK */
*hookptr = hook;
priv->lowerOrphan = orphan;
return (0);
}
/*
* Hooks are attached; adjust to force queueing.
* We don't really care which hook it is:
* they should all be queueing for outgoing data.
*/
static int
ng_gif_connect(hook_p hook)
{
NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook));
return (0);
}
/*
* Receive an incoming control message.
*/
static int
ng_gif_rcvmsg(node_p node, item_p item, hook_p lasthook)
{
const priv_p priv = NG_NODE_PRIVATE(node);
struct ng_mesg *resp = NULL;
int error = 0;
struct ng_mesg *msg;
NGI_GET_MSG(item, msg);
switch (msg->header.typecookie) {
case NGM_GIF_COOKIE:
switch (msg->header.cmd) {
case NGM_GIF_GET_IFNAME:
NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
strlcpy(resp->data, priv->ifp->if_xname, IFNAMSIZ);
break;
case NGM_GIF_GET_IFINDEX:
NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
*((u_int32_t *)resp->data) = priv->ifp->if_index;
break;
default:
error = EINVAL;
break;
}
break;
default:
error = EINVAL;
break;
}
NG_RESPOND_MSG(error, node, item, resp);
NG_FREE_MSG(msg);
return (error);
}
/*
* Receive data on a hook.
*/
static int
ng_gif_rcvdata(hook_p hook, item_p item)
{
const node_p node = NG_HOOK_NODE(hook);
const priv_p priv = NG_NODE_PRIVATE(node);
struct mbuf *m;
NGI_GET_M(item, m);
NG_FREE_ITEM(item);
if (hook == priv->lower)
return ng_gif_rcv_lower(node, m);
panic("%s: weird hook", __func__);
}
/*
* Handle an mbuf received on the "lower" hook.
*/
static int
ng_gif_rcv_lower(node_p node, struct mbuf *m)
{
struct sockaddr dst;
const priv_p priv = NG_NODE_PRIVATE(node);
bzero(&dst, sizeof(dst));
/* Make sure header is fully pulled up */
if (m->m_pkthdr.len < sizeof(sa_family_t)) {
NG_FREE_M(m);
return (EINVAL);
}
if (m->m_len < sizeof(sa_family_t)
&& (m = m_pullup(m, sizeof(sa_family_t))) == NULL) {
return (ENOBUFS);
}
dst.sa_family = *mtod(m, sa_family_t *);
m_adj(m, sizeof(sa_family_t));
/* Send it on its way */
/*
* XXX: gif_output only uses dst for the family and passes the
* fourth argument (rt) to in{,6}_gif_output which ignore it.
* If this changes ng_gif will probably break.
*/
return gif_output(priv->ifp, m, &dst, NULL);
}
/*
* Shutdown node. This resets the node but does not remove it
* unless the REALLY_DIE flag is set.
*/
static int
ng_gif_shutdown(node_p node)
{
const priv_p priv = NG_NODE_PRIVATE(node);
if (node->nd_flags & NGF_REALLY_DIE) {
/*
* We came here because the gif interface is being destroyed,
* so stop being persistent.
* Actually undo all the things we did on creation.
* Assume the ifp has already been freed.
*/
NG_NODE_SET_PRIVATE(node, NULL);
FREE(priv, M_NETGRAPH);
NG_NODE_UNREF(node); /* free node itself */
return (0);
}
NG_NODE_REVIVE(node); /* Signal ng_rmnode we are persistent */
return (0);
}
/*
* Hook disconnection.
*/
static int
ng_gif_disconnect(hook_p hook)
{
const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
if (hook == priv->lower) {
priv->lower = NULL;
priv->lowerOrphan = 0;
} else
panic("%s: weird hook", __func__);
if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0)
&& (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))))
ng_rmnode_self(NG_HOOK_NODE(hook)); /* reset node */
return (0);
}
/******************************************************************
INITIALIZATION
******************************************************************/
/*
* Handle loading and unloading for this node type.
*/
static int
ng_gif_mod_event(module_t mod, int event, void *data)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct ifnet *ifp;
int error = 0;
int s;
s = splnet();
switch (event) {
case MOD_LOAD:
/* Register function hooks */
if (ng_gif_attach_p != NULL) {
error = EEXIST;
break;
}
ng_gif_attach_p = ng_gif_attach;
ng_gif_detach_p = ng_gif_detach;
ng_gif_input_p = ng_gif_input;
ng_gif_input_orphan_p = ng_gif_input_orphan;
/* Create nodes for any already-existing gif interfaces */
IFNET_RLOCK();
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
- if (ifp->if_type == IFT_GIF)
- ng_gif_attach(ifp);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET_QUIET(vnet_iter); /* XXX revisit quiet */
+ INIT_VNET_NET(curvnet);
+ TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ if (ifp->if_type == IFT_GIF)
+ ng_gif_attach(ifp);
+ }
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IFNET_RUNLOCK();
break;
case MOD_UNLOAD:
/*
* Note that the base code won't try to unload us until
* all nodes have been removed, and that can't happen
* until all gif interfaces are destroyed. In any
* case, we know there are no nodes left if the action
* is MOD_UNLOAD, so there's no need to detach any nodes.
*
* XXX: what about manual unloads?!?
*/
/* Unregister function hooks */
ng_gif_attach_p = NULL;
ng_gif_detach_p = NULL;
ng_gif_input_p = NULL;
ng_gif_input_orphan_p = NULL;
break;
default:
error = EOPNOTSUPP;
break;
}
splx(s);
return (error);
}
Index: head/sys/netgraph/ng_iface.c
===================================================================
--- head/sys/netgraph/ng_iface.c (revision 183549)
+++ head/sys/netgraph/ng_iface.c (revision 183550)
@@ -1,818 +1,826 @@
/*
* ng_iface.c
*/
/*-
* Copyright (c) 1996-1999 Whistle Communications, Inc.
* All rights reserved.
*
* Subject to the following obligations and disclaimer of warranty, use and
* redistribution of this software, in source or object code forms, with or
* without modifications are expressly permitted by Whistle Communications;
* provided, however, that:
* 1. Any and all reproductions of the source or object code must include the
* copyright notice above and the following disclaimer of warranties; and
* 2. No rights are granted, in any manner or form, to use Whistle
* Communications, Inc. trademarks, including the mark "WHISTLE
* COMMUNICATIONS" on advertising, endorsements, or otherwise except as
* such appears in the above copyright notice or in the software.
*
* THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
* TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
* REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
* INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
* WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
* REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
* SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
* IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
* RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
* WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
* OF SUCH DAMAGE.
*
* Author: Archie Cobbs <archie@freebsd.org>
*
* $FreeBSD$
* $Whistle: ng_iface.c,v 1.33 1999/11/01 09:24:51 julian Exp $
*/
/*
* This node is also a system networking interface. It has
* a hook for each protocol (IP, AppleTalk, IPX, etc). Packets
* are simply relayed between the interface and the hooks.
*
* Interfaces are named ng0, ng1, etc. New nodes take the
* first available interface name.
*
* This node also includes Berkeley packet filter support.
*/
#include "opt_atalk.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipx.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/random.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/libkern.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netgraph/ng_message.h>
#include <netgraph/netgraph.h>
#include <netgraph/ng_parse.h>
#include <netgraph/ng_iface.h>
#include <netgraph/ng_cisco.h>
#ifdef NG_SEPARATE_MALLOC
MALLOC_DEFINE(M_NETGRAPH_IFACE, "netgraph_iface", "netgraph iface node ");
#else
#define M_NETGRAPH_IFACE M_NETGRAPH
#endif
/* This struct describes one address family */
struct iffam {
sa_family_t family; /* Address family */
const char *hookname; /* Name for hook */
};
typedef const struct iffam *iffam_p;
/* List of address families supported by our interface */
static const struct iffam gFamilies[] = {
{ AF_INET, NG_IFACE_HOOK_INET },
{ AF_INET6, NG_IFACE_HOOK_INET6 },
{ AF_APPLETALK, NG_IFACE_HOOK_ATALK },
{ AF_IPX, NG_IFACE_HOOK_IPX },
{ AF_ATM, NG_IFACE_HOOK_ATM },
{ AF_NATM, NG_IFACE_HOOK_NATM },
};
#define NUM_FAMILIES (sizeof(gFamilies) / sizeof(*gFamilies))
/* Node private data */
struct ng_iface_private {
struct ifnet *ifp; /* Our interface */
int unit; /* Interface unit number */
node_p node; /* Our netgraph node */
hook_p hooks[NUM_FAMILIES]; /* Hook for each address family */
};
typedef struct ng_iface_private *priv_p;
/* Interface methods */
static void ng_iface_start(struct ifnet *ifp);
static int ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
static int ng_iface_output(struct ifnet *ifp, struct mbuf *m0,
struct sockaddr *dst, struct rtentry *rt0);
static void ng_iface_bpftap(struct ifnet *ifp,
struct mbuf *m, sa_family_t family);
static int ng_iface_send(struct ifnet *ifp, struct mbuf *m,
sa_family_t sa);
#ifdef DEBUG
static void ng_iface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data);
#endif
/* Netgraph methods */
static int ng_iface_mod_event(module_t, int, void *);
static ng_constructor_t ng_iface_constructor;
static ng_rcvmsg_t ng_iface_rcvmsg;
static ng_shutdown_t ng_iface_shutdown;
static ng_newhook_t ng_iface_newhook;
static ng_rcvdata_t ng_iface_rcvdata;
static ng_disconnect_t ng_iface_disconnect;
/* Helper stuff */
static iffam_p get_iffam_from_af(sa_family_t family);
static iffam_p get_iffam_from_hook(priv_p priv, hook_p hook);
static iffam_p get_iffam_from_name(const char *name);
static hook_p *get_hook_from_iffam(priv_p priv, iffam_p iffam);
/* Parse type for struct ng_cisco_ipaddr */
static const struct ng_parse_struct_field ng_cisco_ipaddr_type_fields[]
= NG_CISCO_IPADDR_TYPE_INFO;
static const struct ng_parse_type ng_cisco_ipaddr_type = {
&ng_parse_struct_type,
&ng_cisco_ipaddr_type_fields
};
/* List of commands and how to convert arguments to/from ASCII */
static const struct ng_cmdlist ng_iface_cmds[] = {
{
NGM_IFACE_COOKIE,
NGM_IFACE_GET_IFNAME,
"getifname",
NULL,
&ng_parse_string_type
},
{
NGM_IFACE_COOKIE,
NGM_IFACE_POINT2POINT,
"point2point",
NULL,
NULL
},
{
NGM_IFACE_COOKIE,
NGM_IFACE_BROADCAST,
"broadcast",
NULL,
NULL
},
{
NGM_CISCO_COOKIE,
NGM_CISCO_GET_IPADDR,
"getipaddr",
NULL,
&ng_cisco_ipaddr_type
},
{
NGM_IFACE_COOKIE,
NGM_IFACE_GET_IFINDEX,
"getifindex",
NULL,
&ng_parse_uint32_type
},
{ 0 }
};
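/*
 * Example (hypothetical ngctl(8) session): nodes of this type take the
 * name of their interface (see the constructor below), so assuming an
 * interface "ng0" exists:
 *
 * ngctl msg ng0: getifname
 * ngctl msg ng0: point2point
 * ngctl msg ng0: getifindex
 */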
/* Node type descriptor */
static struct ng_type typestruct = {
.version = NG_ABI_VERSION,
.name = NG_IFACE_NODE_TYPE,
.mod_event = ng_iface_mod_event,
.constructor = ng_iface_constructor,
.rcvmsg = ng_iface_rcvmsg,
.shutdown = ng_iface_shutdown,
.newhook = ng_iface_newhook,
.rcvdata = ng_iface_rcvdata,
.disconnect = ng_iface_disconnect,
.cmdlist = ng_iface_cmds,
};
NETGRAPH_INIT(iface, &typestruct);
static struct unrhdr *ng_iface_unit;
/************************************************************************
HELPER STUFF
************************************************************************/
/*
* Get the family descriptor from the family ID
*/
static __inline iffam_p
get_iffam_from_af(sa_family_t family)
{
iffam_p iffam;
int k;
for (k = 0; k < NUM_FAMILIES; k++) {
iffam = &gFamilies[k];
if (iffam->family == family)
return (iffam);
}
return (NULL);
}
/*
* Get the family descriptor from the hook
*/
static __inline iffam_p
get_iffam_from_hook(priv_p priv, hook_p hook)
{
int k;
for (k = 0; k < NUM_FAMILIES; k++)
if (priv->hooks[k] == hook)
return (&gFamilies[k]);
return (NULL);
}
/*
* Get the hook from the iffam descriptor
*/
static __inline hook_p *
get_hook_from_iffam(priv_p priv, iffam_p iffam)
{
return (&priv->hooks[iffam - gFamilies]);
}
/*
* Get the iffam descriptor from the name
*/
static __inline iffam_p
get_iffam_from_name(const char *name)
{
iffam_p iffam;
int k;
for (k = 0; k < NUM_FAMILIES; k++) {
iffam = &gFamilies[k];
if (!strcmp(iffam->hookname, name))
return (iffam);
}
return (NULL);
}
/************************************************************************
INTERFACE STUFF
************************************************************************/
/*
* Process an ioctl for the virtual interface
*/
static int
ng_iface_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
struct ifreq *const ifr = (struct ifreq *) data;
int s, error = 0;
#ifdef DEBUG
ng_iface_print_ioctl(ifp, command, data);
#endif
s = splimp();
switch (command) {
/* These two are mostly handled at a higher layer */
case SIOCSIFADDR:
ifp->if_flags |= IFF_UP;
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE);
break;
case SIOCGIFADDR:
break;
/* Set flags */
case SIOCSIFFLAGS:
/*
* If the interface is marked up and stopped, then start it.
* If it is marked down and running, then stop it.
*/
if (ifr->ifr_flags & IFF_UP) {
if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE);
ifp->if_drv_flags |= IFF_DRV_RUNNING;
}
} else {
if (ifp->if_drv_flags & IFF_DRV_RUNNING)
ifp->if_drv_flags &= ~(IFF_DRV_RUNNING |
IFF_DRV_OACTIVE);
}
break;
/* Set the interface MTU */
case SIOCSIFMTU:
if (ifr->ifr_mtu > NG_IFACE_MTU_MAX
|| ifr->ifr_mtu < NG_IFACE_MTU_MIN)
error = EINVAL;
else
ifp->if_mtu = ifr->ifr_mtu;
break;
/* Stuff that's not supported */
case SIOCADDMULTI:
case SIOCDELMULTI:
error = 0;
break;
case SIOCSIFPHYS:
error = EOPNOTSUPP;
break;
default:
error = EINVAL;
break;
}
(void) splx(s);
return (error);
}
/*
* This routine is called to deliver a packet out the interface.
* We simply look at the address family and relay the packet to
* the corresponding hook, if it exists and is connected.
*/
static int
ng_iface_output(struct ifnet *ifp, struct mbuf *m,
struct sockaddr *dst, struct rtentry *rt0)
{
uint32_t af;
int error;
/* Check interface flags */
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING))) {
m_freem(m);
return (ENETDOWN);
}
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
/* Berkeley packet filter */
ng_iface_bpftap(ifp, m, dst->sa_family);
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
M_PREPEND(m, sizeof(sa_family_t), M_DONTWAIT);
if (m == NULL) {
IFQ_LOCK(&ifp->if_snd);
IFQ_INC_DROPS(&ifp->if_snd);
IFQ_UNLOCK(&ifp->if_snd);
ifp->if_oerrors++;
return (ENOBUFS);
}
*(sa_family_t *)m->m_data = dst->sa_family;
IFQ_HANDOFF(ifp, m, error);
} else
error = ng_iface_send(ifp, m, dst->sa_family);
return (error);
}
/*
* Start method is used only when ALTQ is enabled.
*/
static void
ng_iface_start(struct ifnet *ifp)
{
struct mbuf *m;
sa_family_t sa;
KASSERT(ALTQ_IS_ENABLED(&ifp->if_snd), ("%s without ALTQ", __func__));
for(;;) {
IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
break;
sa = *mtod(m, sa_family_t *);
m_adj(m, sizeof(sa_family_t));
ng_iface_send(ifp, m, sa);
}
}
/*
* Flash a packet by the BPF (requires prepending a 4-byte AF header).
* Note the phony mbuf; this is OK because BPF treats it as read-only.
*/
static void
ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family)
{
KASSERT(family != AF_UNSPEC, ("%s: family=AF_UNSPEC", __func__));
if (bpf_peers_present(ifp->if_bpf)) {
int32_t family4 = (int32_t)family;
bpf_mtap2(ifp->if_bpf, &family4, sizeof(family4), m);
}
}
/*
* This routine does actual delivery of the packet into the
* netgraph(4). It is called from ng_iface_start() and
* ng_iface_output().
*/
static int
ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa)
{
const priv_p priv = (priv_p) ifp->if_softc;
const iffam_p iffam = get_iffam_from_af(sa);
int error;
int len;
/* Check address family to determine hook (if known) */
if (iffam == NULL) {
m_freem(m);
log(LOG_WARNING, "%s: can't handle af%d\n", ifp->if_xname, sa);
return (EAFNOSUPPORT);
}
/* Copy length before the mbuf gets invalidated. */
len = m->m_pkthdr.len;
/* Send packet; if hook is not connected, mbuf will get freed. */
NG_SEND_DATA_ONLY(error, *get_hook_from_iffam(priv, iffam), m);
/* Update stats. */
if (error == 0) {
ifp->if_obytes += len;
ifp->if_opackets++;
}
return (error);
}
#ifdef DEBUG
/*
* Display an ioctl to the virtual interface
*/
static void
ng_iface_print_ioctl(struct ifnet *ifp, int command, caddr_t data)
{
char *str;
switch (command & IOC_DIRMASK) {
case IOC_VOID:
str = "IO";
break;
case IOC_OUT:
str = "IOR";
break;
case IOC_IN:
str = "IOW";
break;
case IOC_INOUT:
str = "IORW";
break;
default:
str = "IO??";
}
log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n",
ifp->if_xname,
str,
IOCGROUP(command),
command & 0xff,
IOCPARM_LEN(command));
}
#endif /* DEBUG */
/************************************************************************
NETGRAPH NODE STUFF
************************************************************************/
/*
* Constructor for a node
*/
static int
ng_iface_constructor(node_p node)
{
+ INIT_VNET_NETGRAPH(curvnet);
struct ifnet *ifp;
priv_p priv;
/* Allocate node and interface private structures */
MALLOC(priv, priv_p, sizeof(*priv), M_NETGRAPH_IFACE, M_NOWAIT|M_ZERO);
if (priv == NULL)
return (ENOMEM);
ifp = if_alloc(IFT_PROPVIRTUAL);
if (ifp == NULL) {
FREE(priv, M_NETGRAPH_IFACE);
return (ENOMEM);
}
/* Link them together */
ifp->if_softc = priv;
priv->ifp = ifp;
/* Get an interface unit number */
priv->unit = alloc_unr(V_ng_iface_unit);
/* Link together node and private info */
NG_NODE_SET_PRIVATE(node, priv);
priv->node = node;
/* Initialize interface structure */
if_initname(ifp, NG_IFACE_IFACE_NAME, priv->unit);
ifp->if_output = ng_iface_output;
ifp->if_start = ng_iface_start;
ifp->if_ioctl = ng_iface_ioctl;
ifp->if_watchdog = NULL;
ifp->if_mtu = NG_IFACE_MTU_DEFAULT;
ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST);
ifp->if_type = IFT_PROPVIRTUAL; /* XXX */
ifp->if_addrlen = 0; /* XXX */
ifp->if_hdrlen = 0; /* XXX */
ifp->if_baudrate = 64000; /* XXX */
IFQ_SET_MAXLEN(&ifp->if_snd, IFQ_MAXLEN);
ifp->if_snd.ifq_drv_maxlen = IFQ_MAXLEN;
IFQ_SET_READY(&ifp->if_snd);
/* Give this node the same name as the interface (if possible) */
if (ng_name_node(node, ifp->if_xname) != 0)
log(LOG_WARNING, "%s: can't acquire netgraph name\n",
ifp->if_xname);
/* Attach the interface */
if_attach(ifp);
bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
/* Done */
return (0);
}
/*
* Give our ok for a hook to be added
*/
static int
ng_iface_newhook(node_p node, hook_p hook, const char *name)
{
const iffam_p iffam = get_iffam_from_name(name);
hook_p *hookptr;
if (iffam == NULL)
return (EPFNOSUPPORT);
hookptr = get_hook_from_iffam(NG_NODE_PRIVATE(node), iffam);
if (*hookptr != NULL)
return (EISCONN);
*hookptr = hook;
NG_HOOK_HI_STACK(hook);
return (0);
}
/*
* Receive a control message
*/
static int
ng_iface_rcvmsg(node_p node, item_p item, hook_p lasthook)
{
const priv_p priv = NG_NODE_PRIVATE(node);
struct ifnet *const ifp = priv->ifp;
struct ng_mesg *resp = NULL;
int error = 0;
struct ng_mesg *msg;
NGI_GET_MSG(item, msg);
switch (msg->header.typecookie) {
case NGM_IFACE_COOKIE:
switch (msg->header.cmd) {
case NGM_IFACE_GET_IFNAME:
NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
strlcpy(resp->data, ifp->if_xname, IFNAMSIZ);
break;
case NGM_IFACE_POINT2POINT:
case NGM_IFACE_BROADCAST:
{
/* Deny request if interface is UP */
if ((ifp->if_flags & IFF_UP) != 0)
return (EBUSY);
/* Change flags */
switch (msg->header.cmd) {
case NGM_IFACE_POINT2POINT:
ifp->if_flags |= IFF_POINTOPOINT;
ifp->if_flags &= ~IFF_BROADCAST;
break;
case NGM_IFACE_BROADCAST:
ifp->if_flags &= ~IFF_POINTOPOINT;
ifp->if_flags |= IFF_BROADCAST;
break;
}
break;
}
case NGM_IFACE_GET_IFINDEX:
NG_MKRESPONSE(resp, msg, sizeof(uint32_t), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
*((uint32_t *)resp->data) = priv->ifp->if_index;
break;
default:
error = EINVAL;
break;
}
break;
case NGM_CISCO_COOKIE:
switch (msg->header.cmd) {
case NGM_CISCO_GET_IPADDR: /* we understand this too */
{
struct ifaddr *ifa;
/* Return the first configured IP address */
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct ng_cisco_ipaddr *ips;
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
NG_MKRESPONSE(resp, msg, sizeof(ips), M_NOWAIT);
if (resp == NULL) {
error = ENOMEM;
break;
}
ips = (struct ng_cisco_ipaddr *)resp->data;
ips->ipaddr = ((struct sockaddr_in *)
ifa->ifa_addr)->sin_addr;
ips->netmask = ((struct sockaddr_in *)
ifa->ifa_netmask)->sin_addr;
break;
}
/* No IP addresses on this interface? */
if (ifa == NULL)
error = EADDRNOTAVAIL;
break;
}
default:
error = EINVAL;
break;
}
break;
case NGM_FLOW_COOKIE:
switch (msg->header.cmd) {
case NGM_LINK_IS_UP:
ifp->if_drv_flags |= IFF_DRV_RUNNING;
break;
case NGM_LINK_IS_DOWN:
ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
break;
default:
break;
}
break;
default:
error = EINVAL;
break;
}
NG_RESPOND_MSG(error, node, item, resp);
NG_FREE_MSG(msg);
return (error);
}
/*
* Receive data from a hook.  Pass the packet to the correct input routine.
*/
static int
ng_iface_rcvdata(hook_p hook, item_p item)
{
const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
const iffam_p iffam = get_iffam_from_hook(priv, hook);
struct ifnet *const ifp = priv->ifp;
struct mbuf *m;
int isr;
NGI_GET_M(item, m);
NG_FREE_ITEM(item);
/* Sanity checks */
KASSERT(iffam != NULL, ("%s: iffam", __func__));
M_ASSERTPKTHDR(m);
if ((ifp->if_flags & IFF_UP) == 0) {
NG_FREE_M(m);
return (ENETDOWN);
}
/* Update interface stats */
ifp->if_ipackets++;
ifp->if_ibytes += m->m_pkthdr.len;
/* Note receiving interface */
m->m_pkthdr.rcvif = ifp;
/* Berkeley packet filter */
ng_iface_bpftap(ifp, m, iffam->family);
/* Send packet */
switch (iffam->family) {
#ifdef INET
case AF_INET:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case AF_INET6:
isr = NETISR_IPV6;
break;
#endif
#ifdef IPX
case AF_IPX:
isr = NETISR_IPX;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
isr = NETISR_ATALK2;
break;
#endif
default:
m_freem(m);
return (EAFNOSUPPORT);
}
/* First chunk of an mbuf contains good junk */
if (harvest.point_to_point)
random_harvest(m, 16, 3, 0, RANDOM_NET);
netisr_dispatch(isr, m);
return (0);
}
/*
* Shutdown and remove the node and its associated interface.
*/
static int
ng_iface_shutdown(node_p node)
{
+ INIT_VNET_NETGRAPH(curvnet);
const priv_p priv = NG_NODE_PRIVATE(node);
+ /*
+ * The ifnet may be in a different vnet than the netgraph node,
+ * hence we have to change the current vnet context here.
+ */
+ CURVNET_SET_QUIET(priv->ifp->if_vnet);
bpfdetach(priv->ifp);
if_detach(priv->ifp);
if_free(priv->ifp);
+ CURVNET_RESTORE();
priv->ifp = NULL;
free_unr(V_ng_iface_unit, priv->unit);
FREE(priv, M_NETGRAPH_IFACE);
NG_NODE_SET_PRIVATE(node, NULL);
NG_NODE_UNREF(node);
return (0);
}
/*
* Hook disconnection. Note that we do *not* shutdown when all
* hooks have been disconnected.
*/
static int
ng_iface_disconnect(hook_p hook)
{
const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
const iffam_p iffam = get_iffam_from_hook(priv, hook);
if (iffam == NULL)
panic(__func__);
*get_hook_from_iffam(priv, iffam) = NULL;
return (0);
}
/*
* Handle loading and unloading for this node type.
*/
static int
ng_iface_mod_event(module_t mod, int event, void *data)
{
int error = 0;
switch (event) {
case MOD_LOAD:
V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL);
break;
case MOD_UNLOAD:
delete_unrhdr(V_ng_iface_unit);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
Index: head/sys/netinet/if_ether.c
===================================================================
--- head/sys/netinet/if_ether.c (revision 183549)
+++ head/sys/netinet/if_ether.c (revision 183550)
@@ -1,1079 +1,1086 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ether.c 8.1 (Berkeley) 6/10/93
*/
/*
* Ethernet address resolution protocol.
* TODO:
* add "inuse/lock" bit (or ref. count) along with valid bit
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_carp.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/if_llc.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <net/if_arc.h>
#include <net/iso88025.h>
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#include <security/mac/mac_framework.h>
#define SIN(s) ((struct sockaddr_in *)s)
#define SDL(s) ((struct sockaddr_dl *)s)
SYSCTL_DECL(_net_link_ether);
SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, "");
/* timer values */
static int arpt_keep = (20*60); /* once resolved, good for 20 more minutes */
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW,
&arpt_keep, 0, "ARP entry lifetime in seconds");
#define rt_expire rt_rmx.rmx_expire
struct llinfo_arp {
struct callout la_timer;
struct rtentry *la_rt;
struct mbuf *la_hold; /* last packet until resolved/timeout */
u_short la_preempt; /* countdown for pre-expiry arps */
u_short la_asked; /* # requests sent */
};
static struct ifqueue arpintrq;
static int arp_allocated;
static int arp_maxtries = 5;
static int useloopback = 1; /* use loopback interface for local traffic */
static int arp_proxyall = 0;
-SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW,
- &arp_maxtries, 0, "ARP resolution attempts before returning error");
-SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW,
- &useloopback, 0, "Use the loopback interface for local traffic");
-SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW,
- &arp_proxyall, 0, "Enable proxy ARP for all suitable requests");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, maxtries,
+ CTLFLAG_RW, arp_maxtries, 0,
+ "ARP resolution attempts before returning error");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, useloopback,
+ CTLFLAG_RW, useloopback, 0,
+ "Use the loopback interface for local traffic");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_link_ether_inet, OID_AUTO, proxyall,
+ CTLFLAG_RW, arp_proxyall, 0,
+ "Enable proxy ARP for all suitable requests");
static void arp_init(void);
static void arp_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static void arprequest(struct ifnet *,
struct in_addr *, struct in_addr *, u_char *);
static void arpintr(struct mbuf *);
static void arptimer(void *);
static struct rtentry
*arplookup(u_long, int, int, int);
#ifdef INET
static void in_arpinput(struct mbuf *);
#endif
/*
* Timeout routine.
*/
static void
arptimer(void *arg)
{
struct rtentry *rt = (struct rtentry *)arg;
RT_LOCK_ASSERT(rt);
/*
* The lock is needed to close a theoretical race
* between spontaneous expiry and intentional removal.
* We still hold an extra reference on the rtentry, so we
* can safely pass pointers to its contents.
*/
RT_UNLOCK(rt);
in_rtrequest(RTM_DELETE, rt_key(rt), NULL, rt_mask(rt), 0, NULL,
rt->rt_fibnum);
}
/*
* Parallel to llc_rtrequest.
*/
static void
arp_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET(curvnet);
struct sockaddr *gate;
struct llinfo_arp *la;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
struct in_ifaddr *ia;
struct ifaddr *ifa;
RT_LOCK_ASSERT(rt);
if (rt->rt_flags & RTF_GATEWAY)
return;
gate = rt->rt_gateway;
la = (struct llinfo_arp *)rt->rt_llinfo;
switch (req) {
case RTM_ADD:
/*
* XXX: If this is a manually added route to an interface,
* such as an older version of routed or gated might provide,
* restore the cloning bit.
*/
if ((rt->rt_flags & RTF_HOST) == 0 &&
rt_mask(rt) != NULL &&
SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff)
rt->rt_flags |= RTF_CLONING;
if (rt->rt_flags & RTF_CLONING) {
/*
* Case 1: This route should come from a route to iface.
*/
rt_setgate(rt, rt_key(rt),
(struct sockaddr *)&null_sdl);
gate = rt->rt_gateway;
SDL(gate)->sdl_type = rt->rt_ifp->if_type;
SDL(gate)->sdl_index = rt->rt_ifp->if_index;
rt->rt_expire = time_uptime;
break;
}
/* Announce a new entry if requested. */
if (rt->rt_flags & RTF_ANNOUNCE)
arprequest(rt->rt_ifp,
&SIN(rt_key(rt))->sin_addr,
&SIN(rt_key(rt))->sin_addr,
(u_char *)LLADDR(SDL(gate)));
/*FALLTHROUGH*/
case RTM_RESOLVE:
if (gate->sa_family != AF_LINK ||
gate->sa_len < sizeof(null_sdl)) {
log(LOG_DEBUG, "%s: bad gateway %s%s\n", __func__,
inet_ntoa(SIN(rt_key(rt))->sin_addr),
(gate->sa_family != AF_LINK) ?
" (!AF_LINK)": "");
break;
}
SDL(gate)->sdl_type = rt->rt_ifp->if_type;
SDL(gate)->sdl_index = rt->rt_ifp->if_index;
if (la != 0)
break; /* This happens on a route change */
/*
* Case 2: This route may come from cloning, or a manual route
* add with a LL address.
*/
R_Zalloc(la, struct llinfo_arp *, sizeof(*la));
rt->rt_llinfo = (caddr_t)la;
if (la == 0) {
log(LOG_DEBUG, "%s: malloc failed\n", __func__);
break;
}
arp_allocated++;
/*
* We are storing the route entry outside of the radix tree, so
* it can be found and accessed by means other than radix
* lookup. The routing code assumes that any rtentry detached
* from the radix tree can safely be destroyed. To prevent this,
* we add an additional reference.
*/
RT_ADDREF(rt);
la->la_rt = rt;
rt->rt_flags |= RTF_LLINFO;
callout_init_mtx(&la->la_timer, &rt->rt_mtx,
CALLOUT_RETURNUNLOCKED);
#ifdef INET
/*
* This keeps multicast addresses from showing up
* in `arp -a' listings as unresolved; the entry is not
* actually functional. The same is then done for broadcast.
*/
if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr)) &&
rt->rt_ifp->if_type != IFT_ARCNET) {
ETHER_MAP_IP_MULTICAST(&SIN(rt_key(rt))->sin_addr,
LLADDR(SDL(gate)));
SDL(gate)->sdl_alen = 6;
rt->rt_expire = 0;
}
if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) {
memcpy(LLADDR(SDL(gate)), rt->rt_ifp->if_broadcastaddr,
rt->rt_ifp->if_addrlen);
SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen;
rt->rt_expire = 0;
}
#endif
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifp == rt->rt_ifp &&
SIN(rt_key(rt))->sin_addr.s_addr ==
(IA_SIN(ia))->sin_addr.s_addr)
break;
}
if (ia) {
/*
* This test used to be
* if (loif.if_flags & IFF_UP)
* It allowed local traffic to be forced
* through the hardware by configuring the loopback down.
* However, it causes problems during network configuration
* for boards that can't receive packets they send.
* It is now necessary to clear "useloopback" and remove
* the route to force traffic out to the hardware.
*/
rt->rt_expire = 0;
bcopy(IF_LLADDR(rt->rt_ifp), LLADDR(SDL(gate)),
SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen);
if (V_useloopback) {
rt->rt_ifp = V_loif;
rt->rt_rmx.rmx_mtu = V_loif->if_mtu;
}
/*
* make sure to set rt->rt_ifa to the interface
* address we are using, otherwise we will have trouble
* with source address selection.
*/
ifa = &ia->ia_ifa;
if (ifa != rt->rt_ifa) {
IFAFREE(rt->rt_ifa);
IFAREF(ifa);
rt->rt_ifa = ifa;
}
}
break;
case RTM_DELETE:
if (la == NULL) /* XXX: at least CARP does this. */
break;
callout_stop(&la->la_timer);
rt->rt_llinfo = NULL;
rt->rt_flags &= ~RTF_LLINFO;
RT_REMREF(rt);
if (la->la_hold)
m_freem(la->la_hold);
Free((caddr_t)la);
}
}
/*
* Broadcast an ARP request. Caller specifies:
* - arp header source ip address
* - arp header target ip address
* - arp header source ethernet address
*/
static void
arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip,
u_char *enaddr)
{
struct mbuf *m;
struct arphdr *ah;
struct sockaddr sa;
if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
return;
m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) +
2*ifp->if_data.ifi_addrlen;
m->m_pkthdr.len = m->m_len;
MH_ALIGN(m, m->m_len);
ah = mtod(m, struct arphdr *);
bzero((caddr_t)ah, m->m_len);
#ifdef MAC
mac_netinet_arp_send(ifp, m);
#endif
ah->ar_pro = htons(ETHERTYPE_IP);
ah->ar_hln = ifp->if_addrlen; /* hardware address length */
ah->ar_pln = sizeof(struct in_addr); /* protocol address length */
ah->ar_op = htons(ARPOP_REQUEST);
bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln);
bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln);
bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln);
sa.sa_family = AF_ARP;
sa.sa_len = 2;
m->m_flags |= M_BCAST;
(*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0);
return;
}
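/*
 * Editor's sketch: the bcopy()s above depend on the variable-length
 * layout of struct arphdr, where the four addresses follow the fixed
 * header at offsets derived from ar_hln/ar_pln. The helper below is
 * illustrative arithmetic equivalent to the ar_sha()/ar_spa()/ar_tha()/
 * ar_tpa() accessors from <net/if_arp.h>, not kernel API.
 */
static __inline u_char *
arp_field(struct arphdr *ah, int hw, int proto)	/* hypothetical */
{
	/* Layout after the fixed header: sha, spa, tha, tpa. */
	return ((u_char *)(ah + 1) +
	    hw * ah->ar_hln + proto * ah->ar_pln);
}
/*
 * arp_field(ah, 0, 0) matches ar_sha(ah), arp_field(ah, 1, 0) ar_spa(ah),
 * arp_field(ah, 1, 1) ar_tha(ah), and arp_field(ah, 2, 1) ar_tpa(ah);
 * hence the m_len of sizeof(*ah) + 2 addresses of each kind computed above.
 */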
/*
* Resolve an IP address into an ethernet address.
* On input:
* ifp is the interface we use
* rt0 is the route to the final destination (possibly useless)
* m is the mbuf. May be NULL if we don't have a packet.
* dst is the next hop,
* desten is where we want the address.
*
* On success, desten is filled in and the function returns 0.
* If the packet must be held pending resolution, we return EWOULDBLOCK.
* On other errors, we return the corresponding error code.
* Note that m_freem() handles NULL.
*/
int
arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
struct sockaddr *dst, u_char *desten)
{
+ INIT_VNET_INET(ifp->if_vnet);
struct llinfo_arp *la = NULL;
struct rtentry *rt = NULL;
struct sockaddr_dl *sdl;
int error;
int fibnum = -1;
if (m) {
if (m->m_flags & M_BCAST) {
/* broadcast */
(void)memcpy(desten,
ifp->if_broadcastaddr, ifp->if_addrlen);
return (0);
}
if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) {
/* multicast */
ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
return (0);
}
fibnum = M_GETFIB(m);
}
if (rt0 != NULL) {
/* Look for a cached arp (ll) entry. */
if (m == NULL)
fibnum = rt0->rt_fibnum;
error = rt_check(&rt, &rt0, dst);
if (error) {
m_freem(m);
return error;
}
la = (struct llinfo_arp *)rt->rt_llinfo;
if (la == NULL)
RT_UNLOCK(rt);
}
/*
* If we had no mbuf and no route, then hope the caller
* has a fib in mind because we are running out of ideas.
* I think this should not happen in current code.
* (kmacy would know).
*/
if (fibnum == -1)
fibnum = curthread->td_proc->p_fibnum; /* last gasp */
if (la == NULL) {
/*
* We enter this block if rt0 was NULL,
* or if the rt found by rt_check() didn't have llinfo.
* We should get a cloned route, which, since it should
* come from the local interface, should have an ll entry.
* It may be incomplete, but that's OK.
*/
rt = arplookup(SIN(dst)->sin_addr.s_addr, 1, 0, fibnum);
if (rt == NULL) {
log(LOG_DEBUG,
"arpresolve: can't allocate route for %s\n",
inet_ntoa(SIN(dst)->sin_addr));
m_freem(m);
return (EINVAL); /* XXX */
}
la = (struct llinfo_arp *)rt->rt_llinfo;
if (la == NULL) {
RT_UNLOCK(rt);
log(LOG_DEBUG,
"arpresolve: can't allocate llinfo for %s\n",
inet_ntoa(SIN(dst)->sin_addr));
m_freem(m);
return (EINVAL); /* XXX */
}
}
sdl = SDL(rt->rt_gateway);
/*
* Check that the address family and length are valid and that
* the address is resolved; otherwise, try to resolve it.
*/
if ((rt->rt_expire == 0 || rt->rt_expire > time_uptime) &&
sdl->sdl_family == AF_LINK && sdl->sdl_alen != 0) {
bcopy(LLADDR(sdl), desten, sdl->sdl_alen);
/*
* If entry has an expiry time and it is approaching,
* send an ARP request.
*/
if ((rt->rt_expire != 0) &&
(time_uptime + la->la_preempt > rt->rt_expire)) {
struct in_addr sin =
SIN(rt->rt_ifa->ifa_addr)->sin_addr;
la->la_preempt--;
RT_UNLOCK(rt);
arprequest(ifp, &sin, &SIN(dst)->sin_addr,
IF_LLADDR(ifp));
return (0);
}
RT_UNLOCK(rt);
return (0);
}
/*
* If ARP is disabled or static on this interface, stop.
* XXX
* Probably should not allocate empty llinfo struct if we are
* not going to be sending out an arp request.
*/
if (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) {
RT_UNLOCK(rt);
m_freem(m);
return (EINVAL);
}
/*
* There is an arptab entry, but no ethernet address
* response yet. Replace the held mbuf with this
* latest one.
*/
if (m) {
if (la->la_hold)
m_freem(la->la_hold);
la->la_hold = m;
}
KASSERT(rt->rt_expire > 0, ("sending ARP request for static entry"));
/*
* Return EWOULDBLOCK if we have tried less than arp_maxtries. It
* will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
* if we have already sent arp_maxtries ARP requests. Retransmit the
* ARP request, but not faster than one request per second.
*/
if (la->la_asked < V_arp_maxtries)
error = EWOULDBLOCK; /* First request. */
else
error = (rt == rt0) ? EHOSTDOWN : EHOSTUNREACH;
if (la->la_asked == 0 || rt->rt_expire != time_uptime) {
struct in_addr sin =
SIN(rt->rt_ifa->ifa_addr)->sin_addr;
rt->rt_expire = time_uptime;
callout_reset(&la->la_timer, hz, arptimer, rt);
la->la_asked++;
RT_UNLOCK(rt);
arprequest(ifp, &sin, &SIN(dst)->sin_addr,
IF_LLADDR(ifp));
} else
RT_UNLOCK(rt);
return (error);
}
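/*
 * Editor's sketch of the calling convention described above: a
 * link-layer output routine treats EWOULDBLOCK as "packet held pending
 * resolution" and reports success to its caller. The fragment is
 * illustrative of how ether_output() consumes the result, not a quote
 * of ether_output(); edst is a hypothetical destination buffer.
 */
	error = arpresolve(ifp, rt0, m, dst, edst);
	if (error)
		return (error == EWOULDBLOCK ? 0 : error);
	/* edst now holds the resolved link-layer address; frame and send m. */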
/*
* Common length and type checks are done here,
* then the protocol-specific routine is called.
*/
static void
arpintr(struct mbuf *m)
{
struct arphdr *ar;
if (m->m_len < sizeof(struct arphdr) &&
((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
log(LOG_ERR, "arp: runt packet -- m_pullup failed\n");
return;
}
ar = mtod(m, struct arphdr *);
if (ntohs(ar->ar_hrd) != ARPHRD_ETHER &&
ntohs(ar->ar_hrd) != ARPHRD_IEEE802 &&
ntohs(ar->ar_hrd) != ARPHRD_ARCNET &&
ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) {
log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n",
(unsigned char *)&ar->ar_hrd, "");
m_freem(m);
return;
}
if (m->m_len < arphdr_len(ar)) {
if ((m = m_pullup(m, arphdr_len(ar))) == NULL) {
log(LOG_ERR, "arp: runt packet\n");
m_freem(m);
return;
}
ar = mtod(m, struct arphdr *);
}
switch (ntohs(ar->ar_pro)) {
#ifdef INET
case ETHERTYPE_IP:
in_arpinput(m);
return;
#endif
}
m_freem(m);
}
#ifdef INET
/*
* ARP for Internet protocols on 10 Mb/s Ethernet.
* Algorithm is that given in RFC 826.
* In addition, a sanity check is performed on the sender
* protocol address, to catch impersonators.
* We no longer handle negotiations for use of trailer protocol:
* Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
* along with IP replies if we wanted trailers sent to us,
* and also sent them in response to IP replies.
* This allowed either end to announce the desire to receive
* trailer packets.
* We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
* but formerly didn't normally send requests.
*/
static int log_arp_wrong_iface = 1;
static int log_arp_movements = 1;
static int log_arp_permanent_modify = 1;
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
&log_arp_wrong_iface, 0,
"log arp packets arriving on the wrong interface");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
&log_arp_movements, 0,
"log arp replies from MACs different than the one in the cache");
SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
&log_arp_permanent_modify, 0,
"log arp replies from MACs different than the one in the permanent arp entry");
static void
in_arpinput(struct mbuf *m)
{
struct arphdr *ah;
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct llinfo_arp *la;
struct rtentry *rt;
struct ifaddr *ifa;
struct in_ifaddr *ia;
struct sockaddr_dl *sdl;
struct sockaddr sa;
struct in_addr isaddr, itaddr, myaddr;
struct mbuf *hold;
u_int8_t *enaddr = NULL;
int op, rif_len;
int req_len;
int bridged = 0, is_bridge = 0;
u_int fibnum;
u_int goodfib = 0;
int firstpass = 1;
#ifdef DEV_CARP
int carp_match = 0;
#endif
struct sockaddr_in sin;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = 0;
-
+ INIT_VNET_INET(ifp->if_vnet);
+
if (ifp->if_bridge)
bridged = 1;
if (ifp->if_type == IFT_BRIDGE)
is_bridge = 1;
req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr));
if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) {
log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n");
return;
}
ah = mtod(m, struct arphdr *);
op = ntohs(ah->ar_op);
(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
/*
* For a bridge, we want to check the address irrespective
* of the receive interface. (This will change slightly
* when we have clusters of interfaces).
* If the interface does not match, but the receiving interface
* is part of carp, we call carp_iamatch to see if this is a
* request for the virtual host ip.
* XXX: This is really ugly!
*/
LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
(ia->ia_ifp == ifp)) &&
itaddr.s_addr == ia->ia_addr.sin_addr.s_addr)
goto match;
#ifdef DEV_CARP
if (ifp->if_carp != NULL &&
carp_iamatch(ifp->if_carp, ia, &isaddr, &enaddr) &&
itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
carp_match = 1;
goto match;
}
#endif
}
LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
(ia->ia_ifp == ifp)) &&
isaddr.s_addr == ia->ia_addr.sin_addr.s_addr)
goto match;
#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \
(ia->ia_ifp->if_bridge == ifp->if_softc && \
!bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \
addr == ia->ia_addr.sin_addr.s_addr)
/*
* Check the case when bridge shares its MAC address with
* some of its children, so packets are claimed by bridge
* itself (bridge_input() does it first), but they are really
* meant to be destined to the bridge member.
*/
if (is_bridge) {
LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
ifp = ia->ia_ifp;
goto match;
}
}
}
#undef BDG_MEMBER_MATCHES_ARP
/*
* No match, use the first inet address on the receive interface
* as a dummy address for the rest of the function.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET) {
ia = ifatoia(ifa);
goto match;
}
/*
* If bridging, fall back to using any inet address.
*/
if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL)
goto drop;
match:
if (!enaddr)
enaddr = (u_int8_t *)IF_LLADDR(ifp);
myaddr = ia->ia_addr.sin_addr;
if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
goto drop; /* it's from me, ignore it. */
if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
log(LOG_ERR,
"arp: link address is broadcast for IP address %s!\n",
inet_ntoa(isaddr));
goto drop;
}
/*
* Warn if another host is using the same IP address, but only if the
* IP address isn't 0.0.0.0, which is used for DHCP only, in which
* case we suppress the warning to avoid false positive complaints of
* potential misconfiguration.
*/
if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) {
log(LOG_ERR,
"arp: %*D is using my IP address %s on %s!\n",
ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
inet_ntoa(isaddr), ifp->if_xname);
itaddr = myaddr;
goto reply;
}
if (ifp->if_flags & IFF_STATICARP)
goto reply;
/*
* We look in every FIB that has this address in order to find
* the interface, etc.
* For sanity checks that are FIB-independent, we abort the loop.
*/
for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
rt = arplookup(isaddr.s_addr,
itaddr.s_addr == myaddr.s_addr, 0, fibnum);
if (rt == NULL)
continue;
sdl = SDL(rt->rt_gateway);
/* Only call this once */
if (firstpass) {
sin.sin_addr.s_addr = isaddr.s_addr;
EVENTHANDLER_INVOKE(route_arp_update_event, rt,
ar_sha(ah), (struct sockaddr *)&sin);
}
la = (struct llinfo_arp *)rt->rt_llinfo;
if (la == NULL) {
RT_UNLOCK(rt);
continue;
}
if (firstpass) {
/* The following is not an error when doing bridging. */
if (!bridged && rt->rt_ifp != ifp
#ifdef DEV_CARP
&& (ifp->if_type != IFT_CARP || !carp_match)
#endif
) {
if (log_arp_wrong_iface)
log(LOG_ERR, "arp: %s is on %s "
"but got reply from %*D "
"on %s\n",
inet_ntoa(isaddr),
rt->rt_ifp->if_xname,
ifp->if_addrlen,
(u_char *)ar_sha(ah), ":",
ifp->if_xname);
RT_UNLOCK(rt);
break;
}
if (sdl->sdl_alen &&
bcmp(ar_sha(ah), LLADDR(sdl), sdl->sdl_alen)) {
if (rt->rt_expire) {
if (log_arp_movements)
log(LOG_INFO,
"arp: %s moved from %*D to %*D "
"on %s\n",
inet_ntoa(isaddr),
ifp->if_addrlen,
(u_char *)LLADDR(sdl), ":",
ifp->if_addrlen,
(u_char *)ar_sha(ah), ":",
ifp->if_xname);
} else {
RT_UNLOCK(rt);
if (log_arp_permanent_modify)
log(LOG_ERR,
"arp: %*D attempts to "
"modify permanent entry "
"for %s on %s\n",
ifp->if_addrlen,
(u_char *)ar_sha(ah), ":",
inet_ntoa(isaddr),
ifp->if_xname);
break;
}
}
/*
* sanity check for the address length.
* XXX this does not work for protocols
* with variable address length. -is
*/
if (sdl->sdl_alen &&
sdl->sdl_alen != ah->ar_hln) {
log(LOG_WARNING,
"arp from %*D: new addr len %d, was %d",
ifp->if_addrlen, (u_char *) ar_sha(ah),
":", ah->ar_hln, sdl->sdl_alen);
}
if (ifp->if_addrlen != ah->ar_hln) {
log(LOG_WARNING,
"arp from %*D: addr len: "
"new %d, i/f %d (ignored)",
ifp->if_addrlen, (u_char *) ar_sha(ah),
":", ah->ar_hln, ifp->if_addrlen);
RT_UNLOCK(rt);
break;
}
firstpass = 0;
goodfib = fibnum;
}
/* Copy in the information received. */
(void)memcpy(LLADDR(sdl), ar_sha(ah),
sdl->sdl_alen = ah->ar_hln);
/*
* If we receive an arp from a token-ring station over
* a token-ring nic, then try to save the source routing info.
* XXXMRT Only minimal Token Ring support for MRT.
* Only do this on the first pass, as it modifies the mbuf.
*/
if (ifp->if_type == IFT_ISO88025) {
struct iso88025_header *th = NULL;
struct iso88025_sockaddr_dl_data *trld;
/* force the fib loop to end after this pass */
fibnum = rt_numfibs - 1;
th = (struct iso88025_header *)m->m_pkthdr.header;
trld = SDL_ISO88025(sdl);
rif_len = TR_RCF_RIFLEN(th->rcf);
if ((th->iso88025_shost[0] & TR_RII) &&
(rif_len > 2)) {
trld->trld_rcf = th->rcf;
trld->trld_rcf ^= htons(TR_RCF_DIR);
memcpy(trld->trld_route, th->rd, rif_len - 2);
trld->trld_rcf &= ~htons(TR_RCF_BCST_MASK);
/*
* Set up source routing information for
* reply packet (XXX)
*/
m->m_data -= rif_len;
m->m_len += rif_len;
m->m_pkthdr.len += rif_len;
} else {
th->iso88025_shost[0] &= ~TR_RII;
trld->trld_rcf = 0;
}
m->m_data -= 8;
m->m_len += 8;
m->m_pkthdr.len += 8;
th->rcf = trld->trld_rcf;
}
if (rt->rt_expire) {
rt->rt_expire = time_uptime + V_arpt_keep;
callout_reset(&la->la_timer, hz * V_arpt_keep,
arptimer, rt);
}
la->la_asked = 0;
la->la_preempt = V_arp_maxtries;
hold = la->la_hold;
la->la_hold = NULL;
RT_UNLOCK(rt);
if (hold != NULL)
(*ifp->if_output)(ifp, hold, rt_key(rt), rt);
} /* end of FIB loop */
reply:
/*
* Decide if we have to respond to something.
*/
if (op != ARPOP_REQUEST)
goto drop;
if (itaddr.s_addr == myaddr.s_addr) {
/* Shortcut.. the receiving interface is the target. */
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
} else {
/* It's not asking for our address. But it still may
* be something we should answer.
*
* XXX MRT
* We assume that link-level info is independent of
* the table used, so we use whichever we can; we don't
* have a better option.
*/
/* Have we been asked to proxy for the target? */
rt = arplookup(itaddr.s_addr, 0, SIN_PROXY, goodfib);
if (rt == NULL) {
/* Nope, only interested now if proxying everything. */
struct sockaddr_in sin;
if (!V_arp_proxyall)
goto drop;
bzero(&sin, sizeof sin);
sin.sin_family = AF_INET;
sin.sin_len = sizeof sin;
sin.sin_addr = itaddr;
/* XXX MRT use table 0 for arp reply */
rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
if (!rt)
goto drop;
/*
* Don't send proxies for nodes on the same interface
* as this one came out of, or we'll get into a fight
* over who claims what Ether address.
*/
if (rt->rt_ifp == ifp) {
rtfree(rt);
goto drop;
}
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
rtfree(rt);
/*
* Also check that the node which sent the ARP packet
* is on the interface we expect it to be on. This
* avoids ARP chaos if an interface is connected to the
* wrong network.
*/
sin.sin_addr = isaddr;
/* XXX MRT use table 0 for arp checks */
rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
if (!rt)
goto drop;
if (rt->rt_ifp != ifp) {
log(LOG_INFO, "arp_proxy: ignoring request"
" from %s via %s, expecting %s\n",
inet_ntoa(isaddr), ifp->if_xname,
rt->rt_ifp->if_xname);
rtfree(rt);
goto drop;
}
rtfree(rt);
#ifdef DEBUG_PROXY
printf("arp: proxying for %s\n",
inet_ntoa(itaddr));
#endif
} else {
/*
* Return proxied ARP replies only on the interface
* or bridge cluster where this network resides.
* Otherwise we may conflict with the host we are
* proxying for.
*/
if (rt->rt_ifp != ifp &&
(rt->rt_ifp->if_bridge != ifp->if_bridge ||
ifp->if_bridge == NULL)) {
RT_UNLOCK(rt);
goto drop;
}
sdl = SDL(rt->rt_gateway);
(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
(void)memcpy(ar_sha(ah), LLADDR(sdl), ah->ar_hln);
RT_UNLOCK(rt);
}
}
if (itaddr.s_addr == myaddr.s_addr &&
IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
/* RFC 3927 link-local IPv4; always reply by broadcast. */
#ifdef DEBUG_LINKLOCAL
printf("arp: sending reply for link-local addr %s\n",
inet_ntoa(itaddr));
#endif
m->m_flags |= M_BCAST;
m->m_flags &= ~M_MCAST;
} else {
/* default behaviour; never reply by broadcast. */
m->m_flags &= ~(M_BCAST|M_MCAST);
}
(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
ah->ar_op = htons(ARPOP_REPLY);
ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
m->m_pkthdr.len = m->m_len;
sa.sa_family = AF_ARP;
sa.sa_len = 2;
(*ifp->if_output)(ifp, m, &sa, (struct rtentry *)0);
return;
drop:
m_freem(m);
}
#endif
/*
* Lookup or enter a new address in arptab.
*/
static struct rtentry *
arplookup(u_long addr, int create, int proxy, int fibnum)
{
struct rtentry *rt;
struct sockaddr_inarp sin;
const char *why = 0;
bzero(&sin, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = addr;
if (proxy)
sin.sin_other = SIN_PROXY;
rt = in_rtalloc1((struct sockaddr *)&sin, create, 0UL, fibnum);
if (rt == 0)
return (0);
if (rt->rt_flags & RTF_GATEWAY)
why = "host is not on local network";
else if ((rt->rt_flags & RTF_LLINFO) == 0)
why = "could not allocate llinfo";
else if (rt->rt_gateway->sa_family != AF_LINK)
why = "gateway route is not ours";
if (why) {
#define ISDYNCLONE(_rt) \
(((_rt)->rt_flags & (RTF_STATIC | RTF_WASCLONED)) == RTF_WASCLONED)
if (create)
log(LOG_DEBUG, "arplookup %s failed: %s\n",
inet_ntoa(sin.sin_addr), why);
/*
* If there are no references to this Layer 2 route,
* and it is a cloned route, and not static, and
* arplookup() is creating the route, then purge
* it from the routing table as it is probably bogus.
*/
if (rt->rt_refcnt == 1 && ISDYNCLONE(rt))
rtexpunge(rt);
RTFREE_LOCKED(rt);
return (0);
#undef ISDYNCLONE
} else {
RT_REMREF(rt);
return (rt);
}
}
void
arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
{
if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
arprequest(ifp, &IA_SIN(ifa)->sin_addr,
&IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp));
ifa->ifa_rtrequest = arp_rtrequest;
ifa->ifa_flags |= RTF_CLONING;
}
void
arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr)
{
if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY)
arprequest(ifp, &IA_SIN(ifa)->sin_addr,
&IA_SIN(ifa)->sin_addr, enaddr);
ifa->ifa_rtrequest = arp_rtrequest;
ifa->ifa_flags |= RTF_CLONING;
}
static void
arp_init(void)
{
arpintrq.ifq_maxlen = 50;
mtx_init(&arpintrq.ifq_mtx, "arp_inq", NULL, MTX_DEF);
netisr_register(NETISR_ARP, arpintr, &arpintrq, 0);
}
SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0);
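/*
 * Editor's note: arp_init() above registers arpintr() with the netisr
 * framework behind a 50-entry input queue. A minimal sketch of how a
 * received ARP frame reaches it (illustrative; the real call site lives
 * in the ethernet demux path):
 */
	m->m_pkthdr.rcvif = ifp;	/* in_arpinput() relies on rcvif */
	netisr_dispatch(NETISR_ARP, m);	/* direct call or enqueue on arpintrq */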
Index: head/sys/netinet/igmp.c
===================================================================
--- head/sys/netinet/igmp.c (revision 183549)
+++ head/sys/netinet/igmp.c (revision 183550)
@@ -1,513 +1,534 @@
/*-
* Copyright (c) 1988 Stephen Deering.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
/*
* Internet Group Management Protocol (IGMP) routines.
*
* Written by Steve Deering, Stanford, May 1988.
* Modified by Rosen Sharma, Stanford, Aug 1994.
* Modified by Bill Fenner, Xerox PARC, Feb 1995.
* Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
*
* MULTICAST Revision: 3.5.1.4
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/igmp.h>
#include <netinet/igmp_var.h>
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
static struct router_info *find_rti(struct ifnet *ifp);
static void igmp_sendpkt(struct in_multi *, int, unsigned long);
static struct igmpstat igmpstat;
-SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, &igmpstat,
- igmpstat, "");
+SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS,
+ stats, CTLFLAG_RW, igmpstat, igmpstat, "");
/*
* igmp_mtx protects all mutable global variables in igmp.c, as well as the
* data fields in struct router_info. In general, a router_info structure
* will be valid as long as the referencing struct in_multi is valid, so no
* reference counting is used. We allow unlocked reads of router_info data
* when it is accessed read-only via an in_multi.
*/
static struct mtx igmp_mtx;
static SLIST_HEAD(, router_info) router_info_head;
static int igmp_timers_are_running;
/*
* XXXRW: can we define these such that they can be made const? In any
* case, these shouldn't be changed after igmp_init() and therefore don't
* need locking.
*/
static u_long igmp_all_hosts_group;
static u_long igmp_all_rtrs_group;
static struct mbuf *router_alert;
static struct route igmprt;
#ifdef IGMP_DEBUG
#define IGMP_PRINTF(x) printf(x)
#else
#define IGMP_PRINTF(x)
#endif
void
igmp_init(void)
{
+ INIT_VNET_INET(curvnet);
struct ipoption *ra;
/*
* To avoid byte-swapping the same value over and over again.
*/
igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP);
igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP);
igmp_timers_are_running = 0;
/*
* Construct a Router Alert option to use in outgoing packets.
*/
MGET(router_alert, M_DONTWAIT, MT_DATA);
ra = mtod(router_alert, struct ipoption *);
ra->ipopt_dst.s_addr = 0;
ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */
ra->ipopt_list[1] = 0x04; /* 4 bytes long */
ra->ipopt_list[2] = 0x00;
ra->ipopt_list[3] = 0x00;
router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1];
mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF);
SLIST_INIT(&V_router_info_head);
}
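/*
 * Editor's note: the option constructed above is the IP Router Alert
 * option of RFC 2113: type IPOPT_RA (148), length 4, value 0 ("routers
 * shall examine this packet"). As a byte sketch of ipopt_list[]:
 *
 *	{ IPOPT_RA, 0x04, 0x00, 0x00 }
 *
 * which, together with the 4-byte ipopt_dst placeholder, accounts for
 * the m_len of 8 computed above.
 */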
static struct router_info *
find_rti(struct ifnet *ifp)
{
+ INIT_VNET_INET(ifp->if_vnet);
struct router_info *rti;
mtx_assert(&igmp_mtx, MA_OWNED);
IGMP_PRINTF("[igmp.c, _find_rti] --> entering \n");
SLIST_FOREACH(rti, &V_router_info_head, rti_list) {
if (rti->rti_ifp == ifp) {
IGMP_PRINTF(
"[igmp.c, _find_rti] --> found old entry \n");
return (rti);
}
}
MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, M_NOWAIT);
if (rti == NULL) {
IGMP_PRINTF("[igmp.c, _find_rti] --> no memory for entry\n");
return (NULL);
}
rti->rti_ifp = ifp;
rti->rti_type = IGMP_V2_ROUTER;
rti->rti_time = 0;
SLIST_INSERT_HEAD(&V_router_info_head, rti, rti_list);
IGMP_PRINTF("[igmp.c, _find_rti] --> created an entry \n");
return (rti);
}
void
igmp_input(register struct mbuf *m, int off)
{
register int iphlen = off;
register struct igmp *igmp;
register struct ip *ip;
register int igmplen;
register struct ifnet *ifp = m->m_pkthdr.rcvif;
register int minlen;
register struct in_multi *inm;
register struct in_ifaddr *ia;
struct in_multistep step;
struct router_info *rti;
int timer; /** timer value in the igmp query header **/
+ INIT_VNET_INET(ifp->if_vnet);
++V_igmpstat.igps_rcv_total;
ip = mtod(m, struct ip *);
igmplen = ip->ip_len;
/*
* Validate lengths.
*/
if (igmplen < IGMP_MINLEN) {
++V_igmpstat.igps_rcv_tooshort;
m_freem(m);
return;
}
minlen = iphlen + IGMP_MINLEN;
if ((m->m_flags & M_EXT || m->m_len < minlen) &&
(m = m_pullup(m, minlen)) == 0) {
++V_igmpstat.igps_rcv_tooshort;
return;
}
/*
* Validate checksum.
*/
m->m_data += iphlen;
m->m_len -= iphlen;
igmp = mtod(m, struct igmp *);
if (in_cksum(m, igmplen)) {
++V_igmpstat.igps_rcv_badsum;
m_freem(m);
return;
}
m->m_data -= iphlen;
m->m_len += iphlen;
ip = mtod(m, struct ip *);
timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
if (timer == 0)
timer = 1;
/*
* In the IGMPv2 specification, there are 3 states and a flag.
*
* In Non-Member state, we simply don't have a membership record.
* In Delaying Member state, our timer is running (inm->inm_timer).
* In Idle Member state, our timer is not running (inm->inm_timer==0).
*
* The flag is inm->inm_state; it is set to IGMP_OTHERMEMBER if we
* have heard a report from another member, or IGMP_IREPORTEDLAST if
* I sent the last report.
*/
switch (igmp->igmp_type) {
case IGMP_MEMBERSHIP_QUERY:
++V_igmpstat.igps_rcv_queries;
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (igmp->igmp_code == 0) {
/*
* Old router. Remember that the querier on this
* interface is old, and set the timer to the value
* in RFC 1112.
*/
mtx_lock(&igmp_mtx);
rti = find_rti(ifp);
if (rti == NULL) {
mtx_unlock(&igmp_mtx);
m_freem(m);
return;
}
rti->rti_type = IGMP_V1_ROUTER;
rti->rti_time = 0;
mtx_unlock(&igmp_mtx);
timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ;
if (ip->ip_dst.s_addr != igmp_all_hosts_group ||
igmp->igmp_group.s_addr != 0) {
++V_igmpstat.igps_rcv_badqueries;
m_freem(m);
return;
}
} else {
/*
* New router. Simply do the new validity check.
*/
if (igmp->igmp_group.s_addr != 0 &&
!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) {
++V_igmpstat.igps_rcv_badqueries;
m_freem(m);
return;
}
}
/*
* - Start the timers in all of our membership records that
* the query applies to for the interface on which the
* query arrived, excluding those that belong to the "all-hosts"
* group (224.0.0.1).
* - Restart any timer that is already running but has a
* value longer than the requested timeout.
* - Use the value specified in the query message as the
* maximum timeout.
*/
IN_MULTI_LOCK();
IN_FIRST_MULTI(step, inm);
while (inm != NULL) {
if (inm->inm_ifp == ifp &&
inm->inm_addr.s_addr != igmp_all_hosts_group &&
(igmp->igmp_group.s_addr == 0 ||
igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) {
if (inm->inm_timer == 0 ||
inm->inm_timer > timer) {
inm->inm_timer =
IGMP_RANDOM_DELAY(timer);
igmp_timers_are_running = 1;
}
}
IN_NEXT_MULTI(step, inm);
}
IN_MULTI_UNLOCK();
break;
case IGMP_V1_MEMBERSHIP_REPORT:
case IGMP_V2_MEMBERSHIP_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports can
* potentially get looped back if we are a multicast router,
* so discard reports sourced by me.
*/
IFP_TO_IA(ifp, ia);
if (ia != NULL &&
ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr)
break;
++V_igmpstat.igps_rcv_reports;
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) {
++V_igmpstat.igps_rcv_badreports;
m_freem(m);
return;
}
/*
* KLUDGE: if the IP source address of the report has an
* unspecified (i.e., zero) subnet number, as is allowed for
* a booting host, replace it with the correct subnet number
* so that a process-level multicast routing daemon can
* determine which subnet it arrived from. This is necessary
* to compensate for the lack of any way for a process to
* determine the arrival interface of an incoming packet.
*/
if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) {
if (ia != NULL)
ip->ip_src.s_addr = htonl(ia->ia_subnet);
}
/*
* If we belong to the group being reported, stop our timer
* for that group.
*/
IN_MULTI_LOCK();
IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm);
if (inm != NULL) {
inm->inm_timer = 0;
++V_igmpstat.igps_rcv_ourreports;
inm->inm_state = IGMP_OTHERMEMBER;
}
IN_MULTI_UNLOCK();
break;
}
/*
* Pass all valid IGMP packets up to any process(es) listening on a
* raw IGMP socket.
*/
rip_input(m, off);
}
void
igmp_joingroup(struct in_multi *inm)
{
IN_MULTI_LOCK_ASSERT();
if (inm->inm_addr.s_addr == igmp_all_hosts_group
|| inm->inm_ifp->if_flags & IFF_LOOPBACK) {
inm->inm_timer = 0;
inm->inm_state = IGMP_OTHERMEMBER;
} else {
mtx_lock(&igmp_mtx);
inm->inm_rti = find_rti(inm->inm_ifp);
mtx_unlock(&igmp_mtx);
if (inm->inm_rti != NULL) {
igmp_sendpkt(inm, inm->inm_rti->rti_type, 0);
inm->inm_timer = IGMP_RANDOM_DELAY(
IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ);
inm->inm_state = IGMP_IREPORTEDLAST;
igmp_timers_are_running = 1;
}
/* XXX handling of failure case? */
}
}
void
igmp_leavegroup(struct in_multi *inm)
{
IN_MULTI_LOCK_ASSERT();
if (inm->inm_state == IGMP_IREPORTEDLAST &&
inm->inm_addr.s_addr != igmp_all_hosts_group &&
!(inm->inm_ifp->if_flags & IFF_LOOPBACK) &&
inm->inm_rti->rti_type != IGMP_V1_ROUTER)
igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group);
}
void
igmp_fasttimo(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
register struct in_multi *inm;
struct in_multistep step;
/*
* Quick check to see if any work needs to be done, in order to
* minimize the overhead of fasttimo processing.
*/
if (!igmp_timers_are_running)
return;
IN_MULTI_LOCK();
igmp_timers_are_running = 0;
- IN_FIRST_MULTI(step, inm);
- while (inm != NULL) {
- if (inm->inm_timer == 0) {
- /* do nothing */
- } else if (--inm->inm_timer == 0) {
- igmp_sendpkt(inm, inm->inm_rti->rti_type, 0);
- inm->inm_state = IGMP_IREPORTEDLAST;
- } else {
- igmp_timers_are_running = 1;
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET(vnet_iter);
+ IN_FIRST_MULTI(step, inm);
+ while (inm != NULL) {
+ if (inm->inm_timer == 0) {
+ /* do nothing */
+ } else if (--inm->inm_timer == 0) {
+ igmp_sendpkt(inm, inm->inm_rti->rti_type, 0);
+ inm->inm_state = IGMP_IREPORTEDLAST;
+ } else {
+ igmp_timers_are_running = 1;
+ }
+ IN_NEXT_MULTI(step, inm);
}
- IN_NEXT_MULTI(step, inm);
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IN_MULTI_UNLOCK();
}
void
igmp_slowtimo(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct router_info *rti;
IGMP_PRINTF("[igmp.c,_slowtimo] -- > entering \n");
mtx_lock(&igmp_mtx);
- SLIST_FOREACH(rti, &V_router_info_head, rti_list) {
- if (rti->rti_type == IGMP_V1_ROUTER) {
- rti->rti_time++;
- if (rti->rti_time >= IGMP_AGE_THRESHOLD)
- rti->rti_type = IGMP_V2_ROUTER;
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET(vnet_iter);
+ SLIST_FOREACH(rti, &V_router_info_head, rti_list) {
+ if (rti->rti_type == IGMP_V1_ROUTER) {
+ rti->rti_time++;
+ if (rti->rti_time >= IGMP_AGE_THRESHOLD)
+ rti->rti_type = IGMP_V2_ROUTER;
+ }
}
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
mtx_unlock(&igmp_mtx);
IGMP_PRINTF("[igmp.c,_slowtimo] -- > exiting \n");
}
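/*
 * Editor's sketch: both timer routines above now follow the same
 * per-vnet iteration idiom, reduced here to its skeleton (the body
 * comment is a placeholder, not kernel API):
 */
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		INIT_VNET_INET(vnet_iter);
		/* ... walk this vnet's V_ state, age or fire timers ... */
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();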
static void
igmp_sendpkt(struct in_multi *inm, int type, unsigned long addr)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET(curvnet);
struct mbuf *m;
struct igmp *igmp;
struct ip *ip;
struct ip_moptions imo;
IN_MULTI_LOCK_ASSERT();
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL)
return;
m->m_pkthdr.rcvif = V_loif;
#ifdef MAC
mac_netinet_igmp_send(inm->inm_ifp, m);
#endif
m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN;
MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip));
m->m_data += sizeof(struct ip);
m->m_len = IGMP_MINLEN;
igmp = mtod(m, struct igmp *);
igmp->igmp_type = type;
igmp->igmp_code = 0;
igmp->igmp_group = inm->inm_addr;
igmp->igmp_cksum = 0;
igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN);
m->m_data -= sizeof(struct ip);
m->m_len += sizeof(struct ip);
ip = mtod(m, struct ip *);
ip->ip_tos = 0;
ip->ip_len = sizeof(struct ip) + IGMP_MINLEN;
ip->ip_off = 0;
ip->ip_p = IPPROTO_IGMP;
ip->ip_src.s_addr = INADDR_ANY;
ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr;
imo.imo_multicast_ifp = inm->inm_ifp;
imo.imo_multicast_ttl = 1;
imo.imo_multicast_vif = -1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
imo.imo_multicast_loop = (V_ip_mrouter != NULL);
/*
* XXX: Do we have to worry about reentrancy here? Don't think so.
*/
ip_output(m, router_alert, &igmprt, 0, &imo, NULL);
++V_igmpstat.igps_snd_reports;
}
Index: head/sys/netinet/in.c
===================================================================
--- head/sys/netinet/in.c (revision 183549)
+++ head/sys/netinet/in.c (revision 183550)
@@ -1,1005 +1,1014 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (C) 2001 WIDE Project. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.4 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_carp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
static int in_mask2len(struct in_addr *);
static void in_len2mask(struct in_addr *, int);
static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t,
struct ifnet *, struct thread *);
static int in_addprefix(struct in_ifaddr *, int);
static int in_scrubprefix(struct in_ifaddr *);
static void in_socktrim(struct sockaddr_in *);
static int in_ifinit(struct ifnet *,
struct in_ifaddr *, struct sockaddr_in *, int);
static void in_purgemaddrs(struct ifnet *);
static int subnetsarelocal = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW,
- &subnetsarelocal, 0, "Treat all subnets as directly connected");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, subnets_are_local,
+ CTLFLAG_RW, subnetsarelocal, 0,
+ "Treat all subnets as directly connected");
static int sameprefixcarponly = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW,
- &sameprefixcarponly, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, same_prefix_carp_only,
+ CTLFLAG_RW, sameprefixcarponly, 0,
"Refuse to create same prefixes on different interfaces");
extern struct inpcbinfo ripcbinfo;
extern struct inpcbinfo udbinfo;
/*
* Return 1 if an internet address is for a ``local'' host
* (one to which we have a connection). If subnetsarelocal
* is true, this includes other subnets of the local net.
* Otherwise, it includes only the directly-connected (sub)nets.
*/
int
in_localaddr(struct in_addr in)
{
+ INIT_VNET_INET(curvnet);
register u_long i = ntohl(in.s_addr);
register struct in_ifaddr *ia;
if (V_subnetsarelocal) {
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
if ((i & ia->ia_netmask) == ia->ia_net)
return (1);
} else {
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
if ((i & ia->ia_subnetmask) == ia->ia_subnet)
return (1);
}
return (0);
}
/*
* Return 1 if an internet address is for the local host and configured
* on one of its interfaces.
*/
int
in_localip(struct in_addr in)
{
+ INIT_VNET_INET(curvnet);
struct in_ifaddr *ia;
LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr)
return 1;
}
return 0;
}
/*
* Determine whether an IP address is in a reserved set of addresses
* that may not be forwarded, or whether datagrams to that destination
* may be forwarded.
*/
int
in_canforward(struct in_addr in)
{
register u_long i = ntohl(in.s_addr);
register u_long net;
if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i))
return (0);
if (IN_CLASSA(i)) {
net = i & IN_CLASSA_NET;
if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT))
return (0);
}
return (1);
}
/*
* Trim a mask in a sockaddr
*/
static void
in_socktrim(struct sockaddr_in *ap)
{
register char *cplim = (char *) &ap->sin_addr;
register char *cp = (char *) (&ap->sin_addr + 1);
ap->sin_len = 0;
while (--cp >= cplim)
if (*cp) {
(ap)->sin_len = cp - (char *) (ap) + 1;
break;
}
}
static int
in_mask2len(struct in_addr *mask)
{
int x, y;
u_char *p;
p = (u_char *)mask;
for (x = 0; x < sizeof(*mask); x++) {
if (p[x] != 0xff)
break;
}
y = 0;
if (x < sizeof(*mask)) {
for (y = 0; y < 8; y++) {
if ((p[x] & (0x80 >> y)) == 0)
break;
}
}
return x * 8 + y;
}
static void
in_len2mask(struct in_addr *mask, int len)
{
int i;
u_char *p;
p = (u_char *)mask;
bzero(mask, sizeof(*mask));
for (i = 0; i < len / 8; i++)
p[i] = 0xff;
if (len % 8)
p[i] = (0xff00 >> (len % 8)) & 0xff;
}
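/*
 * Editor's sketch: in_mask2len() and in_len2mask() are inverses on
 * contiguous CIDR masks. Assuming the logic above:
 *
 *	in_len2mask(&m, 24);	=> m.s_addr == htonl(0xffffff00)
 *	in_mask2len(&m);	=> 24
 *
 * in_mask2len() counts whole 0xff bytes, then leading one-bits of the
 * first partial byte, so a non-contiguous mask is silently truncated
 * at its first zero bit.
 */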
/*
* Generic internet control operations (ioctl's).
* Ifp is 0 if not an interface-specific ioctl.
*/
/* ARGSUSED */
int
in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
struct thread *td)
{
+ INIT_VNET_INET(curvnet); /* both so and ifp can be NULL here! */
register struct ifreq *ifr = (struct ifreq *)data;
register struct in_ifaddr *ia = 0, *iap;
register struct ifaddr *ifa;
struct in_addr allhosts_addr;
struct in_addr dst;
struct in_ifaddr *oia;
struct in_aliasreq *ifra = (struct in_aliasreq *)data;
struct sockaddr_in oldaddr;
int error, hostIsNew, iaIsNew, maskIsNew, s;
int iaIsFirst;
iaIsFirst = 0;
iaIsNew = 0;
allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
switch (cmd) {
case SIOCALIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
if (!ifp)
return EINVAL;
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCDLIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_DELIFADDR);
if (error)
return (error);
}
if (!ifp)
return EINVAL;
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCGLIFADDR:
if (!ifp)
return EINVAL;
return in_lifaddr_ioctl(so, cmd, data, ifp, td);
}
/*
* Find address for this interface, if it exists.
*
* If an alias address was specified, find that one instead of
* the first one on the interface, if possible.
*/
if (ifp) {
dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr;
LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash)
if (iap->ia_ifp == ifp &&
iap->ia_addr.sin_addr.s_addr == dst.s_addr) {
ia = iap;
break;
}
if (ia == NULL)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
iap = ifatoia(ifa);
if (iap->ia_addr.sin_family == AF_INET) {
ia = iap;
break;
}
}
if (ia == NULL)
iaIsFirst = 1;
}
switch (cmd) {
case SIOCAIFADDR:
case SIOCDIFADDR:
if (ifp == 0)
return (EADDRNOTAVAIL);
if (ifra->ifra_addr.sin_family == AF_INET) {
for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) {
if (ia->ia_ifp == ifp &&
ia->ia_addr.sin_addr.s_addr ==
ifra->ifra_addr.sin_addr.s_addr)
break;
}
if ((ifp->if_flags & IFF_POINTOPOINT)
&& (cmd == SIOCAIFADDR)
&& (ifra->ifra_dstaddr.sin_addr.s_addr
== INADDR_ANY)) {
return EDESTADDRREQ;
}
}
if (cmd == SIOCDIFADDR && ia == 0)
return (EADDRNOTAVAIL);
/* FALLTHROUGH */
case SIOCSIFADDR:
case SIOCSIFNETMASK:
case SIOCSIFDSTADDR:
if (td != NULL) {
error = priv_check(td, (cmd == SIOCDIFADDR) ?
PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
if (ifp == 0)
return (EADDRNOTAVAIL);
if (ia == (struct in_ifaddr *)0) {
ia = (struct in_ifaddr *)
malloc(sizeof *ia, M_IFADDR, M_WAITOK | M_ZERO);
if (ia == (struct in_ifaddr *)NULL)
return (ENOBUFS);
/*
* Protect from ipintr() traversing address list
* while we're modifying it.
*/
s = splnet();
ifa = &ia->ia_ifa;
IFA_LOCK_INIT(ifa);
ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
ifa->ifa_refcnt = 1;
TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
ia->ia_sockmask.sin_len = 8;
ia->ia_sockmask.sin_family = AF_INET;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr);
ia->ia_broadaddr.sin_family = AF_INET;
}
ia->ia_ifp = ifp;
TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
splx(s);
iaIsNew = 1;
}
break;
case SIOCSIFBRDADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
/* FALLTHROUGH */
case SIOCGIFADDR:
case SIOCGIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCGIFBRDADDR:
if (ia == (struct in_ifaddr *)0)
return (EADDRNOTAVAIL);
break;
}
switch (cmd) {
case SIOCGIFADDR:
*((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr;
return (0);
case SIOCGIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0)
return (EINVAL);
*((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr;
return (0);
case SIOCGIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
return (EINVAL);
*((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr;
return (0);
case SIOCGIFNETMASK:
*((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask;
return (0);
case SIOCSIFDSTADDR:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
return (EINVAL);
oldaddr = ia->ia_dstaddr;
ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr;
if (ifp->if_ioctl) {
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR,
(caddr_t)ia);
IFF_UNLOCKGIANT(ifp);
if (error) {
ia->ia_dstaddr = oldaddr;
return (error);
}
}
if (ia->ia_flags & IFA_ROUTE) {
ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr;
rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST);
ia->ia_ifa.ifa_dstaddr =
(struct sockaddr *)&ia->ia_dstaddr;
rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP);
}
return (0);
case SIOCSIFBRDADDR:
if ((ifp->if_flags & IFF_BROADCAST) == 0)
return (EINVAL);
ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr;
return (0);
case SIOCSIFADDR:
error = in_ifinit(ifp, ia,
(struct sockaddr_in *) &ifr->ifr_addr, 1);
if (error != 0 && iaIsNew)
break;
if (error == 0) {
if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST) != 0)
in_addmulti(&allhosts_addr, ifp);
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
}
return (0);
case SIOCSIFNETMASK:
ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr;
ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
return (0);
case SIOCAIFADDR:
maskIsNew = 0;
hostIsNew = 1;
error = 0;
if (ia->ia_addr.sin_family == AF_INET) {
if (ifra->ifra_addr.sin_len == 0) {
ifra->ifra_addr = ia->ia_addr;
hostIsNew = 0;
} else if (ifra->ifra_addr.sin_addr.s_addr ==
ia->ia_addr.sin_addr.s_addr)
hostIsNew = 0;
}
if (ifra->ifra_mask.sin_len) {
in_ifscrub(ifp, ia);
ia->ia_sockmask = ifra->ifra_mask;
ia->ia_sockmask.sin_family = AF_INET;
ia->ia_subnetmask =
ntohl(ia->ia_sockmask.sin_addr.s_addr);
maskIsNew = 1;
}
if ((ifp->if_flags & IFF_POINTOPOINT) &&
(ifra->ifra_dstaddr.sin_family == AF_INET)) {
in_ifscrub(ifp, ia);
ia->ia_dstaddr = ifra->ifra_dstaddr;
maskIsNew = 1; /* We lie; but the effect's the same */
}
if (ifra->ifra_addr.sin_family == AF_INET &&
(hostIsNew || maskIsNew))
error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0);
if (error != 0 && iaIsNew)
break;
if ((ifp->if_flags & IFF_BROADCAST) &&
(ifra->ifra_broadaddr.sin_family == AF_INET))
ia->ia_broadaddr = ifra->ifra_broadaddr;
if (error == 0) {
if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST) != 0)
in_addmulti(&allhosts_addr, ifp);
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
}
return (error);
case SIOCDIFADDR:
/*
* in_ifscrub kills the interface route.
*/
in_ifscrub(ifp, ia);
/*
* in_ifadown gets rid of all the rest of
* the routes. This is not quite the right
* thing to do, but at least if we are running
* a routing process they will come back.
*/
in_ifadown(&ia->ia_ifa, 1);
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
error = 0;
break;
default:
if (ifp == 0 || ifp->if_ioctl == 0)
return (EOPNOTSUPP);
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, cmd, data);
IFF_UNLOCKGIANT(ifp);
return (error);
}
/*
* Protect from ipintr() traversing address list while we're modifying
* it.
*/
s = splnet();
TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
if (ia->ia_addr.sin_family == AF_INET) {
LIST_REMOVE(ia, ia_hash);
/*
* If this is the last IPv4 address configured on this
* interface, leave the all-hosts group.
* XXX: This is quite ugly because of locking and structure.
*/
oia = NULL;
IFP_TO_IA(ifp, oia);
if (oia == NULL) {
struct in_multi *inm;
IFF_LOCKGIANT(ifp);
IN_MULTI_LOCK();
IN_LOOKUP_MULTI(allhosts_addr, ifp, inm);
if (inm != NULL)
in_delmulti_locked(inm);
IN_MULTI_UNLOCK();
IFF_UNLOCKGIANT(ifp);
}
}
IFAFREE(&ia->ia_ifa);
splx(s);
return (error);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?!?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* EINVAL since we can't deduce hostid part of the address.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL on prefix match failed/specified address not found
* other values may be returned from in_ioctl()
*/
static int
in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
struct ifnet *ifp, struct thread *td)
{
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
/* sanity checks */
if (!data || !ifp) {
panic("invalid argument to in_lifaddr_ioctl");
/*NOTREACHED*/
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/*FALLTHROUGH*/
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
if (iflr->addr.ss_family != AF_INET)
return EINVAL;
if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
return EINVAL;
/* XXX need improvement */
if (iflr->dstaddr.ss_family
&& iflr->dstaddr.ss_family != AF_INET)
return EINVAL;
if (iflr->dstaddr.ss_family
&& iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
return EINVAL;
break;
default: /*shouldn't happen*/
return EOPNOTSUPP;
}
if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
return EINVAL;
switch (cmd) {
case SIOCALIFADDR:
{
struct in_aliasreq ifra;
if (iflr->flags & IFLR_PREFIX)
return EINVAL;
/* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
if (iflr->dstaddr.ss_family) { /*XXX*/
bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
iflr->dstaddr.ss_len);
}
ifra.ifra_mask.sin_family = AF_INET;
ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
return in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td);
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in_ifaddr *ia;
struct in_addr mask, candidate, match;
struct sockaddr_in *sin;
bzero(&mask, sizeof(mask));
bzero(&match, sizeof(match));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in_len2mask(&mask, iflr->prefixlen);
sin = (struct sockaddr_in *)&iflr->addr;
match.s_addr = sin->sin_addr.s_addr;
match.s_addr &= mask.s_addr;
/* if you set extra bits, that's wrong */
if (match.s_addr != sin->sin_addr.s_addr)
return EINVAL;
} else {
/* on getting an address, take the 1st match */
/* on deleting an address, do exact match */
if (cmd != SIOCGLIFADDR) {
in_len2mask(&mask, 32);
sin = (struct sockaddr_in *)&iflr->addr;
match.s_addr = sin->sin_addr.s_addr;
}
}
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
if (match.s_addr == 0)
break;
candidate.s_addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr;
candidate.s_addr &= mask.s_addr;
if (candidate.s_addr == match.s_addr)
break;
}
if (!ifa)
return EADDRNOTAVAIL;
ia = (struct in_ifaddr *)ifa;
if (cmd == SIOCGLIFADDR) {
/* fill in the if_laddrreq structure */
bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
ia->ia_dstaddr.sin_len);
} else
bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
iflr->prefixlen =
in_mask2len(&ia->ia_sockmask.sin_addr);
iflr->flags = 0; /*XXX*/
return 0;
} else {
struct in_aliasreq ifra;
/* fill in_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&ia->ia_addr, &ifra.ifra_addr,
ia->ia_addr.sin_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
ia->ia_dstaddr.sin_len);
}
bcopy(&ia->ia_sockmask, &ifra.ifra_mask,
ia->ia_sockmask.sin_len);
return in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
ifp, td);
}
}
}
return EOPNOTSUPP; /*just for safety*/
}
/*
* Delete any existing route for an interface.
*/
void
in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
{
in_scrubprefix(ia);
}
/*
* Initialize an interface's internet address
* and routing table entry.
*/
static int
in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
int scrub)
{
+ INIT_VNET_INET(ifp->if_vnet);
register u_long i = ntohl(sin->sin_addr.s_addr);
struct sockaddr_in oldaddr;
int s = splimp(), flags = RTF_UP, error = 0;
oldaddr = ia->ia_addr;
if (oldaddr.sin_family == AF_INET)
LIST_REMOVE(ia, ia_hash);
ia->ia_addr = *sin;
if (ia->ia_addr.sin_family == AF_INET)
LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
ia, ia_hash);
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
if (ifp->if_ioctl) {
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
IFF_UNLOCKGIANT(ifp);
if (error) {
splx(s);
/* LIST_REMOVE(ia, ia_hash) is done in in_control */
ia->ia_addr = oldaddr;
if (ia->ia_addr.sin_family == AF_INET)
LIST_INSERT_HEAD(INADDR_HASH(
ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
else
/*
* If oldaddr family is not AF_INET (e.g. the
* interface has just been created) in_control
* does not call LIST_REMOVE, and we end up
* with bogus ia entries in the hash
*/
LIST_REMOVE(ia, ia_hash);
return (error);
}
}
splx(s);
if (scrub) {
ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
in_ifscrub(ifp, ia);
ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
}
if (IN_CLASSA(i))
ia->ia_netmask = IN_CLASSA_NET;
else if (IN_CLASSB(i))
ia->ia_netmask = IN_CLASSB_NET;
else
ia->ia_netmask = IN_CLASSC_NET;
/*
* The subnet mask usually includes at least the standard network part,
* but may be smaller in the case of supernetting.
* If it is set, we believe it.
*/
if (ia->ia_subnetmask == 0) {
ia->ia_subnetmask = ia->ia_netmask;
ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
} else
ia->ia_netmask &= ia->ia_subnetmask;
ia->ia_net = i & ia->ia_netmask;
ia->ia_subnet = i & ia->ia_subnetmask;
in_socktrim(&ia->ia_sockmask);
#ifdef DEV_CARP
/*
* XXX: carp(4) does not have interface route
*/
if (ifp->if_type == IFT_CARP)
return (0);
#endif
/*
* Add route for the network.
*/
ia->ia_ifa.ifa_metric = ifp->if_metric;
if (ifp->if_flags & IFF_BROADCAST) {
ia->ia_broadaddr.sin_addr.s_addr =
htonl(ia->ia_subnet | ~ia->ia_subnetmask);
ia->ia_netbroadcast.s_addr =
htonl(ia->ia_net | ~ ia->ia_netmask);
} else if (ifp->if_flags & IFF_LOOPBACK) {
ia->ia_dstaddr = ia->ia_addr;
flags |= RTF_HOST;
} else if (ifp->if_flags & IFF_POINTOPOINT) {
if (ia->ia_dstaddr.sin_family != AF_INET)
return (0);
flags |= RTF_HOST;
}
return (in_addprefix(ia, flags));
}
#define rtinitflags(x) \
((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
? RTF_HOST : 0)
/*
* Check if we have a route for the given prefix already or add one accordingly.
*/
static int
in_addprefix(struct in_ifaddr *target, int flags)
{
+ INIT_VNET_INET(curvnet);
struct in_ifaddr *ia;
struct in_addr prefix, mask, p, m;
int error;
if ((flags & RTF_HOST) != 0) {
prefix = target->ia_dstaddr.sin_addr;
mask.s_addr = 0;
} else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
}
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia)) {
p = ia->ia_addr.sin_addr;
if (prefix.s_addr != p.s_addr)
continue;
} else {
p = ia->ia_addr.sin_addr;
m = ia->ia_sockmask.sin_addr;
p.s_addr &= m.s_addr;
if (prefix.s_addr != p.s_addr ||
mask.s_addr != m.s_addr)
continue;
}
/*
* If we got a matching prefix route inserted by other
* interface address, we are done here.
*/
if (ia->ia_flags & IFA_ROUTE) {
if (V_sameprefixcarponly &&
target->ia_ifp->if_type != IFT_CARP &&
ia->ia_ifp->if_type != IFT_CARP)
return (EEXIST);
else
return (0);
}
}
/*
* No one seems to have this prefix route, so we try to insert it.
*/
error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags);
if (!error)
target->ia_flags |= IFA_ROUTE;
return error;
}
/*
* If there is no other address in the system that can serve a route to the
* same prefix, remove the route. Hand over the route to the new address
* otherwise.
*/
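/*
 * For example (hypothetical addresses), if 192.0.2.1/24 on one interface
 * and 192.0.2.2/24 on another both cover the 192.0.2.0/24 prefix, and the
 * address holding IFA_ROUTE is deleted, the prefix route is handed over
 * to the surviving address instead of being removed.
 */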
static int
in_scrubprefix(struct in_ifaddr *target)
{
+ INIT_VNET_INET(curvnet);
struct in_ifaddr *ia;
struct in_addr prefix, mask, p;
int error;
if ((target->ia_flags & IFA_ROUTE) == 0)
return 0;
if (rtinitflags(target))
prefix = target->ia_dstaddr.sin_addr;
else {
prefix = target->ia_addr.sin_addr;
mask = target->ia_sockmask.sin_addr;
prefix.s_addr &= mask.s_addr;
}
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia))
p = ia->ia_dstaddr.sin_addr;
else {
p = ia->ia_addr.sin_addr;
p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
}
if (prefix.s_addr != p.s_addr)
continue;
/*
* If we got a matching prefix address, move IFA_ROUTE and
* the route itself to it. Make sure that routing daemons
* get a heads-up.
*
* XXX: a special case for carp(4) interface
*/
if ((ia->ia_flags & IFA_ROUTE) == 0
#ifdef DEV_CARP
&& (ia->ia_ifp->if_type != IFT_CARP)
#endif
) {
rtinit(&(target->ia_ifa), (int)RTM_DELETE,
rtinitflags(target));
target->ia_flags &= ~IFA_ROUTE;
error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
rtinitflags(ia) | RTF_UP);
if (error == 0)
ia->ia_flags |= IFA_ROUTE;
return error;
}
}
/*
* As no one seems to have this prefix, we can remove the route.
*/
rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target));
target->ia_flags &= ~IFA_ROUTE;
return 0;
}
#undef rtinitflags
/*
* Return 1 if the address might be a local broadcast address.
*/
int
in_broadcast(struct in_addr in, struct ifnet *ifp)
{
register struct ifaddr *ifa;
u_long t;
if (in.s_addr == INADDR_BROADCAST ||
in.s_addr == INADDR_ANY)
return 1;
if ((ifp->if_flags & IFF_BROADCAST) == 0)
return 0;
t = ntohl(in.s_addr);
/*
* Look through the list of addresses for a match
* with a broadcast address.
*/
#define ia ((struct in_ifaddr *)ifa)
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
(in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
in.s_addr == ia->ia_netbroadcast.s_addr ||
/*
* Check for old-style (host 0) broadcast.
*/
t == ia->ia_subnet || t == ia->ia_net) &&
/*
* Check for an all-ones subnetmask. These
* only exist when an interface gets a secondary
* address.
*/
ia->ia_subnetmask != (u_long)0xffffffff)
return 1;
return (0);
#undef ia
}
/*
* Delete all IPv4 multicast address records, and associated link-layer
* multicast address records, associated with ifp.
*/
static void
in_purgemaddrs(struct ifnet *ifp)
{
+ INIT_VNET_INET(ifp->if_vnet);
struct in_multi *inm;
struct in_multi *oinm;
#ifdef DIAGNOSTIC
printf("%s: purging ifp %p\n", __func__, ifp);
#endif
IFF_LOCKGIANT(ifp);
IN_MULTI_LOCK();
LIST_FOREACH_SAFE(inm, &V_in_multihead, inm_link, oinm) {
if (inm->inm_ifp == ifp)
in_delmulti_locked(inm);
}
IN_MULTI_UNLOCK();
IFF_UNLOCKGIANT(ifp);
}
/*
* On interface removal, clean up IPv4 data structures hung off of the ifnet.
*/
void
in_ifdetach(struct ifnet *ifp)
{
+ INIT_VNET_INET(ifp->if_vnet);
in_pcbpurgeif0(&V_ripcbinfo, ifp);
in_pcbpurgeif0(&V_udbinfo, ifp);
in_purgemaddrs(ifp);
}
Index: head/sys/netinet/in.h
===================================================================
--- head/sys/netinet/in.h (revision 183549)
+++ head/sys/netinet/in.h (revision 183550)
@@ -1,746 +1,750 @@
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.h 8.3 (Berkeley) 1/3/94
* $FreeBSD$
*/
#ifndef _NETINET_IN_H_
#define _NETINET_IN_H_
#include <sys/cdefs.h>
#include <sys/_types.h>
#include <machine/endian.h>
/* Protocols common to RFC 1700, POSIX, and X/Open. */
#define IPPROTO_IP 0 /* dummy for IP */
#define IPPROTO_ICMP 1 /* control message protocol */
#define IPPROTO_TCP 6 /* tcp */
#define IPPROTO_UDP 17 /* user datagram protocol */
#define INADDR_ANY (u_int32_t)0x00000000
#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */
#ifndef _UINT8_T_DECLARED
typedef __uint8_t uint8_t;
#define _UINT8_T_DECLARED
#endif
#ifndef _UINT16_T_DECLARED
typedef __uint16_t uint16_t;
#define _UINT16_T_DECLARED
#endif
#ifndef _UINT32_T_DECLARED
typedef __uint32_t uint32_t;
#define _UINT32_T_DECLARED
#endif
#ifndef _IN_ADDR_T_DECLARED
typedef uint32_t in_addr_t;
#define _IN_ADDR_T_DECLARED
#endif
#ifndef _IN_PORT_T_DECLARED
typedef uint16_t in_port_t;
#define _IN_PORT_T_DECLARED
#endif
#ifndef _SA_FAMILY_T_DECLARED
typedef __sa_family_t sa_family_t;
#define _SA_FAMILY_T_DECLARED
#endif
/* Internet address (a structure for historical reasons). */
#ifndef _STRUCT_IN_ADDR_DECLARED
struct in_addr {
in_addr_t s_addr;
};
#define _STRUCT_IN_ADDR_DECLARED
#endif
#ifndef _SOCKLEN_T_DECLARED
typedef __socklen_t socklen_t;
#define _SOCKLEN_T_DECLARED
#endif
/* Avoid collision with original definition in sys/socket.h. */
#ifndef _STRUCT_SOCKADDR_STORAGE_DECLARED
/*
* RFC 2553: protocol-independent placeholder for socket addresses
*/
#define _SS_MAXSIZE 128U
#define _SS_ALIGNSIZE (sizeof(__int64_t))
#define _SS_PAD1SIZE (_SS_ALIGNSIZE - sizeof(unsigned char) - \
sizeof(sa_family_t))
#define _SS_PAD2SIZE (_SS_MAXSIZE - sizeof(unsigned char) - \
sizeof(sa_family_t) - _SS_PAD1SIZE - _SS_ALIGNSIZE)
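/*
 * For example, where __int64_t is 8 bytes and sa_family_t is 1 byte,
 * _SS_PAD1SIZE = 8 - 1 - 1 = 6 and _SS_PAD2SIZE = 128 - 1 - 1 - 6 - 8 = 112,
 * so the structure below totals exactly _SS_MAXSIZE (128) bytes.
 */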
struct sockaddr_storage {
unsigned char ss_len; /* address length */
sa_family_t ss_family; /* address family */
char __ss_pad1[_SS_PAD1SIZE];
__int64_t __ss_align; /* force desired struct alignment */
char __ss_pad2[_SS_PAD2SIZE];
};
#define _STRUCT_SOCKADDR_STORAGE_DECLARED
#endif
/* Socket address, internet style. */
struct sockaddr_in {
uint8_t sin_len;
sa_family_t sin_family;
in_port_t sin_port;
struct in_addr sin_addr;
char sin_zero[8];
};
#ifndef _KERNEL
#ifndef _BYTEORDER_PROTOTYPED
#define _BYTEORDER_PROTOTYPED
__BEGIN_DECLS
uint32_t htonl(uint32_t);
uint16_t htons(uint16_t);
uint32_t ntohl(uint32_t);
uint16_t ntohs(uint16_t);
__END_DECLS
#endif
#ifndef _BYTEORDER_FUNC_DEFINED
#define _BYTEORDER_FUNC_DEFINED
#define htonl(x) __htonl(x)
#define htons(x) __htons(x)
#define ntohl(x) __ntohl(x)
#define ntohs(x) __ntohs(x)
#endif
#endif /* !_KERNEL */
#if __POSIX_VISIBLE >= 200112
#define IPPROTO_RAW 255 /* raw IP packet */
#define INET_ADDRSTRLEN 16
#endif
#if __BSD_VISIBLE
/*
* Constants and structures defined by the internet system,
* per RFC 790, September 1981, and numerous additions.
*/
/*
* Protocols (RFC 1700)
*/
#define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */
#define IPPROTO_IGMP 2 /* group mgmt protocol */
#define IPPROTO_GGP 3 /* gateway^2 (deprecated) */
#define IPPROTO_IPV4 4 /* IPv4 encapsulation */
#define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */
#define IPPROTO_ST 7 /* Stream protocol II */
#define IPPROTO_EGP 8 /* exterior gateway protocol */
#define IPPROTO_PIGP 9 /* private interior gateway */
#define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */
#define IPPROTO_NVPII 11 /* network voice protocol*/
#define IPPROTO_PUP 12 /* pup */
#define IPPROTO_ARGUS 13 /* Argus */
#define IPPROTO_EMCON 14 /* EMCON */
#define IPPROTO_XNET 15 /* Cross Net Debugger */
#define IPPROTO_CHAOS 16 /* Chaos*/
#define IPPROTO_MUX 18 /* Multiplexing */
#define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */
#define IPPROTO_HMP 20 /* Host Monitoring */
#define IPPROTO_PRM 21 /* Packet Radio Measurement */
#define IPPROTO_IDP 22 /* xns idp */
#define IPPROTO_TRUNK1 23 /* Trunk-1 */
#define IPPROTO_TRUNK2 24 /* Trunk-2 */
#define IPPROTO_LEAF1 25 /* Leaf-1 */
#define IPPROTO_LEAF2 26 /* Leaf-2 */
#define IPPROTO_RDP 27 /* Reliable Data */
#define IPPROTO_IRTP 28 /* Reliable Transaction */
#define IPPROTO_TP 29 /* tp-4 w/ class negotiation */
#define IPPROTO_BLT 30 /* Bulk Data Transfer */
#define IPPROTO_NSP 31 /* Network Services */
#define IPPROTO_INP 32 /* Merit Internodal */
#define IPPROTO_SEP 33 /* Sequential Exchange */
#define IPPROTO_3PC 34 /* Third Party Connect */
#define IPPROTO_IDPR 35 /* InterDomain Policy Routing */
#define IPPROTO_XTP 36 /* XTP */
#define IPPROTO_DDP 37 /* Datagram Delivery */
#define IPPROTO_CMTP 38 /* Control Message Transport */
#define IPPROTO_TPXX 39 /* TP++ Transport */
#define IPPROTO_IL 40 /* IL transport protocol */
#define IPPROTO_IPV6 41 /* IP6 header */
#define IPPROTO_SDRP 42 /* Source Demand Routing */
#define IPPROTO_ROUTING 43 /* IP6 routing header */
#define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */
#define IPPROTO_IDRP 45 /* InterDomain Routing*/
#define IPPROTO_RSVP 46 /* resource reservation */
#define IPPROTO_GRE 47 /* General Routing Encap. */
#define IPPROTO_MHRP 48 /* Mobile Host Routing */
#define IPPROTO_BHA 49 /* BHA */
#define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */
#define IPPROTO_AH 51 /* IP6 Auth Header */
#define IPPROTO_INLSP 52 /* Integ. Net Layer Security */
#define IPPROTO_SWIPE 53 /* IP with encryption */
#define IPPROTO_NHRP 54 /* Next Hop Resolution */
#define IPPROTO_MOBILE 55 /* IP Mobility */
#define IPPROTO_TLSP 56 /* Transport Layer Security */
#define IPPROTO_SKIP 57 /* SKIP */
#define IPPROTO_ICMPV6 58 /* ICMP6 */
#define IPPROTO_NONE 59 /* IP6 no next header */
#define IPPROTO_DSTOPTS 60 /* IP6 destination option */
#define IPPROTO_AHIP 61 /* any host internal protocol */
#define IPPROTO_CFTP 62 /* CFTP */
#define IPPROTO_HELLO 63 /* "hello" routing protocol */
#define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */
#define IPPROTO_KRYPTOLAN 65 /* Kryptolan */
#define IPPROTO_RVD 66 /* Remote Virtual Disk */
#define IPPROTO_IPPC 67 /* Pluribus Packet Core */
#define IPPROTO_ADFS 68 /* Any distributed FS */
#define IPPROTO_SATMON 69 /* Satnet Monitoring */
#define IPPROTO_VISA 70 /* VISA Protocol */
#define IPPROTO_IPCV 71 /* Packet Core Utility */
#define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */
#define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */
#define IPPROTO_WSN 74 /* Wang Span Network */
#define IPPROTO_PVP 75 /* Packet Video Protocol */
#define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */
#define IPPROTO_ND 77 /* Sun net disk proto (temp.) */
#define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */
#define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */
#define IPPROTO_EON 80 /* ISO cnlp */
#define IPPROTO_VMTP 81 /* VMTP */
#define IPPROTO_SVMTP 82 /* Secure VMTP */
#define IPPROTO_VINES 83 /* Banyan VINES */
#define IPPROTO_TTP 84 /* TTP */
#define IPPROTO_IGP 85 /* NSFNET-IGP */
#define IPPROTO_DGP 86 /* dissimilar gateway prot. */
#define IPPROTO_TCF 87 /* TCF */
#define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */
#define IPPROTO_OSPFIGP 89 /* OSPFIGP */
#define IPPROTO_SRPC 90 /* Sprite RPC protocol */
#define IPPROTO_LARP 91 /* Locus Address Resolution */
#define IPPROTO_MTP 92 /* Multicast Transport */
#define IPPROTO_AX25 93 /* AX.25 Frames */
#define IPPROTO_IPEIP 94 /* IP encapsulated in IP */
#define IPPROTO_MICP 95 /* Mobile Int.ing control */
#define IPPROTO_SCCSP 96 /* Semaphore Comm. security */
#define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */
#define IPPROTO_ENCAP 98 /* encapsulation header */
#define IPPROTO_APES 99 /* any private encr. scheme */
#define IPPROTO_GMTP 100 /* GMTP*/
#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */
#define IPPROTO_SCTP 132 /* SCTP */
/* 101-254: Partly Unassigned */
#define IPPROTO_PIM 103 /* Protocol Independent Mcast */
#define IPPROTO_CARP 112 /* CARP */
#define IPPROTO_PGM 113 /* PGM */
#define IPPROTO_PFSYNC 240 /* PFSYNC */
/* 255: Reserved */
/* BSD Private, local use, namespace incursion, no longer used */
#define IPPROTO_OLD_DIVERT 254 /* OLD divert pseudo-proto */
#define IPPROTO_MAX 256
/* last return value of *_input(), meaning "all work for this pkt is done". */
#define IPPROTO_DONE 257
/* Only used internally, so can be outside the range of valid IP protocols. */
#define IPPROTO_DIVERT 258 /* divert pseudo-protocol */
/*
* Defined to avoid confusion. The master value is defined by
* PROTO_SPACER in sys/protosw.h.
*/
#define IPPROTO_SPACER 32767 /* spacer for loadable protos */
/*
* Local port number conventions:
*
* When a user does a bind(2) or connect(2) with a port number of zero,
* a non-conflicting local port address is chosen.
* The default range is IPPORT_HIFIRSTAUTO through
* IPPORT_HILASTAUTO, although that is settable by sysctl.
*
* A user may set the IPPROTO_IP option IP_PORTRANGE to change this
* default assignment range.
*
* The value IP_PORTRANGE_DEFAULT causes the default behavior.
*
* The value IP_PORTRANGE_HIGH changes the range of candidate port numbers
* into the "high" range. These are reserved for client outbound connections
* which do not want to be filtered by any firewalls.
*
* The value IP_PORTRANGE_LOW changes the range to the "low" area
* that is (by convention) restricted to privileged processes. This
* convention is based on "vouchsafe" principles only. It is only secure
* if you trust the remote host to restrict these ports.
*
* The default range of ports and the high range can be changed by
* sysctl(3). (net.inet.ip.port{hi,low}{first,last}_auto)
*
* Changing those values has bad security implications if you are
* using a stateless firewall that is allowing packets outside of that
* range in order to allow transparent outgoing connections.
*
* Such a firewall configuration will generally depend on the use of these
* default values. If you change them, you may find your Security
* Administrator looking for you with a heavy object.
*
* For a slightly more orthodox text view on this:
*
* ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers
*
* port numbers are divided into three ranges:
*
* 0 - 1023 Well Known Ports
* 1024 - 49151 Registered Ports
* 49152 - 65535 Dynamic and/or Private Ports
*
*/
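/*
 * For illustration only: a minimal sketch (error handling omitted, kept
 * disabled) of a client asking for its unspecified local port to be
 * chosen from the "high" range:
 */
#if 0
	int s = socket(AF_INET, SOCK_STREAM, 0);
	int range = IP_PORTRANGE_HIGH;

	setsockopt(s, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range));
	/* A later bind(2)/connect(2) with port 0 now allocates from
	   IPPORT_HIFIRSTAUTO..IPPORT_HILASTAUTO. */
#endif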
/*
* Ports < IPPORT_RESERVED are reserved for
* privileged processes (e.g. root). (IP_PORTRANGE_LOW)
*/
#define IPPORT_RESERVED 1024
/*
* Default local port range, used by IP_PORTRANGE_DEFAULT
*/
#define IPPORT_EPHEMERALFIRST 10000
#define IPPORT_EPHEMERALLAST 65535
/*
* Dynamic port range, used by IP_PORTRANGE_HIGH.
*/
#define IPPORT_HIFIRSTAUTO 49152
#define IPPORT_HILASTAUTO 65535
/*
* Scanning for a free reserved port returns a value below IPPORT_RESERVED,
* but higher than IPPORT_RESERVEDSTART. Traditionally the start value was
* 512, but that conflicts with some well-known services that firewalls
* may have a fit over if we use them.
*/
#define IPPORT_RESERVEDSTART 600
#define IPPORT_MAX 65535
/*
* Definitions of bits in internet address integers.
* On subnets, the decomposition of addresses to host and net parts
* is done according to subnet mask, not the masks here.
*/
#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0)
#define IN_CLASSA_NET 0xff000000
#define IN_CLASSA_NSHIFT 24
#define IN_CLASSA_HOST 0x00ffffff
#define IN_CLASSA_MAX 128
#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000)
#define IN_CLASSB_NET 0xffff0000
#define IN_CLASSB_NSHIFT 16
#define IN_CLASSB_HOST 0x0000ffff
#define IN_CLASSB_MAX 65536
#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000)
#define IN_CLASSC_NET 0xffffff00
#define IN_CLASSC_NSHIFT 8
#define IN_CLASSC_HOST 0x000000ff
#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000)
#define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */
#define IN_CLASSD_NSHIFT 28 /* net and host fields, but */
#define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */
#define IN_MULTICAST(i) IN_CLASSD(i)
#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000)
#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000)
#define IN_LINKLOCAL(i) (((u_int32_t)(i) & 0xffff0000) == 0xa9fe0000)
#define IN_LOOPBACK(i) (((u_int32_t)(i) & 0xff000000) == 0x7f000000)
#define IN_ZERONET(i) (((u_int32_t)(i) & 0xff000000) == 0)
#define IN_PRIVATE(i) ((((u_int32_t)(i) & 0xff000000) == 0x0a000000) || \
(((u_int32_t)(i) & 0xfff00000) == 0xac100000) || \
(((u_int32_t)(i) & 0xffff0000) == 0xc0a80000))
#define IN_LOCAL_GROUP(i) (((u_int32_t)(i) & 0xffffff00) == 0xe0000000)
#define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i))
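/*
 * For example, 10.1.2.3 is 0x0a010203 in host order: its top bit is
 * clear, so IN_CLASSA() is true, and it matches the 10/8 test in
 * IN_PRIVATE(); 169.254.1.2 (0xa9fe0102) satisfies IN_LINKLOCAL().
 */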
#define INADDR_LOOPBACK (u_int32_t)0x7f000001
#ifndef _KERNEL
#define INADDR_NONE 0xffffffff /* -1 return */
#endif
#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */
#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */
#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */
#define INADDR_ALLRPTS_GROUP (u_int32_t)0xe0000016 /* 224.0.0.22, IGMPv3 */
#define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */
#define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */
#define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */
#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */
#define IN_LOOPBACKNET 127 /* official! */
/*
* Options for use with [gs]etsockopt at the IP level.
* First word of comment is data type; bool is stored in int.
*/
#define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */
#define IP_HDRINCL 2 /* int; header is included with data */
#define IP_TOS 3 /* int; IP type of service and preced. */
#define IP_TTL 4 /* int; IP time to live */
#define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */
#define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */
#define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */
#define IP_SENDSRCADDR IP_RECVDSTADDR /* cmsg_type to set src addr */
#define IP_RETOPTS 8 /* ip_opts; set/get IP options */
#define IP_MULTICAST_IF 9 /* struct in_addr *or* struct ip_mreqn;
* set/get IP multicast i/f */
#define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */
#define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */
#define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */
#define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */
#define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */
#define IP_RSVP_ON 15 /* enable RSVP in kernel */
#define IP_RSVP_OFF 16 /* disable RSVP in kernel */
#define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */
#define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */
#define IP_PORTRANGE 19 /* int; range to choose for unspec port */
#define IP_RECVIF 20 /* bool; receive reception if w/dgram */
/* for IPSEC */
#define IP_IPSEC_POLICY 21 /* int; set/get security policy */
#define IP_FAITH 22 /* bool; accept FAITH'ed connections */
#define IP_ONESBCAST 23 /* bool; send all-ones broadcast */
#define IP_FW_TABLE_ADD 40 /* add entry */
#define IP_FW_TABLE_DEL 41 /* delete entry */
#define IP_FW_TABLE_FLUSH 42 /* flush table */
#define IP_FW_TABLE_GETSIZE 43 /* get table size */
#define IP_FW_TABLE_LIST 44 /* list table contents */
#define IP_FW_ADD 50 /* add a firewall rule to chain */
#define IP_FW_DEL 51 /* delete a firewall rule from chain */
#define IP_FW_FLUSH 52 /* flush firewall rule chain */
#define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */
#define IP_FW_GET 54 /* get entire firewall rule chain */
#define IP_FW_RESETLOG 55 /* reset logging counters */
#define IP_FW_NAT_CFG 56 /* add/config a nat rule */
#define IP_FW_NAT_DEL 57 /* delete a nat rule */
#define IP_FW_NAT_GET_CONFIG 58 /* get configuration of a nat rule */
#define IP_FW_NAT_GET_LOG 59 /* get log of a nat rule */
#define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */
#define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */
#define IP_DUMMYNET_FLUSH 62 /* flush dummynet */
#define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */
#define IP_RECVTTL 65 /* bool; receive IP TTL w/dgram */
#define IP_MINTTL 66 /* minimum TTL for packet or drop */
#define IP_DONTFRAG 67 /* don't fragment packet */
/* IPv4 Source Filter Multicast API [RFC3678] */
#define IP_ADD_SOURCE_MEMBERSHIP 70 /* join a source-specific group */
#define IP_DROP_SOURCE_MEMBERSHIP 71 /* drop a single source */
#define IP_BLOCK_SOURCE 72 /* block a source */
#define IP_UNBLOCK_SOURCE 73 /* unblock a source */
/* The following option is private; do not use it from user applications. */
#define IP_MSFILTER 74 /* set/get filter list */
/* Protocol Independent Multicast API [RFC3678] */
#define MCAST_JOIN_GROUP 80 /* join an any-source group */
#define MCAST_LEAVE_GROUP 81 /* leave all sources for group */
#define MCAST_JOIN_SOURCE_GROUP 82 /* join a source-specific group */
#define MCAST_LEAVE_SOURCE_GROUP 83 /* leave a single source */
#define MCAST_BLOCK_SOURCE 84 /* block a source */
#define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */
/*
* Defaults and limits for options
*/
#define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */
#define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */
/*
* The imo_membership vector for each socket is now dynamically allocated at
* run-time, bounded by USHRT_MAX, and is reallocated when needed, sized
* according to a power-of-two increment.
*/
#define IP_MIN_MEMBERSHIPS 31
#define IP_MAX_MEMBERSHIPS 4095
#define IP_MAX_SOURCE_FILTER 1024 /* # of filters per socket, per group */
/*
* Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP.
*/
struct ip_mreq {
struct in_addr imr_multiaddr; /* IP multicast address of group */
struct in_addr imr_interface; /* local IP address of interface */
};
/*
* Modified argument structure for IP_MULTICAST_IF, obtained from Linux.
* This is used to specify an interface index for multicast sends, as
* the IPv4 legacy APIs do not support this (unless IP_SENDIF is available).
*/
struct ip_mreqn {
struct in_addr imr_multiaddr; /* IP multicast address of group */
struct in_addr imr_address; /* local IP address of interface */
int imr_ifindex; /* Interface index; cast to uint32_t */
};
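/*
 * For illustration only: a minimal sketch (hypothetical interface name,
 * error handling omitted, kept disabled) selecting the outgoing
 * multicast interface by index with the ip_mreqn form:
 */
#if 0
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	struct ip_mreqn mreqn;

	memset(&mreqn, 0, sizeof(mreqn));
	mreqn.imr_ifindex = if_nametoindex("em0");
	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &mreqn, sizeof(mreqn));
#endif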
/*
* Argument structure for IPv4 Multicast Source Filter APIs. [RFC3678]
*/
struct ip_mreq_source {
struct in_addr imr_multiaddr; /* IP multicast address of group */
struct in_addr imr_sourceaddr; /* IP address of source */
struct in_addr imr_interface; /* local IP address of interface */
};
/*
* Argument structures for Protocol-Independent Multicast Source
* Filter APIs. [RFC3678]
*/
struct group_req {
uint32_t gr_interface; /* interface index */
struct sockaddr_storage gr_group; /* group address */
};
struct group_source_req {
uint32_t gsr_interface; /* interface index */
struct sockaddr_storage gsr_group; /* group address */
struct sockaddr_storage gsr_source; /* source address */
};
#ifndef __MSFILTERREQ_DEFINED
#define __MSFILTERREQ_DEFINED
/*
* The following structure is private; do not use it from user applications.
* It is used to communicate IP_MSFILTER/IPV6_MSFILTER information between
* the RFC 3678 libc functions and the kernel.
*/
struct __msfilterreq {
uint32_t msfr_ifindex; /* interface index */
uint32_t msfr_fmode; /* filter mode for group */
uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */
struct sockaddr_storage msfr_group; /* group address */
struct sockaddr_storage *msfr_srcs; /* pointer to the first member
* of a contiguous array of
* sources to filter in full.
*/
};
#endif
struct sockaddr;
/*
* Advanced (Full-state) APIs [RFC3678]
* The RFC specifies uint_t for the 6th argument to [sg]etsourcefilter().
* We use uint32_t here to be consistent.
*/
int setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t,
uint32_t, struct in_addr *);
int getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *,
uint32_t *, struct in_addr *);
int setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
uint32_t, uint32_t, struct sockaddr_storage *);
int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t,
uint32_t *, uint32_t *, struct sockaddr_storage *);
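/*
 * For illustration only: a minimal sketch (hypothetical group, source and
 * interface, error handling omitted, kept disabled) installing an
 * include-mode filter with a single source via the full-state API:
 */
#if 0
	struct sockaddr_in grp;
	struct sockaddr_storage src;
	struct sockaddr_in *psin = (struct sockaddr_in *)&src;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&grp, 0, sizeof(grp));
	grp.sin_family = AF_INET;
	grp.sin_len = sizeof(grp);
	grp.sin_addr.s_addr = inet_addr("233.252.0.1");
	memset(&src, 0, sizeof(src));
	psin->sin_family = AF_INET;
	psin->sin_len = sizeof(*psin);
	psin->sin_addr.s_addr = inet_addr("192.0.2.10");
	setsourcefilter(s, if_nametoindex("em0"), (struct sockaddr *)&grp,
	    sizeof(grp), MCAST_INCLUDE, 1, &src);
#endif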
/*
* Filter modes; also used to represent per-socket filter mode internally.
*/
#define MCAST_INCLUDE 1 /* fmode: include these source(s) */
#define MCAST_EXCLUDE 2 /* fmode: exclude these source(s) */
/*
* Argument for IP_PORTRANGE:
* - which range to search when port is unspecified at bind() or connect()
*/
#define IP_PORTRANGE_DEFAULT 0 /* default range */
#define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */
#define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */
/*
* Definitions for inet sysctl operations.
*
* Third level is protocol number.
* Fourth level is desired variable within that protocol.
*/
#define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list up to IPPROTO_MAX */
#define CTL_IPPROTO_NAMES { \
{ "ip", CTLTYPE_NODE }, \
{ "icmp", CTLTYPE_NODE }, \
{ "igmp", CTLTYPE_NODE }, \
{ "ggp", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "tcp", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ "egp", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "pup", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "udp", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "idp", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "ipsec", CTLTYPE_NODE }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "pim", CTLTYPE_NODE }, \
}
/*
* Names for IP sysctl objects
*/
#define IPCTL_FORWARDING 1 /* act as router */
#define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */
#define IPCTL_DEFTTL 3 /* default TTL */
#ifdef notyet
#define IPCTL_DEFMTU 4 /* default MTU */
#endif
#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */
#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */
#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */
#define IPCTL_SOURCEROUTE 8 /* may perform source routes */
#define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */
#define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */
#define IPCTL_INTRQDROPS 11 /* number of netisr q drops */
#define IPCTL_STATS 12 /* ipstat structure */
#define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */
#define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */
#define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translator ctl */
#define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */
#define IPCTL_MAXID 17
#define IPCTL_NAMES { \
{ 0, 0 }, \
{ "forwarding", CTLTYPE_INT }, \
{ "redirect", CTLTYPE_INT }, \
{ "ttl", CTLTYPE_INT }, \
{ "mtu", CTLTYPE_INT }, \
{ "rtexpire", CTLTYPE_INT }, \
{ "rtminexpire", CTLTYPE_INT }, \
{ "rtmaxcache", CTLTYPE_INT }, \
{ "sourceroute", CTLTYPE_INT }, \
{ "directed-broadcast", CTLTYPE_INT }, \
{ "intr-queue-maxlen", CTLTYPE_INT }, \
{ "intr-queue-drops", CTLTYPE_INT }, \
{ "stats", CTLTYPE_STRUCT }, \
{ "accept_sourceroute", CTLTYPE_INT }, \
{ "fastforwarding", CTLTYPE_INT }, \
}
#endif /* __BSD_VISIBLE */
#ifdef _KERNEL
struct ifnet; struct mbuf; /* forward declarations for Standard C */
int in_broadcast(struct in_addr, struct ifnet *);
int in_canforward(struct in_addr);
int in_localaddr(struct in_addr);
int in_localip(struct in_addr);
char *inet_ntoa(struct in_addr); /* in libkern */
char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */
void in_ifdetach(struct ifnet *);
#define in_hosteq(s, t) ((s).s_addr == (t).s_addr)
#define in_nullhost(x) ((x).s_addr == INADDR_ANY)
#define satosin(sa) ((struct sockaddr_in *)(sa))
#define sintosa(sin) ((struct sockaddr *)(sin))
#define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
#endif /* _KERNEL */
/* INET6 stuff */
#if __POSIX_VISIBLE >= 200112
#define __KAME_NETINET_IN_H_INCLUDED_
#include <netinet6/in6.h>
#undef __KAME_NETINET_IN_H_INCLUDED_
#endif
+#ifdef _KERNEL
+#include <netinet/vinet.h>
+#endif
+
#endif /* !_NETINET_IN_H_*/
Index: head/sys/netinet/in_gif.c
===================================================================
--- head/sys/netinet/in_gif.c (revision 183549)
+++ head/sys/netinet/in_gif.c (revision 183550)
@@ -1,433 +1,435 @@
/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mrouting.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/protosw.h>
-
#include <sys/malloc.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_gif.h>
#include <netinet/in_var.h>
#include <netinet/ip_encap.h>
#include <netinet/ip_ecn.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif /* MROUTING */
#include <net/if_gif.h>
static int gif_validate4(const struct ip *, struct gif_softc *,
struct ifnet *);
extern struct domain inetdomain;
struct protosw in_gif_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = 0/* IPPROTO_IPV[46] */,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = in_gif_input,
.pr_output = (pr_output_t*)rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
static int ip_gif_ttl = GIF_TTL;
-SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW,
- &ip_gif_ttl, 0, "");
+SYSCTL_V_INT(V_NET, vnet_gif, _net_inet_ip, IPCTL_GIF_TTL, gifttl,
+ CTLFLAG_RW, ip_gif_ttl, 0, "");
int
in_gif_output(struct ifnet *ifp, int family, struct mbuf *m)
{
+ INIT_VNET_GIF(ifp->if_vnet);
struct gif_softc *sc = ifp->if_softc;
struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst;
struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc;
struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst;
struct ip iphdr; /* capsule IP header, host byte ordered */
struct etherip_header eiphdr;
int proto, error;
u_int8_t tos;
GIF_LOCK_ASSERT(sc);
if (sin_src == NULL || sin_dst == NULL ||
sin_src->sin_family != AF_INET ||
sin_dst->sin_family != AF_INET) {
m_freem(m);
return EAFNOSUPPORT;
}
switch (family) {
#ifdef INET
case AF_INET:
{
struct ip *ip;
proto = IPPROTO_IPV4;
if (m->m_len < sizeof(*ip)) {
m = m_pullup(m, sizeof(*ip));
if (!m)
return ENOBUFS;
}
ip = mtod(m, struct ip *);
tos = ip->ip_tos;
break;
}
#endif /* INET */
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
proto = IPPROTO_IPV6;
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (!m)
return ENOBUFS;
}
ip6 = mtod(m, struct ip6_hdr *);
tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
break;
}
#endif /* INET6 */
case AF_LINK:
proto = IPPROTO_ETHERIP;
eiphdr.eip_ver = ETHERIP_VERSION & ETHERIP_VER_VERS_MASK;
eiphdr.eip_pad = 0;
/* prepend Ethernet-in-IP header */
M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT);
if (m && m->m_len < sizeof(struct etherip_header))
m = m_pullup(m, sizeof(struct etherip_header));
if (m == NULL)
return ENOBUFS;
bcopy(&eiphdr, mtod(m, struct etherip_header *),
sizeof(struct etherip_header));
break;
default:
#ifdef DEBUG
printf("in_gif_output: warning: unknown family %d passed\n",
family);
#endif
m_freem(m);
return EAFNOSUPPORT;
}
bzero(&iphdr, sizeof(iphdr));
iphdr.ip_src = sin_src->sin_addr;
/* bidirectional configured tunnel mode */
if (sin_dst->sin_addr.s_addr != INADDR_ANY)
iphdr.ip_dst = sin_dst->sin_addr;
else {
m_freem(m);
return ENETUNREACH;
}
iphdr.ip_p = proto;
/* version will be set in ip_output() */
iphdr.ip_ttl = V_ip_gif_ttl;
iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip);
ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE,
&iphdr.ip_tos, &tos);
/* prepend new IP header */
M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
if (m && m->m_len < sizeof(struct ip))
m = m_pullup(m, sizeof(struct ip));
if (m == NULL) {
printf("ENOBUFS in in_gif_output %d\n", __LINE__);
return ENOBUFS;
}
bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip));
M_SETFIB(m, sc->gif_fibnum);
if (dst->sin_family != sin_dst->sin_family ||
dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) {
/* cache route doesn't match */
bzero(dst, sizeof(*dst));
dst->sin_family = sin_dst->sin_family;
dst->sin_len = sizeof(struct sockaddr_in);
dst->sin_addr = sin_dst->sin_addr;
if (sc->gif_ro.ro_rt) {
RTFREE(sc->gif_ro.ro_rt);
sc->gif_ro.ro_rt = NULL;
}
#if 0
GIF2IFP(sc)->if_mtu = GIF_MTU;
#endif
}
if (sc->gif_ro.ro_rt == NULL) {
in_rtalloc_ign(&sc->gif_ro, 0, sc->gif_fibnum);
if (sc->gif_ro.ro_rt == NULL) {
m_freem(m);
return ENETUNREACH;
}
/* if it constitutes infinite encapsulation, punt. */
if (sc->gif_ro.ro_rt->rt_ifp == ifp) {
m_freem(m);
return ENETUNREACH; /* XXX */
}
#if 0
ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu
- sizeof(struct ip);
#endif
}
error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL);
if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) &&
sc->gif_ro.ro_rt != NULL) {
RTFREE(sc->gif_ro.ro_rt);
sc->gif_ro.ro_rt = NULL;
}
return (error);
}
void
in_gif_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET(curvnet);
struct ifnet *gifp = NULL;
struct gif_softc *sc;
struct ip *ip;
int af;
u_int8_t otos;
int proto;
ip = mtod(m, struct ip *);
proto = ip->ip_p;
sc = (struct gif_softc *)encap_getarg(m);
if (sc == NULL) {
m_freem(m);
V_ipstat.ips_nogif++;
return;
}
gifp = GIF2IFP(sc);
if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) {
m_freem(m);
V_ipstat.ips_nogif++;
return;
}
otos = ip->ip_tos;
m_adj(m, off);
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
{
struct ip *ip;
af = AF_INET;
if (m->m_len < sizeof(*ip)) {
m = m_pullup(m, sizeof(*ip));
if (!m)
return;
}
ip = mtod(m, struct ip *);
if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
ECN_ALLOWED : ECN_NOCARE,
&otos, &ip->ip_tos) == 0) {
m_freem(m);
return;
}
break;
}
#endif
#ifdef INET6
case IPPROTO_IPV6:
{
struct ip6_hdr *ip6;
u_int8_t itos, oitos;
af = AF_INET6;
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (!m)
return;
}
ip6 = mtod(m, struct ip6_hdr *);
itos = oitos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
ECN_ALLOWED : ECN_NOCARE,
&otos, &itos) == 0) {
m_freem(m);
return;
}
if (itos != oitos) {
ip6->ip6_flow &= ~htonl(0xff << 20);
ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
}
break;
}
#endif /* INET6 */
case IPPROTO_ETHERIP:
af = AF_LINK;
break;
default:
V_ipstat.ips_nogif++;
m_freem(m);
return;
}
gif_input(m, af, gifp);
return;
}
/*
* Validate the outer address.
*/
static int
gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp)
{
+ INIT_VNET_INET(curvnet);
struct sockaddr_in *src, *dst;
struct in_ifaddr *ia4;
src = (struct sockaddr_in *)sc->gif_psrc;
dst = (struct sockaddr_in *)sc->gif_pdst;
/* check for address match */
if (src->sin_addr.s_addr != ip->ip_dst.s_addr ||
dst->sin_addr.s_addr != ip->ip_src.s_addr)
return 0;
/* martian filters on outer source - NOT done in ip_input! */
if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)))
return 0;
switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) {
case 0: case 127: case 255:
return 0;
}
/* reject packets with broadcast on source */
TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) {
if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0)
continue;
if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr)
return 0;
}
/* ingress filters on outer source */
if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) {
struct sockaddr_in sin;
struct rtentry *rt;
bzero(&sin, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_addr = ip->ip_src;
/* XXX MRT check for the interface we would use on output */
rt = in_rtalloc1((struct sockaddr *)&sin, 0,
0UL, sc->gif_fibnum);
if (!rt || rt->rt_ifp != ifp) {
#if 0
log(LOG_WARNING, "%s: packet from 0x%x dropped "
"due to ingress filter\n", if_name(GIF2IFP(sc)),
(u_int32_t)ntohl(sin.sin_addr.s_addr));
#endif
if (rt)
RTFREE_LOCKED(rt);
return 0;
}
RTFREE_LOCKED(rt);
}
return 32 * 2;
}
/*
* We know that the interface is IFF_UP, the outer address is available,
* and the outer family matched the physical addr family. See
* gif_encapcheck().
*/
int
gif_encapcheck4(const struct mbuf *m, int off, int proto, void *arg)
{
struct ip ip;
struct gif_softc *sc;
struct ifnet *ifp;
/* sanity check done in caller */
sc = (struct gif_softc *)arg;
/* LINTED const cast */
m_copydata(m, 0, sizeof(ip), (caddr_t)&ip);
ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL;
return gif_validate4(&ip, sc, ifp);
}
int
in_gif_attach(struct gif_softc *sc)
{
sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck,
&in_gif_protosw, sc);
if (sc->encap_cookie4 == NULL)
return EEXIST;
return 0;
}
int
in_gif_detach(struct gif_softc *sc)
{
int error;
error = encap_detach(sc->encap_cookie4);
if (error == 0)
sc->encap_cookie4 = NULL;
return error;
}
Index: head/sys/netinet/in_mcast.c
===================================================================
--- head/sys/netinet/in_mcast.c (revision 183549)
+++ head/sys/netinet/in_mcast.c (revision 183550)
@@ -1,1824 +1,1835 @@
/*-
* Copyright (c) 2007 Bruce M. Simpson.
* Copyright (c) 2005 Robert N. M. Watson.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* IPv4 multicast socket, group, and socket option processing module.
* Until further notice, this file requires INET to compile.
* TODO: Make this infrastructure independent of address family.
* TODO: Teach netinet6 to use this code.
* TODO: Hook up SSM logic to IGMPv3/MLDv2.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/igmp_var.h>
#ifndef __SOCKUNION_DECLARED
union sockunion {
struct sockaddr_storage ss;
struct sockaddr sa;
struct sockaddr_dl sdl;
struct sockaddr_in sin;
#ifdef INET6
struct sockaddr_in6 sin6;
#endif
};
typedef union sockunion sockunion_t;
#define __SOCKUNION_DECLARED
#endif /* __SOCKUNION_DECLARED */
static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
static MALLOC_DEFINE(M_IPMSOURCE, "in_msource", "IPv4 multicast source filter");
/*
* The IPv4 multicast list (in_multihead and associated structures) are
* protected by the global in_multi_mtx. See in_var.h for more details. For
* now, in_multi_mtx is marked as recursible due to IGMP's calling back into
* ip_output() to send IGMP packets while holding the lock; this probably is
* not quite desirable.
*/
struct in_multihead in_multihead; /* XXX BSS initialization */
struct mtx in_multi_mtx;
MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF | MTX_RECURSE);
/*
* Functions with non-static linkage defined in this file should be
* declared in in_var.h:
* imo_match_group()
* imo_match_source()
* in_addmulti()
* in_delmulti()
* in_delmulti_locked()
* and ip_var.h:
* inp_freemoptions()
* inp_getmoptions()
* inp_setmoptions()
*/
static int imo_grow(struct ip_moptions *);
static int imo_join_source(struct ip_moptions *, size_t, sockunion_t *);
static int imo_leave_source(struct ip_moptions *, size_t, sockunion_t *);
static int inp_change_source_filter(struct inpcb *, struct sockopt *);
static struct ip_moptions *
inp_findmoptions(struct inpcb *);
static int inp_get_source_filters(struct inpcb *, struct sockopt *);
static int inp_join_group(struct inpcb *, struct sockopt *);
static int inp_leave_group(struct inpcb *, struct sockopt *);
static int inp_set_multicast_if(struct inpcb *, struct sockopt *);
static int inp_set_source_filters(struct inpcb *, struct sockopt *);
/*
* Resize the ip_moptions vector to the next power-of-two minus 1.
* May be called with locks held; do not sleep.
*/
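/*
 * Starting from IP_MIN_MEMBERSHIPS (31), successive calls would grow the
 * vector 31 -> 63 -> 127 -> ... -> 4095 (IP_MAX_MEMBERSHIPS).
 */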
static int
imo_grow(struct ip_moptions *imo)
{
struct in_multi **nmships;
struct in_multi **omships;
struct in_mfilter *nmfilters;
struct in_mfilter *omfilters;
size_t idx;
size_t newmax;
size_t oldmax;
nmships = NULL;
nmfilters = NULL;
omships = imo->imo_membership;
omfilters = imo->imo_mfilters;
oldmax = imo->imo_max_memberships;
newmax = ((oldmax + 1) * 2) - 1;
if (newmax <= IP_MAX_MEMBERSHIPS) {
nmships = (struct in_multi **)realloc(omships,
sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT);
nmfilters = (struct in_mfilter *)realloc(omfilters,
sizeof(struct in_mfilter) * newmax, M_IPMSOURCE, M_NOWAIT);
if (nmships != NULL && nmfilters != NULL) {
/* Initialize newly allocated source filter heads. */
for (idx = oldmax; idx < newmax; idx++) {
nmfilters[idx].imf_fmode = MCAST_EXCLUDE;
nmfilters[idx].imf_nsources = 0;
TAILQ_INIT(&nmfilters[idx].imf_sources);
}
imo->imo_max_memberships = newmax;
imo->imo_membership = nmships;
imo->imo_mfilters = nmfilters;
}
}
if (nmships == NULL || nmfilters == NULL) {
if (nmships != NULL)
free(nmships, M_IPMOPTS);
if (nmfilters != NULL)
free(nmfilters, M_IPMSOURCE);
return (ETOOMANYREFS);
}
return (0);
}
/*
* Add a source to a multicast filter list.
* Assumes the associated inpcb is locked.
*/
static int
imo_join_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src)
{
struct in_msource *ims, *nims;
struct in_mfilter *imf;
KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__));
KASSERT(imo->imo_mfilters != NULL,
("%s: imo_mfilters vector not allocated", __func__));
imf = &imo->imo_mfilters[gidx];
if (imf->imf_nsources == IP_MAX_SOURCE_FILTER)
return (ENOBUFS);
ims = imo_match_source(imo, gidx, &src->sa);
if (ims != NULL)
return (EADDRNOTAVAIL);
/* Do not sleep with inp lock held. */
MALLOC(nims, struct in_msource *, sizeof(struct in_msource),
M_IPMSOURCE, M_NOWAIT | M_ZERO);
if (nims == NULL)
return (ENOBUFS);
nims->ims_addr = src->ss;
TAILQ_INSERT_TAIL(&imf->imf_sources, nims, ims_next);
imf->imf_nsources++;
return (0);
}
static int
imo_leave_source(struct ip_moptions *imo, size_t gidx, sockunion_t *src)
{
struct in_msource *ims;
struct in_mfilter *imf;
KASSERT(src->ss.ss_family == AF_INET, ("%s: !AF_INET", __func__));
KASSERT(imo->imo_mfilters != NULL,
("%s: imo_mfilters vector not allocated", __func__));
imf = &imo->imo_mfilters[gidx];
if (imf->imf_nsources == IP_MAX_SOURCE_FILTER)
return (ENOBUFS);
ims = imo_match_source(imo, gidx, &src->sa);
if (ims == NULL)
return (EADDRNOTAVAIL);
TAILQ_REMOVE(&imf->imf_sources, ims, ims_next);
FREE(ims, M_IPMSOURCE);
imf->imf_nsources--;
return (0);
}
/*
* Find an IPv4 multicast group entry for this ip_moptions instance
* which matches the specified group, and optionally an interface.
* Return its index into the array, or -1 if not found.
*/
size_t
imo_match_group(struct ip_moptions *imo, struct ifnet *ifp,
struct sockaddr *group)
{
sockunion_t *gsa;
struct in_multi **pinm;
int idx;
int nmships;
gsa = (sockunion_t *)group;
/* The imo_membership array may be lazy allocated. */
if (imo->imo_membership == NULL || imo->imo_num_memberships == 0)
return (-1);
nmships = imo->imo_num_memberships;
pinm = &imo->imo_membership[0];
for (idx = 0; idx < nmships; idx++, pinm++) {
if (*pinm == NULL)
continue;
#if 0
printf("%s: trying ifp = %p, inaddr = %s ", __func__,
ifp, inet_ntoa(gsa->sin.sin_addr));
printf("against %p, %s\n",
(*pinm)->inm_ifp, inet_ntoa((*pinm)->inm_addr));
#endif
if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) &&
(*pinm)->inm_addr.s_addr == gsa->sin.sin_addr.s_addr) {
break;
}
}
if (idx >= nmships)
idx = -1;
return (idx);
}
/*
* Find a multicast source entry for this imo which matches
* the given group index for this socket, and source address.
*/
struct in_msource *
imo_match_source(struct ip_moptions *imo, size_t gidx, struct sockaddr *src)
{
struct in_mfilter *imf;
struct in_msource *ims, *pims;
KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
KASSERT(gidx != -1 && gidx < imo->imo_num_memberships,
("%s: invalid index %d\n", __func__, (int)gidx));
/* The imo_mfilters array may be lazy allocated. */
if (imo->imo_mfilters == NULL)
return (NULL);
pims = NULL;
imf = &imo->imo_mfilters[gidx];
TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) {
/*
* Perform bitwise comparison of two IPv4 addresses.
* TODO: Do the same for IPv6.
* Do not use sa_equal() for this as it is not aware of
* deeper structure in sockaddr_in or sockaddr_in6.
*/
if (((struct sockaddr_in *)&ims->ims_addr)->sin_addr.s_addr ==
((struct sockaddr_in *)src)->sin_addr.s_addr) {
pims = ims;
break;
}
}
return (pims);
}
/*
* Join an IPv4 multicast group.
*/
struct in_multi *
in_addmulti(struct in_addr *ap, struct ifnet *ifp)
{
+ INIT_VNET_INET(ifp->if_vnet);
struct in_multi *inm;
inm = NULL;
IFF_LOCKGIANT(ifp);
IN_MULTI_LOCK();
IN_LOOKUP_MULTI(*ap, ifp, inm);
if (inm != NULL) {
/*
* If we already joined this group, just bump the
* refcount and return it.
*/
KASSERT(inm->inm_refcount >= 1,
("%s: bad refcount %d", __func__, inm->inm_refcount));
++inm->inm_refcount;
} else do {
sockunion_t gsa;
struct ifmultiaddr *ifma;
struct in_multi *ninm;
int error;
memset(&gsa, 0, sizeof(gsa));
gsa.sin.sin_family = AF_INET;
gsa.sin.sin_len = sizeof(struct sockaddr_in);
gsa.sin.sin_addr = *ap;
/*
* Check if a link-layer group is already associated
* with this network-layer group on the given ifnet.
* If so, bump the refcount on the existing network-layer
* group association and return it.
*/
error = if_addmulti(ifp, &gsa.sa, &ifma);
if (error)
break;
if (ifma->ifma_protospec != NULL) {
inm = (struct in_multi *)ifma->ifma_protospec;
#ifdef INVARIANTS
if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
inm->inm_addr.s_addr != ap->s_addr)
panic("%s: ifma is inconsistent", __func__);
#endif
++inm->inm_refcount;
break;
}
/*
* A new membership is needed; construct it and
* perform the IGMP join.
*/
ninm = malloc(sizeof(*ninm), M_IPMADDR, M_NOWAIT | M_ZERO);
if (ninm == NULL) {
if_delmulti_ifma(ifma);
break;
}
ninm->inm_addr = *ap;
ninm->inm_ifp = ifp;
ninm->inm_ifma = ifma;
ninm->inm_refcount = 1;
ifma->ifma_protospec = ninm;
LIST_INSERT_HEAD(&V_in_multihead, ninm, inm_link);
igmp_joingroup(ninm);
inm = ninm;
} while (0);
IN_MULTI_UNLOCK();
IFF_UNLOCKGIANT(ifp);
return (inm);
}
/*
* Leave an IPv4 multicast group.
* It is OK to call this routine if the underlying ifnet went away.
*
* XXX: To deal with the ifp going away, we cheat; the link-layer code in net
* will set ifma_ifp to NULL when the associated ifnet instance is detached
* from the system.
*
* The only reason we need to violate layers and check ifma_ifp here at all
* is because certain hardware drivers still require Giant to be held,
* and it must always be taken before other locks.
*/
void
in_delmulti(struct in_multi *inm)
{
struct ifnet *ifp;
KASSERT(inm != NULL, ("%s: inm is NULL", __func__));
KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
ifp = inm->inm_ifma->ifma_ifp;
if (ifp != NULL) {
/*
* Sanity check that netinet's notion of ifp is the
* same as net's.
*/
KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
IFF_LOCKGIANT(ifp);
}
IN_MULTI_LOCK();
in_delmulti_locked(inm);
IN_MULTI_UNLOCK();
if (ifp != NULL)
IFF_UNLOCKGIANT(ifp);
}
/*
* Delete a multicast address record, with locks held.
*
* It is OK to call this routine if the ifp went away.
* Assumes that caller holds the IN_MULTI lock, and that
* Giant was taken before other locks if required by the hardware.
*/
void
in_delmulti_locked(struct in_multi *inm)
{
struct ifmultiaddr *ifma;
IN_MULTI_LOCK_ASSERT();
KASSERT(inm->inm_refcount >= 1, ("%s: freeing freed inm", __func__));
if (--inm->inm_refcount == 0) {
igmp_leavegroup(inm);
ifma = inm->inm_ifma;
#ifdef DIAGNOSTIC
if (bootverbose)
printf("%s: purging ifma %p\n", __func__, ifma);
#endif
KASSERT(ifma->ifma_protospec == inm,
("%s: ifma_protospec != inm", __func__));
ifma->ifma_protospec = NULL;
LIST_REMOVE(inm, inm_link);
free(inm, M_IPMADDR);
if_delmulti_ifma(ifma);
}
}
/*
* Block or unblock an ASM/SSM multicast source on an inpcb.
*/
static int
inp_change_source_filter(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET(curvnet);
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in_mfilter *imf;
struct ip_moptions *imo;
struct in_msource *ims;
size_t idx;
int error;
int block;
ifp = NULL;
error = 0;
block = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
ssa = (sockunion_t *)&gsr.gsr_source;
switch (sopt->sopt_name) {
case IP_BLOCK_SOURCE:
case IP_UNBLOCK_SOURCE: {
struct ip_mreq_source mreqs;
error = sooptcopyin(sopt, &mreqs,
sizeof(struct ip_mreq_source),
sizeof(struct ip_mreq_source));
if (error)
return (error);
gsa->sin.sin_family = AF_INET;
gsa->sin.sin_len = sizeof(struct sockaddr_in);
gsa->sin.sin_addr = mreqs.imr_multiaddr;
ssa->sin.sin_family = AF_INET;
ssa->sin.sin_len = sizeof(struct sockaddr_in);
ssa->sin.sin_addr = mreqs.imr_sourceaddr;
if (mreqs.imr_interface.s_addr != INADDR_ANY)
INADDR_TO_IFP(mreqs.imr_interface, ifp);
if (sopt->sopt_name == IP_BLOCK_SOURCE)
block = 1;
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: imr_interface = %s, ifp = %p\n",
__func__, inet_ntoa(mreqs.imr_interface), ifp);
}
#endif
break;
}
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
if (error)
return (error);
if (gsa->sin.sin_family != AF_INET ||
gsa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
if (ssa->sin.sin_family != AF_INET ||
ssa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
block = 1;
break;
default:
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: unknown sopt_name %d\n", __func__,
sopt->sopt_name);
}
#endif
return (EOPNOTSUPP);
break;
}
/* XXX INET6 */
if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
return (EINVAL);
/*
* Check if we are actually a member of this group.
*/
imo = inp_findmoptions(inp);
idx = imo_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->imo_mfilters == NULL) {
error = EADDRNOTAVAIL;
goto out_locked;
}
KASSERT(imo->imo_mfilters != NULL,
("%s: imo_mfilters not allocated", __func__));
imf = &imo->imo_mfilters[idx];
/*
* SSM multicast truth table for block/unblock operations.
*
* Operation Filter Mode Entry exists? Action
*
* block exclude no add source to filter
* unblock include no add source to filter
* block include no EINVAL
* unblock exclude no EINVAL
* block exclude yes EADDRNOTAVAIL
* unblock include yes EADDRNOTAVAIL
* block include yes remove source from filter
* unblock exclude yes remove source from filter
*
* FreeBSD does not explicitly distinguish between ASM and SSM
* mode sockets; all sockets are assumed to have a filter list.
*/
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: imf_fmode is %s\n", __func__,
imf->imf_fmode == MCAST_INCLUDE ? "include" : "exclude");
}
#endif
ims = imo_match_source(imo, idx, &ssa->sa);
if (ims == NULL) {
if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) ||
(block == 0 && imf->imf_fmode == MCAST_INCLUDE)) {
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: adding %s to filter list\n",
__func__, inet_ntoa(ssa->sin.sin_addr));
}
#endif
error = imo_join_source(imo, idx, ssa);
}
if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) ||
(block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) {
/*
* If the socket is in inclusive mode:
* the source is already blocked as it has no entry.
* If the socket is in exclusive mode:
* the source is already unblocked as it has no entry.
*/
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: ims %p; %s already [un]blocked\n",
__func__, ims,
inet_ntoa(ssa->sin.sin_addr));
}
#endif
error = EINVAL;
}
} else {
if ((block == 1 && imf->imf_fmode == MCAST_EXCLUDE) ||
(block == 0 && imf->imf_fmode == MCAST_INCLUDE)) {
/*
* If the socket is in exclusive mode:
* the source is already blocked as it has an entry.
* If the socket is in inclusive mode:
* the source is already unblocked as it has an entry.
*/
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: ims %p; %s already [un]blocked\n",
__func__, ims,
inet_ntoa(ssa->sin.sin_addr));
}
#endif
error = EADDRNOTAVAIL;
}
if ((block == 1 && imf->imf_fmode == MCAST_INCLUDE) ||
(block == 0 && imf->imf_fmode == MCAST_EXCLUDE)) {
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: removing %s from filter list\n",
__func__, inet_ntoa(ssa->sin.sin_addr));
}
#endif
error = imo_leave_source(imo, idx, ssa);
}
}
out_locked:
INP_WUNLOCK(inp);
return (error);
}
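Per the truth table above, blocking a source on a membership still in the default MCAST_EXCLUDE (ASM) mode adds that source to the filter list. A hedged user-space sketch (assumes s is a UDP socket that has already joined the group; both addresses are placeholders):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <string.h>

static void
block_sender(int s)
{
	struct ip_mreq_source mreqs;

	memset(&mreqs, 0, sizeof(mreqs));
	inet_pton(AF_INET, "239.1.1.1", &mreqs.imr_multiaddr);	/* group */
	inet_pton(AF_INET, "192.0.2.10", &mreqs.imr_sourceaddr);/* sender */
	mreqs.imr_interface.s_addr = htonl(INADDR_ANY);
	if (setsockopt(s, IPPROTO_IP, IP_BLOCK_SOURCE,
	    &mreqs, sizeof(mreqs)) == -1)
		err(1, "IP_BLOCK_SOURCE");
}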
/*
* Given an inpcb, return its multicast options structure pointer. Accepts
* an unlocked inpcb pointer, but will return it locked. May sleep.
*/
static struct ip_moptions *
inp_findmoptions(struct inpcb *inp)
{
struct ip_moptions *imo;
struct in_multi **immp;
struct in_mfilter *imfp;
size_t idx;
INP_WLOCK(inp);
if (inp->inp_moptions != NULL)
return (inp->inp_moptions);
INP_WUNLOCK(inp);
imo = (struct ip_moptions *)malloc(sizeof(*imo), M_IPMOPTS,
M_WAITOK);
immp = (struct in_multi **)malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS,
M_IPMOPTS, M_WAITOK | M_ZERO);
imfp = (struct in_mfilter *)malloc(
sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS,
M_IPMSOURCE, M_WAITOK);
imo->imo_multicast_ifp = NULL;
imo->imo_multicast_addr.s_addr = INADDR_ANY;
imo->imo_multicast_vif = -1;
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
imo->imo_num_memberships = 0;
imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
imo->imo_membership = immp;
/* Initialize per-group source filters. */
for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) {
imfp[idx].imf_fmode = MCAST_EXCLUDE;
imfp[idx].imf_nsources = 0;
TAILQ_INIT(&imfp[idx].imf_sources);
}
imo->imo_mfilters = imfp;
INP_WLOCK(inp);
if (inp->inp_moptions != NULL) {
free(imfp, M_IPMSOURCE);
free(immp, M_IPMOPTS);
free(imo, M_IPMOPTS);
return (inp->inp_moptions);
}
inp->inp_moptions = imo;
return (imo);
}
/*
* Discard the IP multicast options (and source filters).
*/
void
inp_freemoptions(struct ip_moptions *imo)
{
struct in_mfilter *imf;
struct in_msource *ims, *tims;
size_t idx, nmships;
KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__));
nmships = imo->imo_num_memberships;
for (idx = 0; idx < nmships; ++idx) {
in_delmulti(imo->imo_membership[idx]);
if (imo->imo_mfilters != NULL) {
imf = &imo->imo_mfilters[idx];
TAILQ_FOREACH_SAFE(ims, &imf->imf_sources,
ims_next, tims) {
TAILQ_REMOVE(&imf->imf_sources, ims, ims_next);
FREE(ims, M_IPMSOURCE);
imf->imf_nsources--;
}
KASSERT(imf->imf_nsources == 0,
("%s: did not free all imf_nsources", __func__));
}
}
if (imo->imo_mfilters != NULL)
free(imo->imo_mfilters, M_IPMSOURCE);
free(imo->imo_membership, M_IPMOPTS);
free(imo, M_IPMOPTS);
}
/*
* Atomically get source filters on a socket for an IPv4 multicast group.
* Called with INP lock held; returns with lock released.
*/
static int
inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_NET(curvnet);
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
struct ip_moptions *imo;
struct in_mfilter *imf;
struct in_msource *ims;
struct sockaddr_storage *ptss;
struct sockaddr_storage *tss;
int error;
size_t idx;
INP_WLOCK_ASSERT(inp);
imo = inp->inp_moptions;
KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
sizeof(struct __msfilterreq));
if (error)
return (error);
if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
return (EINVAL);
ifp = ifnet_byindex(msfr.msfr_ifindex);
if (ifp == NULL)
return (EINVAL);
INP_WLOCK(inp);
/*
* Lookup group on the socket.
*/
gsa = (sockunion_t *)&msfr.msfr_group;
idx = imo_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->imo_mfilters == NULL) {
INP_WUNLOCK(inp);
return (EADDRNOTAVAIL);
}
imf = &imo->imo_mfilters[idx];
msfr.msfr_fmode = imf->imf_fmode;
msfr.msfr_nsrcs = imf->imf_nsources;
/*
* If the user specified a buffer, copy out the source filter
* entries to userland gracefully.
* msfr.msfr_nsrcs is always set to the total number of filter
* entries which the kernel currently has for this group.
*/
tss = NULL;
if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
/*
* Make a copy of the source vector so that we do not
* thrash the inpcb lock whilst copying it out.
* We only copy out the number of entries which userland
* has asked for, but we always tell userland how big the
* buffer really needs to be.
*/
MALLOC(tss, struct sockaddr_storage *,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
M_TEMP, M_NOWAIT);
if (tss == NULL) {
error = ENOBUFS;
} else {
ptss = tss;
TAILQ_FOREACH(ims, &imf->imf_sources, ims_next) {
memcpy(ptss++, &ims->ims_addr,
sizeof(struct sockaddr_storage));
}
}
}
INP_WUNLOCK(inp);
if (tss != NULL) {
error = copyout(tss, msfr.msfr_srcs,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
FREE(tss, M_TEMP);
}
if (error)
return (error);
error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
return (error);
}
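Because msfr_nsrcs is always rewritten to the kernel's total entry count, userland can size its buffer in two passes. A sketch under the assumptions that struct __msfilterreq is visible to user code and that "em0" names a real interface (note the count may change between the two calls):

#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <err.h>
#include <stdlib.h>
#include <string.h>

static void
fetch_filter(int s, const struct sockaddr_in *group)
{
	struct __msfilterreq msfr;
	socklen_t len = sizeof(msfr);

	memset(&msfr, 0, sizeof(msfr));
	msfr.msfr_ifindex = if_nametoindex("em0");	/* placeholder */
	memcpy(&msfr.msfr_group, group, sizeof(*group));
	/* First pass: learn how many entries the kernel holds. */
	if (getsockopt(s, IPPROTO_IP, IP_MSFILTER, &msfr, &len) == -1)
		err(1, "IP_MSFILTER (size)");
	if (msfr.msfr_nsrcs == 0)
		return;
	msfr.msfr_srcs = calloc(msfr.msfr_nsrcs,
	    sizeof(struct sockaddr_storage));
	if (msfr.msfr_srcs == NULL)
		err(1, "calloc");
	/* Second pass: fetch the entries themselves. */
	if (getsockopt(s, IPPROTO_IP, IP_MSFILTER, &msfr, &len) == -1)
		err(1, "IP_MSFILTER (fetch)");
	free(msfr.msfr_srcs);
}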
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_INET(curvnet);
struct ip_mreqn mreqn;
struct ip_moptions *imo;
struct ifnet *ifp;
struct in_ifaddr *ia;
int error, optval;
u_char coptval;
INP_WLOCK(inp);
imo = inp->inp_moptions;
/*
* If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
* or is a divert socket, reject it.
*/
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
INP_WUNLOCK(inp);
return (EOPNOTSUPP);
}
error = 0;
switch (sopt->sopt_name) {
case IP_MULTICAST_VIF:
if (imo != NULL)
optval = imo->imo_multicast_vif;
else
optval = -1;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof(int));
break;
case IP_MULTICAST_IF:
memset(&mreqn, 0, sizeof(struct ip_mreqn));
if (imo != NULL) {
ifp = imo->imo_multicast_ifp;
if (imo->imo_multicast_addr.s_addr != INADDR_ANY) {
mreqn.imr_address = imo->imo_multicast_addr;
} else if (ifp != NULL) {
mreqn.imr_ifindex = ifp->if_index;
IFP_TO_IA(ifp, ia);
if (ia != NULL) {
mreqn.imr_address =
IA_SIN(ia)->sin_addr;
}
}
}
INP_WUNLOCK(inp);
if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
error = sooptcopyout(sopt, &mreqn,
sizeof(struct ip_mreqn));
} else {
error = sooptcopyout(sopt, &mreqn.imr_address,
sizeof(struct in_addr));
}
break;
case IP_MULTICAST_TTL:
if (imo == NULL)
optval = coptval = IP_DEFAULT_MULTICAST_TTL;
else
optval = coptval = imo->imo_multicast_ttl;
INP_WUNLOCK(inp);
if (sopt->sopt_valsize == sizeof(u_char))
error = sooptcopyout(sopt, &coptval, sizeof(u_char));
else
error = sooptcopyout(sopt, &optval, sizeof(int));
break;
case IP_MULTICAST_LOOP:
if (imo == NULL)
optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
else
optval = coptval = imo->imo_multicast_loop;
INP_WUNLOCK(inp);
if (sopt->sopt_valsize == sizeof(u_char))
error = sooptcopyout(sopt, &coptval, sizeof(u_char));
else
error = sooptcopyout(sopt, &optval, sizeof(int));
break;
case IP_MSFILTER:
if (imo == NULL) {
error = EADDRNOTAVAIL;
INP_WUNLOCK(inp);
} else {
error = inp_get_source_filters(inp, sopt);
}
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
break;
}
INP_UNLOCK_ASSERT(inp);
return (error);
}
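As the TTL and loopback cases show, these options answer with either a u_char or an int, keyed off the caller's buffer size. A small sketch querying the TTL through the int-sized path (assumes s is an open UDP socket):

#include <sys/socket.h>
#include <netinet/in.h>
#include <err.h>

static int
get_mcast_ttl(int s)
{
	int ttl;
	socklen_t len = sizeof(ttl);	/* int-sized: selects the int path */

	if (getsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, &len) == -1)
		err(1, "IP_MULTICAST_TTL");
	return (ttl);
}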
/*
* Join an IPv4 multicast group, possibly with a source.
*/
static int
inp_join_group(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET(curvnet);
struct group_source_req gsr;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in_mfilter *imf;
struct ip_moptions *imo;
struct in_multi *inm;
size_t idx;
int error;
ifp = NULL;
error = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
gsa->ss.ss_family = AF_UNSPEC;
ssa = (sockunion_t *)&gsr.gsr_source;
ssa->ss.ss_family = AF_UNSPEC;
switch (sopt->sopt_name) {
case IP_ADD_MEMBERSHIP:
case IP_ADD_SOURCE_MEMBERSHIP: {
struct ip_mreq_source mreqs;
if (sopt->sopt_name == IP_ADD_MEMBERSHIP) {
error = sooptcopyin(sopt, &mreqs,
sizeof(struct ip_mreq),
sizeof(struct ip_mreq));
/*
* Do argument switcharoo from ip_mreq into
* ip_mreq_source to avoid using two instances.
*/
mreqs.imr_interface = mreqs.imr_sourceaddr;
mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
} else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
error = sooptcopyin(sopt, &mreqs,
sizeof(struct ip_mreq_source),
sizeof(struct ip_mreq_source));
}
if (error)
return (error);
gsa->sin.sin_family = AF_INET;
gsa->sin.sin_len = sizeof(struct sockaddr_in);
gsa->sin.sin_addr = mreqs.imr_multiaddr;
if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
ssa->sin.sin_family = AF_INET;
ssa->sin.sin_len = sizeof(struct sockaddr_in);
ssa->sin.sin_addr = mreqs.imr_sourceaddr;
}
/*
* Obtain ifp. If no interface address was provided,
* use the interface of the route in the unicast FIB for
* the given multicast destination; usually, this is the
* default route.
* If this lookup fails, attempt to use the first non-loopback
* interface with multicast capability in the system as a
* last resort. The legacy IPv4 ASM API requires that we do
* this in order to allow groups to be joined when the routing
* table has not yet been populated during boot.
* If all of these conditions fail, return EADDRNOTAVAIL, and
* reject the IPv4 multicast join.
*/
if (mreqs.imr_interface.s_addr != INADDR_ANY) {
INADDR_TO_IFP(mreqs.imr_interface, ifp);
} else {
struct route ro;
ro.ro_rt = NULL;
*(struct sockaddr_in *)&ro.ro_dst = gsa->sin;
in_rtalloc_ign(&ro, RTF_CLONING,
inp->inp_inc.inc_fibnum);
if (ro.ro_rt != NULL) {
ifp = ro.ro_rt->rt_ifp;
KASSERT(ifp != NULL, ("%s: null ifp",
__func__));
RTFREE(ro.ro_rt);
} else {
struct in_ifaddr *ia;
struct ifnet *mfp = NULL;
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
mfp = ia->ia_ifp;
if (!(mfp->if_flags & IFF_LOOPBACK) &&
(mfp->if_flags & IFF_MULTICAST)) {
ifp = mfp;
break;
}
}
}
}
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: imr_interface = %s, ifp = %p\n",
__func__, inet_ntoa(mreqs.imr_interface), ifp);
}
#endif
break;
}
case MCAST_JOIN_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
if (sopt->sopt_name == MCAST_JOIN_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_req),
sizeof(struct group_req));
} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
}
if (error)
return (error);
if (gsa->sin.sin_family != AF_INET ||
gsa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
/*
* Overwrite the port field if present, as the sockaddr
* being copied in may be matched with a binary comparison.
* XXX INET6
*/
gsa->sin.sin_port = 0;
if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
if (ssa->sin.sin_family != AF_INET ||
ssa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
ssa->sin.sin_port = 0;
}
/*
* Obtain the ifp.
*/
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
break;
default:
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: unknown sopt_name %d\n", __func__,
sopt->sopt_name);
}
#endif
return (EOPNOTSUPP);
break;
}
if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
return (EINVAL);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
return (EADDRNOTAVAIL);
/*
* Check if we already hold membership of this group for this inpcb.
* If so, we do not need to perform the initial join.
*/
imo = inp_findmoptions(inp);
idx = imo_match_group(imo, ifp, &gsa->sa);
if (idx != -1) {
if (ssa->ss.ss_family != AF_UNSPEC) {
/*
* Attempting to join an ASM group (when already
* an ASM or SSM member) is an error.
*/
error = EADDRNOTAVAIL;
} else {
imf = &imo->imo_mfilters[idx];
if (imf->imf_nsources == 0) {
/*
* Attempting to join an SSM group (when
* already an ASM member) is an error.
*/
error = EINVAL;
} else {
/*
* Attempting to join an SSM group (when
* already an SSM member) means "add this
* source to the inclusive filter list".
*/
error = imo_join_source(imo, idx, ssa);
}
}
goto out_locked;
}
/*
* Call imo_grow() to reallocate the membership and source filter
* vectors if they are full. If the size would exceed the hard limit,
* then we know we've really run out of entries. We keep the INP
* lock held to avoid introducing a race condition.
*/
if (imo->imo_num_memberships == imo->imo_max_memberships) {
error = imo_grow(imo);
if (error)
goto out_locked;
}
/*
* So far, so good: perform the layer 3 join, layer 2 join,
* and make an IGMP announcement if needed.
*/
inm = in_addmulti(&gsa->sin.sin_addr, ifp);
if (inm == NULL) {
error = ENOBUFS;
goto out_locked;
}
idx = imo->imo_num_memberships;
imo->imo_membership[idx] = inm;
imo->imo_num_memberships++;
KASSERT(imo->imo_mfilters != NULL,
("%s: imf_mfilters vector was not allocated", __func__));
imf = &imo->imo_mfilters[idx];
KASSERT(TAILQ_EMPTY(&imf->imf_sources),
("%s: imf_sources not empty", __func__));
/*
* If this is a new SSM group join (i.e. a source was specified
* with this group), add this source to the filter list.
*/
if (ssa->ss.ss_family != AF_UNSPEC) {
/*
* An initial SSM join implies that this socket's membership
* of the multicast group is now in inclusive mode.
*/
imf->imf_fmode = MCAST_INCLUDE;
error = imo_join_source(imo, idx, ssa);
if (error) {
/*
* Drop inp lock before calling in_delmulti(),
* to prevent a lock order reversal.
*/
--imo->imo_num_memberships;
INP_WUNLOCK(inp);
in_delmulti(inm);
return (error);
}
}
out_locked:
INP_WUNLOCK(inp);
return (error);
}
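An initial SSM join from user space goes through the protocol-independent API handled above; struct group_source_req carries group and source together, and the interface is named by index. A hedged sketch ("em0" and both addresses are placeholders; 232.0.0.0/8 is the IPv4 SSM range):

#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <string.h>

static void
join_ssm(int s)
{
	struct group_source_req gsr;
	struct sockaddr_in *gsin, *ssin;

	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = if_nametoindex("em0");	/* placeholder */
	gsin = (struct sockaddr_in *)&gsr.gsr_group;
	gsin->sin_family = AF_INET;
	gsin->sin_len = sizeof(*gsin);
	inet_pton(AF_INET, "232.0.1.1", &gsin->sin_addr);
	ssin = (struct sockaddr_in *)&gsr.gsr_source;
	ssin->sin_family = AF_INET;
	ssin->sin_len = sizeof(*ssin);
	inet_pton(AF_INET, "192.0.2.10", &ssin->sin_addr);
	if (setsockopt(s, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP,
	    &gsr, sizeof(gsr)) == -1)
		err(1, "MCAST_JOIN_SOURCE_GROUP");
}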
/*
* Leave an IPv4 multicast group on an inpcb, possibly with a source.
*/
static int
inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET(curvnet);
struct group_source_req gsr;
struct ip_mreq_source mreqs;
sockunion_t *gsa, *ssa;
struct ifnet *ifp;
struct in_mfilter *imf;
struct ip_moptions *imo;
struct in_msource *ims, *tims;
struct in_multi *inm;
size_t idx;
int error;
ifp = NULL;
error = 0;
memset(&gsr, 0, sizeof(struct group_source_req));
gsa = (sockunion_t *)&gsr.gsr_group;
gsa->ss.ss_family = AF_UNSPEC;
ssa = (sockunion_t *)&gsr.gsr_source;
ssa->ss.ss_family = AF_UNSPEC;
switch (sopt->sopt_name) {
case IP_DROP_MEMBERSHIP:
case IP_DROP_SOURCE_MEMBERSHIP:
if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
error = sooptcopyin(sopt, &mreqs,
sizeof(struct ip_mreq),
sizeof(struct ip_mreq));
/*
* Swap interface and sourceaddr arguments,
* as ip_mreq and ip_mreq_source are laid
* out differently.
*/
mreqs.imr_interface = mreqs.imr_sourceaddr;
mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
error = sooptcopyin(sopt, &mreqs,
sizeof(struct ip_mreq_source),
sizeof(struct ip_mreq_source));
}
if (error)
return (error);
gsa->sin.sin_family = AF_INET;
gsa->sin.sin_len = sizeof(struct sockaddr_in);
gsa->sin.sin_addr = mreqs.imr_multiaddr;
if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
ssa->sin.sin_family = AF_INET;
ssa->sin.sin_len = sizeof(struct sockaddr_in);
ssa->sin.sin_addr = mreqs.imr_sourceaddr;
}
if (gsa->sin.sin_addr.s_addr != INADDR_ANY)
INADDR_TO_IFP(mreqs.imr_interface, ifp);
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: imr_interface = %s, ifp = %p\n",
__func__, inet_ntoa(mreqs.imr_interface), ifp);
}
#endif
break;
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_req),
sizeof(struct group_req));
} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
error = sooptcopyin(sopt, &gsr,
sizeof(struct group_source_req),
sizeof(struct group_source_req));
}
if (error)
return (error);
if (gsa->sin.sin_family != AF_INET ||
gsa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
if (ssa->sin.sin_family != AF_INET ||
ssa->sin.sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
}
if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(gsr.gsr_interface);
break;
default:
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: unknown sopt_name %d\n", __func__,
sopt->sopt_name);
}
#endif
return (EOPNOTSUPP);
break;
}
if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
return (EINVAL);
/*
* Find the membership in the membership array.
*/
imo = inp_findmoptions(inp);
idx = imo_match_group(imo, ifp, &gsa->sa);
if (idx == -1) {
error = EADDRNOTAVAIL;
goto out_locked;
}
imf = &imo->imo_mfilters[idx];
/*
* If we were instructed only to leave a given source, do so.
*/
if (ssa->ss.ss_family != AF_UNSPEC) {
if (imf->imf_nsources == 0 ||
imf->imf_fmode == MCAST_EXCLUDE) {
/*
* Attempting to SSM leave an ASM group
* is an error; should use *_BLOCK_SOURCE instead.
* Attempting to SSM leave a source in a group when
* the socket is in 'exclude mode' is also an error.
*/
error = EINVAL;
} else {
error = imo_leave_source(imo, idx, ssa);
}
/*
* If an error occurred, or this source is not the last
* source in the group, do not leave the whole group.
*/
if (error || imf->imf_nsources > 0)
goto out_locked;
}
/*
* Give up the multicast address record to which the membership points.
*/
inm = imo->imo_membership[idx];
in_delmulti(inm);
/*
* Free any source filters for this group if they exist.
* Revert inpcb to the default MCAST_EXCLUDE state.
*/
if (imo->imo_mfilters != NULL) {
TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) {
TAILQ_REMOVE(&imf->imf_sources, ims, ims_next);
FREE(ims, M_IPMSOURCE);
imf->imf_nsources--;
}
KASSERT(imf->imf_nsources == 0,
("%s: imf_nsources not 0", __func__));
KASSERT(TAILQ_EMPTY(&imf->imf_sources),
("%s: imf_sources not empty", __func__));
imf->imf_fmode = MCAST_EXCLUDE;
}
/*
* Remove the gap in the membership array.
*/
for (++idx; idx < imo->imo_num_memberships; ++idx)
imo->imo_membership[idx-1] = imo->imo_membership[idx];
imo->imo_num_memberships--;
out_locked:
INP_WUNLOCK(inp);
return (error);
}
/*
* Select the interface for transmitting IPv4 multicast datagrams.
*
* Either an instance of struct in_addr or an instance of struct ip_mreqn
* may be passed to this socket option. An address of INADDR_ANY or an
* interface index of 0 is used to remove a previous selection.
* When no interface is selected, one is chosen for every send.
*/
static int
inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_NET(curvnet);
struct in_addr addr;
struct ip_mreqn mreqn;
struct ifnet *ifp;
struct ip_moptions *imo;
int error;
if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
/*
* An interface index was specified using the
* Linux-derived ip_mreqn structure.
*/
error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
sizeof(struct ip_mreqn));
if (error)
return (error);
if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex)
return (EINVAL);
if (mreqn.imr_ifindex == 0) {
ifp = NULL;
} else {
ifp = ifnet_byindex(mreqn.imr_ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
}
} else {
/*
* An interface was specified by IPv4 address.
* This is the traditional BSD usage.
*/
error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
sizeof(struct in_addr));
if (error)
return (error);
if (addr.s_addr == INADDR_ANY) {
ifp = NULL;
} else {
INADDR_TO_IFP(addr, ifp);
if (ifp == NULL)
return (EADDRNOTAVAIL);
}
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: ifp = %p, addr = %s\n",
__func__, ifp, inet_ntoa(addr)); /* XXX INET6 */
}
#endif
}
/* Reject interfaces which do not support multicast. */
if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
return (EOPNOTSUPP);
imo = inp_findmoptions(inp);
imo->imo_multicast_ifp = ifp;
imo->imo_multicast_addr.s_addr = INADDR_ANY;
INP_WUNLOCK(inp);
return (0);
}
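The function dispatches purely on sopt_valsize: an ip_mreqn-sized argument selects the index-based branch, an in_addr-sized one the traditional address-based branch. A sketch of the index form (the interface name is a placeholder):

#include <sys/socket.h>
#include <net/if.h>
#include <netinet/in.h>
#include <err.h>
#include <string.h>

static void
set_mcast_if(int s)
{
	struct ip_mreqn mreqn;

	memset(&mreqn, 0, sizeof(mreqn));
	mreqn.imr_ifindex = if_nametoindex("em0");	/* placeholder */
	if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF,
	    &mreqn, sizeof(mreqn)) == -1)
		err(1, "IP_MULTICAST_IF");
}

Passing a plain struct in_addr instead would exercise the second, address-based branch.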
/*
* Atomically set source filters on a socket for an IPv4 multicast group.
*/
static int
inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
{
+ INIT_VNET_NET(curvnet);
struct __msfilterreq msfr;
sockunion_t *gsa;
struct ifnet *ifp;
struct in_mfilter *imf;
struct ip_moptions *imo;
struct in_msource *ims, *tims;
size_t idx;
int error;
error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
sizeof(struct __msfilterreq));
if (error)
return (error);
if (msfr.msfr_nsrcs > IP_MAX_SOURCE_FILTER ||
(msfr.msfr_fmode != MCAST_EXCLUDE &&
msfr.msfr_fmode != MCAST_INCLUDE))
return (EINVAL);
if (msfr.msfr_group.ss_family != AF_INET ||
msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
return (EINVAL);
gsa = (sockunion_t *)&msfr.msfr_group;
if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
return (EINVAL);
gsa->sin.sin_port = 0; /* ignore port */
if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
return (EADDRNOTAVAIL);
ifp = ifnet_byindex(msfr.msfr_ifindex);
if (ifp == NULL)
return (EADDRNOTAVAIL);
/*
* Take the INP lock.
* Check if this socket is a member of this group.
*/
imo = inp_findmoptions(inp);
idx = imo_match_group(imo, ifp, &gsa->sa);
if (idx == -1 || imo->imo_mfilters == NULL) {
error = EADDRNOTAVAIL;
goto out_locked;
}
imf = &imo->imo_mfilters[idx];
#ifdef DIAGNOSTIC
if (bootverbose)
printf("%s: clearing source list\n", __func__);
#endif
/*
* Remove any existing source filters.
*/
TAILQ_FOREACH_SAFE(ims, &imf->imf_sources, ims_next, tims) {
TAILQ_REMOVE(&imf->imf_sources, ims, ims_next);
FREE(ims, M_IPMSOURCE);
imf->imf_nsources--;
}
KASSERT(imf->imf_nsources == 0,
("%s: source list not cleared", __func__));
/*
* Apply any new source filters, if present.
*/
if (msfr.msfr_nsrcs > 0) {
struct in_msource **pnims;
struct in_msource *nims;
struct sockaddr_storage *kss;
struct sockaddr_storage *pkss;
sockunion_t *psu;
int i, j;
/*
* Drop the inp lock so we may sleep if we need to
* in order to satisfy a malloc request.
* We will re-take it before changing socket state.
*/
INP_WUNLOCK(inp);
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: loading %lu source list entries\n",
__func__, (unsigned long)msfr.msfr_nsrcs);
}
#endif
/*
* Make a copy of the user-space source vector so
* that we may copy them with a single copyin. This
* allows us to deal with page faults up-front.
*/
MALLOC(kss, struct sockaddr_storage *,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
M_TEMP, M_WAITOK);
error = copyin(msfr.msfr_srcs, kss,
sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
if (error) {
FREE(kss, M_TEMP);
return (error);
}
/*
* Perform argument checking on every sockaddr_storage
* structure in the vector provided to us. Overwrite
* fields which should not apply to source entries.
* TODO: Check for duplicate sources on this pass.
*/
psu = (sockunion_t *)kss;
for (i = 0; i < msfr.msfr_nsrcs; i++, psu++) {
switch (psu->ss.ss_family) {
case AF_INET:
if (psu->sin.sin_len !=
sizeof(struct sockaddr_in)) {
error = EINVAL;
} else {
psu->sin.sin_port = 0;
}
break;
#ifdef notyet
case AF_INET6:
if (psu->sin6.sin6_len !=
sizeof(struct sockaddr_in6)) {
error = EINVAL;
} else {
psu->sin6.sin6_port = 0;
psu->sin6.sin6_flowinfo = 0;
}
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
if (error)
break;
}
if (error) {
FREE(kss, M_TEMP);
return (error);
}
/*
* Allocate a block to track all the in_msource
* entries we are about to allocate, in case we
* abruptly need to free them.
*/
MALLOC(pnims, struct in_msource **,
sizeof(struct in_msource *) * msfr.msfr_nsrcs,
M_TEMP, M_WAITOK | M_ZERO);
/*
* Allocate up to nsrcs individual chunks.
* If we encounter an error, backtrack out of
* all allocations cleanly; updates must be atomic.
*/
pkss = kss;
nims = NULL;
for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) {
MALLOC(nims, struct in_msource *,
sizeof(struct in_msource),
M_IPMSOURCE, M_WAITOK | M_ZERO);
pnims[i] = nims;
}
if (i < msfr.msfr_nsrcs) {
for (j = 0; j < i; j++) {
if (pnims[j] != NULL)
FREE(pnims[j], M_IPMSOURCE);
}
FREE(pnims, M_TEMP);
FREE(kss, M_TEMP);
return (ENOBUFS);
}
INP_UNLOCK_ASSERT(inp);
/*
* Finally, apply the filters to the socket.
* Re-take the inp lock; we are changing socket state.
*/
pkss = kss;
INP_WLOCK(inp);
for (i = 0; i < msfr.msfr_nsrcs; i++, pkss++) {
memcpy(&(pnims[i]->ims_addr), pkss,
sizeof(struct sockaddr_storage));
TAILQ_INSERT_TAIL(&imf->imf_sources, pnims[i],
ims_next);
imf->imf_nsources++;
}
FREE(pnims, M_TEMP);
FREE(kss, M_TEMP);
}
/*
* Update the filter mode on the socket before releasing the inpcb.
*/
INP_WLOCK_ASSERT(inp);
imf->imf_fmode = msfr.msfr_fmode;
out_locked:
INP_WUNLOCK(inp);
return (error);
}
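The set side replaces the whole filter atomically in one call. A hedged sketch building a one-entry include-mode filter (assumes s has already joined the group on the given interface and that struct __msfilterreq is visible to user code; addresses are placeholders):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdint.h>
#include <string.h>

static void
set_include_filter(int s, uint32_t ifindex)
{
	struct __msfilterreq msfr;
	struct sockaddr_storage src;
	struct sockaddr_in *gsin, *ssin;

	memset(&src, 0, sizeof(src));
	ssin = (struct sockaddr_in *)&src;
	ssin->sin_family = AF_INET;
	ssin->sin_len = sizeof(*ssin);
	inet_pton(AF_INET, "192.0.2.10", &ssin->sin_addr);

	memset(&msfr, 0, sizeof(msfr));
	msfr.msfr_ifindex = ifindex;
	msfr.msfr_fmode = MCAST_INCLUDE;
	msfr.msfr_nsrcs = 1;
	msfr.msfr_srcs = &src;
	gsin = (struct sockaddr_in *)&msfr.msfr_group;
	gsin->sin_family = AF_INET;
	gsin->sin_len = sizeof(*gsin);
	inet_pton(AF_INET, "232.0.1.1", &gsin->sin_addr);
	if (setsockopt(s, IPPROTO_IP, IP_MSFILTER,
	    &msfr, sizeof(msfr)) == -1)
		err(1, "IP_MSFILTER");
}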
/*
* Set the IP multicast options in response to user setsockopt().
*
* Many of the socket options handled in this function duplicate the
* functionality of socket options in the regular unicast API. However,
* it is not possible to merge the duplicate code, because the idempotence
* of the IPv4 multicast part of the BSD Sockets API must be preserved;
* the effects of these options must be treated as separate and distinct.
*/
int
inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
{
struct ip_moptions *imo;
int error;
error = 0;
/*
* If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
* or is a divert socket, reject it.
* XXX Unlocked read of inp_socket believed OK.
*/
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
return (EOPNOTSUPP);
switch (sopt->sopt_name) {
case IP_MULTICAST_VIF: {
int vifi;
/*
* Select a multicast VIF for transmission.
* Only useful if multicast forwarding is active.
*/
if (legal_vif_num == NULL) {
error = EOPNOTSUPP;
break;
}
error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
if (error)
break;
if (!legal_vif_num(vifi) && (vifi != -1)) {
error = EINVAL;
break;
}
imo = inp_findmoptions(inp);
imo->imo_multicast_vif = vifi;
INP_WUNLOCK(inp);
break;
}
case IP_MULTICAST_IF:
error = inp_set_multicast_if(inp, sopt);
break;
case IP_MULTICAST_TTL: {
u_char ttl;
/*
* Set the IP time-to-live for outgoing multicast packets.
* The original multicast API required a char argument,
* which is inconsistent with the rest of the socket API.
* We allow either a char or an int.
*/
if (sopt->sopt_valsize == sizeof(u_char)) {
error = sooptcopyin(sopt, &ttl, sizeof(u_char),
sizeof(u_char));
if (error)
break;
} else {
u_int ittl;
error = sooptcopyin(sopt, &ittl, sizeof(u_int),
sizeof(u_int));
if (error)
break;
if (ittl > 255) {
error = EINVAL;
break;
}
ttl = (u_char)ittl;
}
imo = inp_findmoptions(inp);
imo->imo_multicast_ttl = ttl;
INP_WUNLOCK(inp);
break;
}
case IP_MULTICAST_LOOP: {
u_char loop;
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one. The original multicast API required a
* char argument, which is inconsistent with the rest
* of the socket API. We allow either a char or an int.
*/
if (sopt->sopt_valsize == sizeof(u_char)) {
error = sooptcopyin(sopt, &loop, sizeof(u_char),
sizeof(u_char));
if (error)
break;
} else {
u_int iloop;
error = sooptcopyin(sopt, &iloop, sizeof(u_int),
sizeof(u_int));
if (error)
break;
loop = (u_char)iloop;
}
imo = inp_findmoptions(inp);
imo->imo_multicast_loop = !!loop;
INP_WUNLOCK(inp);
break;
}
case IP_ADD_MEMBERSHIP:
case IP_ADD_SOURCE_MEMBERSHIP:
case MCAST_JOIN_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
error = inp_join_group(inp, sopt);
break;
case IP_DROP_MEMBERSHIP:
case IP_DROP_SOURCE_MEMBERSHIP:
case MCAST_LEAVE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
error = inp_leave_group(inp, sopt);
break;
case IP_BLOCK_SOURCE:
case IP_UNBLOCK_SOURCE:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = inp_change_source_filter(inp, sopt);
break;
case IP_MSFILTER:
error = inp_set_source_filters(inp, sopt);
break;
default:
error = EOPNOTSUPP;
break;
}
INP_UNLOCK_ASSERT(inp);
return (error);
}
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c (revision 183549)
+++ head/sys/netinet/in_pcb.c (revision 183550)
@@ -1,1595 +1,1617 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_ipsec.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <security/mac/mac_framework.h>
/*
* These configure the range of local port addresses assigned to
* "unspecified" outgoing connections/packets/whatever.
*/
int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
int ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */
int ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */
int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
/*
* Reserved ports accessible only to root. There are significant
* security considerations that must be accounted for when changing these,
* but the security benefits can be great. Please be careful.
*/
int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */
int ipport_reservedlow = 0;
/* Variables dealing with random ephemeral port allocation. */
int ipport_randomized = 1; /* user controlled via sysctl */
int ipport_randomcps = 10; /* user controlled via sysctl */
int ipport_randomtime = 45; /* user controlled via sysctl */
int ipport_stoprandom = 0; /* toggled by ipport_tick */
int ipport_tcpallocs;
int ipport_tcplastcount;
#define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
else if ((var) > (max)) { (var) = (max); }
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
int error;
error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
if (error == 0) {
RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
}
return (error);
}
#undef RANGECHK
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
-SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
- &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
-SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
- &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
-SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
- &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
-SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
- &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
-SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
- &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
-SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
- &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
-SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
- CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, "");
-SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
- CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, "");
-SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
- &ipport_randomized, 0, "Enable random port allocation");
-SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
- &ipport_randomcps, 0, "Maximum number of random port "
- "allocations before switching to a sequental one");
-SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
- &ipport_randomtime, 0, "Minimum time to keep sequential port "
- "allocation before switching to a random one");
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0,
+ &sysctl_net_ipport_check, "I", "");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
+ reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow,
+ CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized,
+ CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps,
+ CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port "
+ "allocations before switching to a sequental one");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime,
+ CTLFLAG_RW, ipport_randomtime, 0,
+ "Minimum time to keep sequental port "
+ "allocation before switching to a random one");
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
* NOTE: It is assumed that most of these functions will be called with
* the pcbinfo lock held, and often, the inpcb lock held, as these utility
* functions often modify hash chains or addresses in pcbs.
*/
/*
* Allocate a PCB and associate it with the socket.
* On success return with the PCB locked.
*/
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
+#ifdef INET6
+ INIT_VNET_INET6(curvnet);
+#endif
struct inpcb *inp;
int error;
INP_INFO_WLOCK_ASSERT(pcbinfo);
error = 0;
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
if (inp == NULL)
return (ENOBUFS);
bzero(inp, inp_zero_size);
inp->inp_pcbinfo = pcbinfo;
inp->inp_socket = so;
inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
error = mac_inpcb_init(inp, M_NOWAIT);
if (error != 0)
goto out;
SOCK_LOCK(so);
mac_inpcb_create(so, inp);
SOCK_UNLOCK(so);
#endif
#ifdef IPSEC
error = ipsec_init_policy(so, &inp->inp_sp);
if (error != 0) {
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
goto out;
}
#endif /*IPSEC*/
#ifdef INET6
if (INP_SOCKAF(so) == AF_INET6) {
inp->inp_vflag |= INP_IPV6PROTO;
if (V_ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
#ifdef INET6
if (V_ip6_auto_flowlabel)
inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0)
uma_zfree(pcbinfo->ipi_zone, inp);
#endif
return (error);
}
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
int anonport, error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
return (EINVAL);
anonport = inp->inp_lport == 0 && (nam == NULL ||
((struct sockaddr_in *)nam)->sin_port == 0);
error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
&inp->inp_lport, cred);
if (error)
return (error);
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
return (EAGAIN);
}
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Set up a bind operation on a PCB, performing port allocation
* as required, but do not actually modify the PCB. Callers can
* either complete the bind by setting inp_laddr/inp_lport and
* calling in_pcbinshash(), or they can just use the resulting
* port and address to authorise the sending of a once-off packet.
*
* On error, the values of *laddrp and *lportp are not changed.
*/
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
u_short *lportp, struct ucred *cred)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct socket *so = inp->inp_socket;
unsigned short *lastport;
struct sockaddr_in *sin;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct in_addr laddr;
u_short lport = 0;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error, prison = 0;
int dorandom;
/*
* Because no actual state changes occur here, a global write lock on
* the pcbinfo isn't required.
*/
INP_INFO_LOCK_ASSERT(pcbinfo);
INP_LOCK_ASSERT(inp);
if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = INPLOOKUP_WILDCARD;
if (nam) {
sin = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sin))
return (EINVAL);
#ifdef notdef
/*
* We should check the family, but old programs
* incorrectly fail to initialize it.
*/
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
#endif
if (sin->sin_addr.s_addr != INADDR_ANY)
if (prison_ip(cred, 0, &sin->sin_addr.s_addr))
return(EINVAL);
if (sin->sin_port != *lportp) {
/* Don't allow the port to change. */
if (*lportp != 0)
return (EINVAL);
lport = sin->sin_port;
}
/* NB: lport is left as 0 if the port isn't being changed. */
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & SO_REUSEADDR)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
if (lport) {
struct inpcb *t;
struct tcptw *tw;
/* GROSS */
if (ntohs(lport) <= V_ipport_reservedhigh &&
ntohs(lport) >= V_ipport_reservedlow &&
priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
0))
return (EACCES);
if (jailed(cred))
prison = 1;
if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
priv_check_cred(so->so_cred,
PRIV_NETINET_REUSEPORT, 0) != 0) {
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, prison ? 0 : INPLOOKUP_WILDCARD,
cred);
/*
* XXX
* This entire block sorely needs a rewrite.
*/
if (t &&
((t->inp_vflag & INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
(t->inp_socket->so_options &
SO_REUSEPORT) == 0) &&
(so->so_cred->cr_uid !=
t->inp_socket->so_cred->cr_uid))
return (EADDRINUSE);
}
if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr))
return (EADDRNOTAVAIL);
t = in_pcblookup_local(pcbinfo, sin->sin_addr,
lport, prison ? 0 : wild, cred);
if (t && (t->inp_vflag & INP_TIMEWAIT)) {
/*
* XXXRW: If an inpcb has had its timewait
* state recycled, we treat the address as
* being in use (for now). This is better
* than a panic, but not desirable.
*/
tw = intotw(inp);
if (tw == NULL ||
(reuseport & tw->tw_so_options) == 0)
return (EADDRINUSE);
} else if (t &&
(reuseport & t->inp_socket->so_options) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY ||
INP_SOCKAF(so) ==
INP_SOCKAF(t->inp_socket))
#endif
return (EADDRINUSE);
}
}
}
if (*lportp != 0)
lport = *lportp;
if (lport == 0) {
u_short first, last, aux;
int count;
if (laddr.s_addr != INADDR_ANY)
if (prison_ip(cred, 0, &laddr.s_addr))
return (EINVAL);
if (inp->inp_flags & INP_HIGHPORT) {
first = V_ipport_hifirstauto; /* sysctl */
last = V_ipport_hilastauto;
lastport = &pcbinfo->ipi_lasthi;
} else if (inp->inp_flags & INP_LOWPORT) {
error = priv_check_cred(cred,
PRIV_NETINET_RESERVEDPORT, 0);
if (error)
return error;
first = V_ipport_lowfirstauto; /* 1023 */
last = V_ipport_lowlastauto; /* 600 */
lastport = &pcbinfo->ipi_lastlow;
} else {
first = V_ipport_firstauto; /* sysctl */
last = V_ipport_lastauto;
lastport = &pcbinfo->ipi_lastport;
}
/*
* For UDP, use random port allocation as long as the user
* allows it. For TCP (and as of yet unknown) connections,
* use random port allocation only if the user allows it AND
* ipport_tick() allows it.
*/
if (V_ipport_randomized &&
(!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
dorandom = 1;
else
dorandom = 0;
/*
* It makes no sense to do random port allocation if
* we have the only port available.
*/
if (first == last)
dorandom = 0;
/* Make sure to not include UDP packets in the count. */
if (pcbinfo != &V_udbinfo)
V_ipport_tcpallocs++;
/*
* Simple check to ensure all ports are not used up causing
* a deadlock here.
*/
if (first > last) {
aux = first;
first = last;
last = aux;
}
if (dorandom)
*lastport = first +
(arc4random() % (last - first));
count = last - first;
do {
if (count-- < 0) /* completely used? */
return (EADDRNOTAVAIL);
++*lastport;
if (*lastport < first || *lastport > last)
*lastport = first;
lport = htons(*lastport);
} while (in_pcblookup_local(pcbinfo, laddr,
lport, wild, cred));
}
if (prison_ip(cred, 0, &laddr.s_addr))
return (EINVAL);
*laddrp = laddr.s_addr;
*lportp = lport;
return (0);
}
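The ephemeral-port walk above starts at a possibly randomized point in [first, last] and probes sequentially with wrap-around until a free port turns up or the whole range has been tried. A standalone user-space restatement of that loop (is_port_free() is a hypothetical stand-in for in_pcblookup_local(); on success the chosen port is left in *lastport):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static bool
is_port_free(uint16_t port)
{
	/* Stand-in for in_pcblookup_local(); accept everything here. */
	(void)port;
	return (true);
}

static int
pick_ephemeral_port(uint16_t first, uint16_t last, bool dorandom,
    uint16_t *lastport)
{
	int count;
	uint16_t t;

	if (first > last) {		/* normalize the range */
		t = first;
		first = last;
		last = t;
	}
	if (dorandom && first != last)	/* random start, as in the kernel */
		*lastport = first + (arc4random() % (last - first));
	count = last - first;
	do {
		if (count-- < 0)	/* every port tried: give up */
			return (-1);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
	} while (!is_port_free(*lastport));
	return (0);
}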
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin.
* If we don't have a local address for this socket yet,
* then pick one.
*/
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
u_short lport, fport;
in_addr_t laddr, faddr;
int anonport, error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
lport = inp->inp_lport;
laddr = inp->inp_laddr.s_addr;
anonport = (lport == 0);
error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
NULL, cred);
if (error)
return (error);
/* Do the initial binding of the local address if required. */
if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
inp->inp_lport = lport;
inp->inp_laddr.s_addr = laddr;
if (in_pcbinshash(inp) != 0) {
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
return (EAGAIN);
}
}
/* Commit the remaining changes. */
inp->inp_lport = lport;
inp->inp_laddr.s_addr = laddr;
inp->inp_faddr.s_addr = faddr;
inp->inp_fport = fport;
in_pcbrehash(inp);
if (anonport)
inp->inp_flags |= INP_ANONPORT;
return (0);
}
/*
* Set up for a connect from a socket to the specified address.
* On entry, *laddrp and *lportp should contain the current local
* address and port for the PCB; these are updated to the values
* that should be placed in inp_laddr and inp_lport to complete
* the connect.
*
* On success, *faddrp and *fportp will be set to the remote address
* and port. These are not updated in the error case.
*
* If the operation fails because the connection already exists,
* *oinpp will be set to the PCB of that connection so that the
* caller can decide to override it. In all other cases, *oinpp
* is set to NULL.
*/
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
struct inpcb **oinpp, struct ucred *cred)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct sockaddr_in *sin = (struct sockaddr_in *)nam;
struct in_ifaddr *ia;
struct sockaddr_in sa;
struct ucred *socred;
struct inpcb *oinp;
struct in_addr laddr, faddr;
u_short lport, fport;
int error;
/*
* Because a global state change doesn't actually occur here, a read
* lock is sufficient.
*/
INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
INP_LOCK_ASSERT(inp);
if (oinpp != NULL)
*oinpp = NULL;
if (nam->sa_len != sizeof (*sin))
return (EINVAL);
if (sin->sin_family != AF_INET)
return (EAFNOSUPPORT);
if (sin->sin_port == 0)
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
lport = *lportp;
faddr = sin->sin_addr;
fport = sin->sin_port;
socred = inp->inp_socket->so_cred;
if (laddr.s_addr == INADDR_ANY && jailed(socred)) {
bzero(&sa, sizeof(sa));
sa.sin_addr.s_addr = htonl(prison_getip(socred));
sa.sin_len = sizeof(sa);
sa.sin_family = AF_INET;
error = in_pcbbind_setup(inp, (struct sockaddr *)&sa,
&laddr.s_addr, &lport, cred);
if (error)
return (error);
}
if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
* use the primary local address.
* If the supplied address is INADDR_BROADCAST,
* and the primary interface supports broadcast,
* choose the broadcast address for that interface.
*/
if (faddr.s_addr == INADDR_ANY)
faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
(TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST))
faddr = satosin(&TAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
}
if (laddr.s_addr == INADDR_ANY) {
ia = NULL;
/*
* If the route is known, our source address is taken from the
* outgoing interface; else punt.
*
* Find the route to the destination.
*/
if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
ia = ip_rtaddr(faddr, inp->inp_inc.inc_fibnum);
/*
* If we found a route, use the address corresponding to
* the outgoing interface.
*
* Otherwise assume faddr is reachable on a directly connected
* network and try to find a corresponding interface to take
* the source address from.
*/
if (ia == NULL) {
bzero(&sa, sizeof(sa));
sa.sin_addr = faddr;
sa.sin_len = sizeof(sa);
sa.sin_family = AF_INET;
ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sa)));
if (ia == NULL)
ia = ifatoia(ifa_ifwithnet(sintosa(&sa)));
if (ia == NULL)
return (ENETUNREACH);
}
/*
* If the destination address is multicast and an outgoing
* interface has been set as a multicast option, use the
* address of that interface as our source address.
*/
if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
inp->inp_moptions != NULL) {
struct ip_moptions *imo;
struct ifnet *ifp;
imo = inp->inp_moptions;
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
if (ia->ia_ifp == ifp)
break;
if (ia == NULL)
return (EADDRNOTAVAIL);
}
}
laddr = ia->ia_addr.sin_addr;
}
oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
0, NULL);
if (oinp != NULL) {
if (oinpp != NULL)
*oinpp = oinp;
return (EADDRINUSE);
}
if (lport == 0) {
error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
cred);
if (error)
return (error);
}
*laddrp = laddr.s_addr;
*lportp = lport;
*faddrp = faddr.s_addr;
*fportp = fport;
return (0);
}
void
in_pcbdisconnect(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
inp->inp_fport = 0;
in_pcbrehash(inp);
}
/*
* Historically, in_pcbdetach() included the functionality now found in
* in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the
* more complex life cycle of TCP.
*
* in_pcbdetach() is responsible for disconnecting the socket from an inpcb.
* For most protocols, this will be invoked immediately prior to calling
* in_pcbfree(). However, for TCP the inpcb may significantly outlive the
* socket, in which case in_pcbfree() may be deferred.
*/
void
in_pcbdetach(struct inpcb *inp)
{
KASSERT(inp->inp_socket != NULL, ("in_pcbdetach: inp_socket == NULL"));
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
/*
* in_pcbfree() is responsible for freeing an already-detached inpcb, as well
* as removing it from any global inpcb lists it might be on.
*/
void
in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *ipi = inp->inp_pcbinfo;
KASSERT(inp->inp_socket == NULL, ("in_pcbfree: inp_socket != NULL"));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
#ifdef IPSEC
ipsec4_delete_pcbpolicy(inp);
#endif /*IPSEC*/
inp->inp_gencnt = ++ipi->ipi_gencnt;
in_pcbremlists(inp);
if (inp->inp_options)
(void)m_free(inp->inp_options);
if (inp->inp_moptions != NULL)
inp_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
INP_WUNLOCK(inp);
uma_zfree(ipi->ipi_zone, inp);
}
/*
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
* port reservation, and preventing it from being returned by inpcb lookups.
*
* It is used by TCP to mark an inpcb as unused and avoid future packet
* delivery or event notification when a socket remains open but TCP has
* closed. This might occur as a result of a shutdown()-initiated TCP close
* or an RST on the wire, and allows the port binding to be reused while still
* maintaining the invariant that so_pcb always points to a valid inpcb until
* in_pcbdetach().
*
* XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
* lists, but can lead to confusing netstat output, as open sockets with
* closed TCP connections will no longer appear to have their bound port
* number. An explicit flag would be better, as it would allow us to leave
* the port number intact after the connection is dropped.
*
* XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
* in_pcbnotifyall() and in_pcbpurgeif0()?
*/
void
in_pcbdrop(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_vflag |= INP_DROPPED;
if (inp->inp_lport) {
struct inpcbport *phd = inp->inp_phd;
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
inp->inp_lport = 0;
}
}
/*
* Common routines to return the socket addresses associated with inpcbs.
*/
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr_p)
{
struct sockaddr_in *sin;
MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
M_WAITOK | M_ZERO);
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = *addr_p;
sin->sin_port = port;
return (struct sockaddr *)sin;
}
int
in_getsockaddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_lport;
addr = inp->inp_laddr;
INP_RUNLOCK(inp);
*nam = in_sockaddr(port, &addr);
return 0;
}
int
in_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_fport;
addr = inp->inp_faddr;
INP_RUNLOCK(inp);
*nam = in_sockaddr(port, &addr);
return 0;
}
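These two routines back getsockname(2) and getpeername(2) for IPv4 sockets. A short sketch of the user-facing side (assumes s is a connected socket):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <err.h>
#include <stdio.h>

static void
print_peer(int s)
{
	struct sockaddr_in sin;
	socklen_t len = sizeof(sin);
	char buf[INET_ADDRSTRLEN];

	if (getpeername(s, (struct sockaddr *)&sin, &len) == -1)
		err(1, "getpeername");
	printf("peer %s:%u\n",
	    inet_ntop(AF_INET, &sin.sin_addr, buf, sizeof(buf)),
	    ntohs(sin.sin_port));
}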
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
struct inpcb *(*notify)(struct inpcb *, int))
{
struct inpcb *inp, *inp_temp;
INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0) {
INP_WUNLOCK(inp);
continue;
}
#endif
if (inp->inp_faddr.s_addr != faddr.s_addr ||
inp->inp_socket == NULL) {
INP_WUNLOCK(inp);
continue;
}
if ((*notify)(inp, errno))
INP_WUNLOCK(inp);
}
INP_INFO_WUNLOCK(pcbinfo);
}
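/*
 * Sketch of a notify callback (hypothetical name): it runs with the
 * inpcb write lock held and must return the still-locked inpcb so the
 * loop above can unlock it, or NULL if it released the lock itself:
 *
 *	static struct inpcb *
 *	example_notify(struct inpcb *inp, int err)
 *	{
 *		if (inp->inp_socket != NULL)
 *			inp->inp_socket->so_error = err;
 *		return (inp);
 *	}
 *
 *	in_pcbnotifyall(&V_udbinfo, faddr, EHOSTUNREACH, example_notify);
 */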
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
struct inpcb *inp;
struct ip_moptions *imo;
int i, gap;
INP_INFO_RLOCK(pcbinfo);
LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
if ((inp->inp_vflag & INP_IPV4) &&
imo != NULL) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (imo->imo_multicast_ifp == ifp)
imo->imo_multicast_ifp = NULL;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships;
i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
in_delmulti(imo->imo_membership[i]);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] =
imo->imo_membership[i];
}
imo->imo_num_memberships -= gap;
}
INP_WUNLOCK(inp);
}
INP_INFO_RUNLOCK(pcbinfo);
}
/*
* Lookup a PCB based on the local address and port.
*/
#define INP_LOOKUP_MAPPED_PCB_COST 3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
u_short lport, int wild_okay, struct ucred *cred)
{
struct inpcb *inp;
#ifdef INET6
int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
int matchwild = 3;
#endif
int wildcard;
INP_INFO_LOCK_ASSERT(pcbinfo);
if (!wild_okay) {
struct inpcbhead *head;
/*
* Look for an unconnected (wildcard foreign addr) PCB that
* matches the local address and port we're looking for.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == INADDR_ANY &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_lport == lport) {
/*
* Found.
*/
return (inp);
}
}
/*
* Not found.
*/
return (NULL);
} else {
struct inpcbporthead *porthash;
struct inpcbport *phd;
struct inpcb *match = NULL;
/*
* Best fit PCB lookup.
*
* First see if this local port is in use by looking on the
* port hash list.
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
LIST_FOREACH(phd, porthash, phd_hash) {
if (phd->phd_port == lport)
break;
}
if (phd != NULL) {
/*
* Port is in use by one or more PCBs. Look for best
* fit.
*/
LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
wildcard = 0;
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
/*
* We never select the PCB that has
* INP_IPV6 flag and is bound to :: if
* we have another PCB which is bound
* to 0.0.0.0. If a PCB has the
* INP_IPV6 flag, then we set its cost
* higher than IPv4 only PCBs.
*
* Note that this case happens only
* when a socket is bound to ::, under
* the condition that the use of
* mapped addresses is allowed.
*/
if ((inp->inp_vflag & INP_IPV6) != 0)
wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
if (inp->inp_faddr.s_addr != INADDR_ANY)
wildcard++;
if (inp->inp_laddr.s_addr != INADDR_ANY) {
if (laddr.s_addr == INADDR_ANY)
wildcard++;
else if (inp->inp_laddr.s_addr != laddr.s_addr)
continue;
} else {
if (laddr.s_addr != INADDR_ANY)
wildcard++;
}
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0) {
break;
}
}
}
}
return (match);
}
}
#undef INP_LOOKUP_MAPPED_PCB_COST
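/*
 * Worked example of the wildcard scoring above (illustrative): for a
 * lookup with laddr = 10.0.0.1, candidate PCBs on the same port score
 *
 *	bound laddr	bound faddr	wildcard cost
 *	10.0.0.1	INADDR_ANY	0	(best possible)
 *	10.0.0.1	set		1
 *	INADDR_ANY	INADDR_ANY	1
 *	INADDR_ANY	set		2
 *
 * plus INP_LOOKUP_MAPPED_PCB_COST for an INP_IPV6 PCB bound to ::, so
 * the lowest-cost (most specific) match wins.
 */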
/*
* Lookup PCB in hash list.
*/
struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
struct ifnet *ifp)
{
struct inpcbhead *head;
struct inpcb *inp;
u_short fport = fport_arg, lport = lport_arg;
INP_INFO_LOCK_ASSERT(pcbinfo);
/*
* First look for an exact match.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == faddr.s_addr &&
inp->inp_laddr.s_addr == laddr.s_addr &&
inp->inp_fport == fport &&
inp->inp_lport == lport)
return (inp);
}
/*
* Then look for a wildcard match, if requested.
*/
if (wildcard) {
struct inpcb *local_wild = NULL;
#ifdef INET6
struct inpcb *local_wild_mapped = NULL;
#endif
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_faddr.s_addr == INADDR_ANY &&
inp->inp_lport == lport) {
if (ifp && ifp->if_type == IFT_FAITH &&
(inp->inp_flags & INP_FAITH) == 0)
continue;
if (inp->inp_laddr.s_addr == laddr.s_addr)
return (inp);
else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
if (INP_CHECK_SOCKAF(inp->inp_socket,
AF_INET6))
local_wild_mapped = inp;
else
#endif
local_wild = inp;
}
}
}
#ifdef INET6
if (local_wild == NULL)
return (local_wild_mapped);
#endif
return (local_wild);
}
return (NULL);
}
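/*
 * Sketch of a typical transport-input caller (abbreviated and
 * illustrative): resolve the 4-tuple from a received header while the
 * pcbinfo lock is held, then lock the PCB before dropping the global
 * lock:
 *
 *	INP_INFO_RLOCK(&V_tcbinfo);
 *	inp = in_pcblookup_hash(&V_tcbinfo, ip->ip_src, th->th_sport,
 *	    ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
 *	if (inp != NULL)
 *		INP_WLOCK(inp);
 *	INP_INFO_RUNLOCK(&V_tcbinfo);
 */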
/*
* Insert PCB onto various hash lists.
*/
int
in_pcbinshash(struct inpcb *inp)
{
struct inpcbhead *pcbhash;
struct inpcbporthead *pcbporthash;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
else
#endif /* INET6 */
hashkey_faddr = inp->inp_faddr.s_addr;
pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
pcbporthash = &pcbinfo->ipi_porthashbase[
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
/*
* Go through port list and look for a head for this lport.
*/
LIST_FOREACH(phd, pcbporthash, phd_hash) {
if (phd->phd_port == inp->inp_lport)
break;
}
/*
* If none exists, malloc one and tack it on.
*/
if (phd == NULL) {
MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
if (phd == NULL) {
return (ENOBUFS); /* XXX */
}
phd->phd_port = inp->inp_lport;
LIST_INIT(&phd->phd_pcblist);
LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
}
inp->inp_phd = phd;
LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
return (0);
}
/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
* hashed port list would have to be updated as well), so the lport must
* not change after in_pcbinshash() has been called.
*/
void
in_pcbrehash(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbhead *head;
u_int32_t hashkey_faddr;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
else
#endif /* INET6 */
hashkey_faddr = inp->inp_faddr.s_addr;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
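/*
 * Minimal sketch of the intended sequence when the foreign endpoint is
 * assigned, e.g. from a connect path (illustrative):
 *
 *	INP_INFO_WLOCK(&V_tcbinfo);
 *	INP_WLOCK(inp);
 *	inp->inp_faddr = faddr;
 *	inp->inp_fport = fport;
 *	in_pcbrehash(inp);
 *
 * which moves the PCB to its new { faddr, fport } bucket; the lport
 * must already be final, as noted above.
 */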
/*
* Remove PCB from various lists.
*/
void
in_pcbremlists(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
if (inp->inp_lport) {
struct inpcbport *phd = inp->inp_phd;
LIST_REMOVE(inp, inp_hash);
LIST_REMOVE(inp, inp_portlist);
if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
LIST_REMOVE(phd, phd_hash);
free(phd, M_PCB);
}
}
LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
}
/*
* A set label operation has occurred at the socket layer, propagate the
* label change into the in_pcb for the socket.
*/
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
INP_WLOCK(inp);
SOCK_LOCK(so);
mac_inpcb_sosetlabel(so, inp);
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
#endif
}
/*
* ipport_tick runs once per second, determining if random port allocation
* should be continued. If more than ipport_randomcps ports have been
* allocated in the last second, then we return to sequential port
* allocation. We return to random allocation only once we drop below
* ipport_randomcps for at least ipport_randomtime seconds.
*/
void
ipport_tick(void *xtp)
{
+ VNET_ITERATOR_DECL(vnet_iter);
- if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) {
- if (V_ipport_stoprandom > 0)
- V_ipport_stoprandom--;
- } else
- V_ipport_stoprandom = V_ipport_randomtime;
- V_ipport_tcplastcount = V_ipport_tcpallocs;
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
+ INIT_VNET_INET(vnet_iter);
+ if (V_ipport_tcpallocs <=
+ V_ipport_tcplastcount + V_ipport_randomcps) {
+ if (V_ipport_stoprandom > 0)
+ V_ipport_stoprandom--;
+ } else
+ V_ipport_stoprandom = V_ipport_randomtime;
+ V_ipport_tcplastcount = V_ipport_tcpallocs;
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
}
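/*
 * Worked example (assuming the default sysctl values, historically
 * ipport_randomcps = 10 and ipport_randomtime = 45): if the cumulative
 * allocation counter grew by more than 10 in the last second, the
 * stoprandom countdown is reset to 45; only after 45 consecutive
 * below-threshold seconds does it reach 0 and random allocation
 * resumes.
 */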
void
inp_wlock(struct inpcb *inp)
{
INP_WLOCK(inp);
}
void
inp_wunlock(struct inpcb *inp)
{
INP_WUNLOCK(inp);
}
void
inp_rlock(struct inpcb *inp)
{
INP_RLOCK(inp);
}
void
inp_runlock(struct inpcb *inp)
{
INP_RUNLOCK(inp);
}
#ifdef INVARIANTS
void
inp_lock_assert(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
}
void
inp_unlock_assert(struct inpcb *inp)
{
INP_UNLOCK_ASSERT(inp);
}
#endif
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
+ INIT_VNET_INET(curvnet);
struct inpcb *inp;
INP_INFO_RLOCK(&V_tcbinfo);
LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_tcbinfo);
}
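/*
 * Usage sketch (hypothetical callback): the function is invoked for
 * every TCP inpcb with its write lock held:
 *
 *	static void
 *	count_one(struct inpcb *inp, void *arg)
 *	{
 *
 *		(*(int *)arg)++;
 *	}
 *
 *	int count = 0;
 *	inp_apply_all(count_one, &count);
 */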
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
return (inp->inp_socket);
}
struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
return ((struct tcpcb *)inp->inp_ppcb);
}
int
inp_ip_tos_get(const struct inpcb *inp)
{
return (inp->inp_ip_tos);
}
void
inp_ip_tos_set(struct inpcb *inp, int val)
{
inp->inp_ip_tos = val;
}
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
uint32_t *faddr, uint16_t *fp)
{
INP_LOCK_ASSERT(inp);
*laddr = inp->inp_laddr.s_addr;
*faddr = inp->inp_faddr.s_addr;
*lp = inp->inp_lport;
*fp = inp->inp_fport;
}
struct inpcb *
so_sotoinpcb(struct socket *so)
{
return (sotoinpcb(so));
}
struct tcpcb *
so_sototcpcb(struct socket *so)
{
return (sototcpcb(so));
}
#ifdef DDB
static void
db_print_indent(int indent)
{
int i;
for (i = 0; i < indent; i++)
db_printf(" ");
}
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
char faddr_str[48], laddr_str[48];
db_print_indent(indent);
db_printf("%s at %p\n", name, inc);
indent += 2;
#ifdef INET6
if (inc->inc_flags == 1) {
/* IPv6. */
ip6_sprintf(laddr_str, &inc->inc6_laddr);
ip6_sprintf(faddr_str, &inc->inc6_faddr);
} else {
#endif
/* IPv4. */
inet_ntoa_r(inc->inc_laddr, laddr_str);
inet_ntoa_r(inc->inc_faddr, faddr_str);
#ifdef INET6
}
#endif
db_print_indent(indent);
db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
ntohs(inc->inc_lport));
db_print_indent(indent);
db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
ntohs(inc->inc_fport));
}
static void
db_print_inpflags(int inp_flags)
{
int comma;
comma = 0;
if (inp_flags & INP_RECVOPTS) {
db_printf("%sINP_RECVOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVRETOPTS) {
db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVDSTADDR) {
db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HDRINCL) {
db_printf("%sINP_HDRINCL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_HIGHPORT) {
db_printf("%sINP_HIGHPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_LOWPORT) {
db_printf("%sINP_LOWPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_ANONPORT) {
db_printf("%sINP_ANONPORT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVIF) {
db_printf("%sINP_RECVIF", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_MTUDISC) {
db_printf("%sINP_MTUDISC", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_FAITH) {
db_printf("%sINP_FAITH", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_RECVTTL) {
db_printf("%sINP_RECVTTL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & INP_DONTFRAG) {
db_printf("%sINP_DONTFRAG", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_IPV6_V6ONLY) {
db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_PKTINFO) {
db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPLIMIT) {
db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_HOPOPTS) {
db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_DSTOPTS) {
db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDR) {
db_printf("%sIN6P_RTHDR", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RTHDRDSTOPTS) {
db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_TCLASS) {
db_printf("%sIN6P_TCLASS", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_AUTOFLOWLABEL) {
db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_RFC2292) {
db_printf("%sIN6P_RFC2292", comma ? ", " : "");
comma = 1;
}
if (inp_flags & IN6P_MTU) {
db_printf("IN6P_MTU%s", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_inpvflag(u_char inp_vflag)
{
int comma;
comma = 0;
if (inp_vflag & INP_IPV4) {
db_printf("%sINP_IPV4", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_IPV6) {
db_printf("%sINP_IPV6", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_IPV6PROTO) {
db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_TIMEWAIT) {
db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_ONESBCAST) {
db_printf("%sINP_ONESBCAST", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_DROPPED) {
db_printf("%sINP_DROPPED", comma ? ", " : "");
comma = 1;
}
if (inp_vflag & INP_SOCKREF) {
db_printf("%sINP_SOCKREF", comma ? ", " : "");
comma = 1;
}
}
void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{
db_print_indent(indent);
db_printf("%s at %p\n", name, inp);
indent += 2;
db_print_indent(indent);
db_printf("inp_flow: 0x%x\n", inp->inp_flow);
db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
db_print_indent(indent);
db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
db_print_indent(indent);
db_printf("inp_label: %p inp_flags: 0x%x (",
inp->inp_label, inp->inp_flags);
db_print_inpflags(inp->inp_flags);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
inp->inp_vflag);
db_print_inpvflag(inp->inp_vflag);
db_printf(")\n");
db_print_indent(indent);
db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
db_print_indent(indent);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
db_printf("in6p_options: %p in6p_outputopts: %p "
"in6p_moptions: %p\n", inp->in6p_options,
inp->in6p_outputopts, inp->in6p_moptions);
db_printf("in6p_icmp6filt: %p in6p_cksum %d "
"in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
inp->in6p_hops);
} else
#endif
{
db_printf("inp_ip_tos: %d inp_ip_options: %p "
"inp_ip_moptions: %p\n", inp->inp_ip_tos,
inp->inp_options, inp->inp_moptions);
}
db_print_indent(indent);
db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
(uintmax_t)inp->inp_gencnt);
}
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
struct inpcb *inp;
if (!have_addr) {
db_printf("usage: show inpcb <addr>\n");
return;
}
inp = (struct inpcb *)addr;
db_print_inpcb(inp, "inpcb", 0);
}
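/*
 * Example DDB session (address hypothetical, output abridged to the
 * first lines produced by db_print_inpcb() above):
 *
 *	db> show inpcb 0xc35d8000
 *	inpcb at 0xc35d8000
 *	  inp_flow: 0x0
 *	  inp_conninfo at 0xc35d8004
 *	    inc_laddr 10.0.0.1 inc_lport 22
 *	    inc_faddr 10.0.0.2 inc_fport 49152
 *	  ...
 */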
#endif
Index: head/sys/netinet/in_rmx.c
===================================================================
--- head/sys/netinet/in_rmx.c (revision 183549)
+++ head/sys/netinet/in_rmx.c (revision 183550)
@@ -1,491 +1,505 @@
/*-
* Copyright 1994, 1995 Massachusetts Institute of Technology
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby
* granted, provided that both the above copyright notice and this
* permission notice appear in all copies, that both the above
* copyright notice and this permission notice appear in all
* supporting documentation, and that the name of M.I.T. not be used
* in advertising or publicity pertaining to distribution of the
* software without specific, written prior permission. M.I.T. makes
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied
* warranty.
*
* THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
* ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
* SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This code does two things necessary for the enhanced TCP metrics to
* function in a useful manner:
* 1) It marks all non-host routes as `cloning', thus ensuring that
* every actual reference to such a route actually gets turned
* into a reference to a host route to the specific destination
* requested.
* 2) When such routes lose all their references, it arranges for them
* to be deleted in some random collection of circumstances, so that
* a large quantity of stale routing data is not kept in kernel memory
* indefinitely. See in_rtqtimo() below for the exact mechanism.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/callout.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
extern int in_inithead(void **head, int off);
#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
/*
* Do what we need to do when inserting a route.
*/
static struct radix_node *
in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
struct radix_node *treenodes)
{
struct rtentry *rt = (struct rtentry *)treenodes;
struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
struct radix_node *ret;
/*
* A little bit of help for both IP output and input:
* For host routes, we make sure that RTF_BROADCAST
* is set for anything that looks like a broadcast address.
* This way, we can avoid an expensive call to in_broadcast()
* in ip_output() most of the time (because the route passed
* to ip_output() is almost always a host route).
*
* We also do the same for local addresses, with the thought
* that this might one day be used to speed up ip_input().
*
* We also mark routes to multicast addresses as such, because
* it's easy to do and might be useful (but this is much more
* dubious since it's so easy to inspect the address).
*/
if (rt->rt_flags & RTF_HOST) {
if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
rt->rt_flags |= RTF_BROADCAST;
} else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr ==
sin->sin_addr.s_addr) {
rt->rt_flags |= RTF_LOCAL;
}
}
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
rt->rt_flags |= RTF_MULTICAST;
if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
ret = rn_addroute(v_arg, n_arg, head, treenodes);
if (ret == NULL && rt->rt_flags & RTF_HOST) {
struct rtentry *rt2;
/*
* We are trying to add a host route, but can't.
* Find out if it is because of an
* ARP entry and delete it if so.
*/
rt2 = in_rtalloc1((struct sockaddr *)sin, 0,
RTF_CLONING, rt->rt_fibnum);
if (rt2) {
if (rt2->rt_flags & RTF_LLINFO &&
rt2->rt_flags & RTF_HOST &&
rt2->rt_gateway &&
rt2->rt_gateway->sa_family == AF_LINK) {
rtexpunge(rt2);
RTFREE_LOCKED(rt2);
ret = rn_addroute(v_arg, n_arg, head,
treenodes);
} else
RTFREE_LOCKED(rt2);
}
}
return ret;
}
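/*
 * Concrete example of the flagging above (illustrative): with
 * 192.168.1.1/24 configured on an interface, a cloned host route to
 * 192.168.1.255 gets RTF_BROADCAST, one to 192.168.1.1 itself gets
 * RTF_LOCAL, and one to 224.0.0.5 gets RTF_MULTICAST, letting
 * ip_output() skip the in_broadcast() call in the common case.
 */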
/*
* This code is the inverse of in_clsroute: on first reference, if we
* were managing the route, stop doing so and set the expiration timer
* back off again.
*/
static struct radix_node *
in_matroute(void *v_arg, struct radix_node_head *head)
{
struct radix_node *rn = rn_match(v_arg, head);
struct rtentry *rt = (struct rtentry *)rn;
/*XXX locking? */
if (rt && rt->rt_refcnt == 0) { /* this is first reference */
if (rt->rt_flags & RTPRF_OURS) {
rt->rt_flags &= ~RTPRF_OURS;
rt->rt_rmx.rmx_expire = 0;
}
}
return rn;
}
static int rtq_reallyold = 60*60; /* one hour is "really old" */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
- &rtq_reallyold, 0, "Default expiration time on dynamically learned routes");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTEXPIRE, rtexpire,
+ CTLFLAG_RW, rtq_reallyold, 0,
+ "Default expiration time on dynamically learned routes");
static int rtq_minreallyold = 10; /* never automatically crank down to less */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
- &rtq_minreallyold, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMINEXPIRE,
+ rtminexpire, CTLFLAG_RW, rtq_minreallyold, 0,
"Minimum time to attempt to hold onto dynamically learned routes");
static int rtq_toomany = 128; /* 128 cached routes is "too many" */
-SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
- &rtq_toomany, 0, "Upper limit on dynamically learned routes");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_RTMAXCACHE,
+ rtmaxcache, CTLFLAG_RW, rtq_toomany, 0,
+ "Upper limit on dynamically learned routes");
/*
* On last reference drop, mark the route as belonging to us so that it can be
* timed out.
*/
static void
in_clsroute(struct radix_node *rn, struct radix_node_head *head)
{
+ INIT_VNET_INET(curvnet);
struct rtentry *rt = (struct rtentry *)rn;
RT_LOCK_ASSERT(rt);
if (!(rt->rt_flags & RTF_UP))
return; /* prophylactic measures */
if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
return;
if (rt->rt_flags & RTPRF_OURS)
return;
if (!(rt->rt_flags & (RTF_WASCLONED | RTF_DYNAMIC)))
return;
/*
* If rtq_reallyold is 0, just delete the route without
* waiting for a timeout cycle to kill it.
*/
if (V_rtq_reallyold != 0) {
rt->rt_flags |= RTPRF_OURS;
rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold;
} else {
rtexpunge(rt);
}
}
struct rtqk_arg {
struct radix_node_head *rnh;
int draining;
int killed;
int found;
int updating;
time_t nextstop;
};
/*
* Get rid of old routes. When draining, this deletes everything, even when
* the timeout is not expired yet. When updating, this makes sure that
* nothing has a timeout longer than the current value of rtq_reallyold.
*/
static int
in_rtqkill(struct radix_node *rn, void *rock)
{
+ INIT_VNET_INET(curvnet);
struct rtqk_arg *ap = rock;
struct rtentry *rt = (struct rtentry *)rn;
int err;
if (rt->rt_flags & RTPRF_OURS) {
ap->found++;
if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) {
if (rt->rt_refcnt > 0)
panic("rtqkill route really not free");
err = in_rtrequest(RTM_DELETE,
(struct sockaddr *)rt_key(rt),
rt->rt_gateway, rt_mask(rt),
rt->rt_flags, 0, rt->rt_fibnum);
if (err) {
log(LOG_WARNING, "in_rtqkill: error %d\n", err);
} else {
ap->killed++;
}
} else {
if (ap->updating &&
(rt->rt_rmx.rmx_expire - time_uptime >
V_rtq_reallyold)) {
rt->rt_rmx.rmx_expire =
time_uptime + V_rtq_reallyold;
}
ap->nextstop = lmin(ap->nextstop,
rt->rt_rmx.rmx_expire);
}
}
return 0;
}
#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */
static int rtq_timeout = RTQ_TIMEOUT;
static struct callout rtq_timer;
static void in_rtqtimo_one(void *rock);
static void
in_rtqtimo(void *rock)
{
int fibnum;
void *newrock;
struct timeval atv;
KASSERT((rock == (void *)V_rt_tables[0][AF_INET]),
("in_rtqtimo: unexpected arg"));
for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
if ((newrock = V_rt_tables[fibnum][AF_INET]) != NULL)
in_rtqtimo_one(newrock);
}
atv.tv_usec = 0;
atv.tv_sec = V_rtq_timeout;
callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock);
}
static void
in_rtqtimo_one(void *rock)
{
struct radix_node_head *rnh = rock;
struct rtqk_arg arg;
static time_t last_adjusted_timeout = 0;
arg.found = arg.killed = 0;
arg.rnh = rnh;
arg.nextstop = time_uptime + V_rtq_timeout;
arg.draining = arg.updating = 0;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in_rtqkill, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
/*
* Attempt to be somewhat dynamic about this:
* If there are ``too many'' routes sitting around taking up space,
* then crank down the timeout, and see if we can't make some more
* go away. However, we make sure that we will never adjust more
* than once in rtq_timeout seconds, to keep from cranking down too
* hard.
*/
if ((arg.found - arg.killed > V_rtq_toomany) &&
(time_uptime - last_adjusted_timeout >= V_rtq_timeout) &&
V_rtq_reallyold > V_rtq_minreallyold) {
V_rtq_reallyold = 2 * V_rtq_reallyold / 3;
if (V_rtq_reallyold < V_rtq_minreallyold) {
V_rtq_reallyold = V_rtq_minreallyold;
}
last_adjusted_timeout = time_uptime;
#ifdef DIAGNOSTIC
log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
V_rtq_reallyold);
#endif
arg.found = arg.killed = 0;
arg.updating = 1;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in_rtqkill, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
}
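/*
 * Worked example of the back-off above (using the defaults:
 * rtq_reallyold = 3600, rtq_minreallyold = 10, rtq_toomany = 128):
 * while more than 128 managed routes survive a sweep, each adjustment
 * multiplies the expiry by 2/3, i.e. 3600 -> 2400 -> 1600 -> 1066 ...,
 * at most once per rtq_timeout (600 s), and never below 10 seconds.
 */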
void
in_rtqdrain(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct radix_node_head *rnh;
struct rtqk_arg arg;
int fibnum;
- for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
- rnh = V_rt_tables[fibnum][AF_INET];
- arg.found = arg.killed = 0;
- arg.rnh = rnh;
- arg.nextstop = 0;
- arg.draining = 1;
- arg.updating = 0;
- RADIX_NODE_HEAD_LOCK(rnh);
- rnh->rnh_walktree(rnh, in_rtqkill, &arg);
- RADIX_NODE_HEAD_UNLOCK(rnh);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_NET(vnet_iter);
+ for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
+ rnh = V_rt_tables[fibnum][AF_INET];
+ arg.found = arg.killed = 0;
+ arg.rnh = rnh;
+ arg.nextstop = 0;
+ arg.draining = 1;
+ arg.updating = 0;
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rnh->rnh_walktree(rnh, in_rtqkill, &arg);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ }
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
}
static int _in_rt_was_here;
/*
* Initialize our routing tree.
*/
int
in_inithead(void **head, int off)
{
+ INIT_VNET_INET(curvnet);
struct radix_node_head *rnh;
/* XXX MRT
* This can be called from vfs_export.c too in which case 'off'
* will be 0. We know the correct value so just use that and
* return directly if it was 0.
* This is a hack that replaces an even worse hack on a bad hack
* on a bad design. After RELENG_7 this should be fixed but that
* will change the ABI, so for now do it this way.
*/
if (!rn_inithead(head, 32))
return 0;
if (off == 0) /* XXX MRT see above */
return 1; /* only do the rest for a real routing table */
rnh = *head;
rnh->rnh_addaddr = in_addroute;
rnh->rnh_matchaddr = in_matroute;
rnh->rnh_close = in_clsroute;
if (_in_rt_was_here == 0 ) {
callout_init(&V_rtq_timer, CALLOUT_MPSAFE);
in_rtqtimo(rnh); /* kick off timeout first time */
_in_rt_was_here = 1;
}
return 1;
}
/*
* This zaps old routes when the interface goes down or interface
* address is deleted. In the latter case, it deletes static routes
* that point to this address. If we don't do this, we may end up
* using the old address in the future. The ones we always want to
* get rid of are things like ARP entries, since the user might down
* the interface, walk over to a completely different network, and
* plug back in.
*/
struct in_ifadown_arg {
struct ifaddr *ifa;
int del;
};
static int
in_ifadownkill(struct radix_node *rn, void *xap)
{
struct in_ifadown_arg *ap = xap;
struct rtentry *rt = (struct rtentry *)rn;
RT_LOCK(rt);
if (rt->rt_ifa == ap->ifa &&
(ap->del || !(rt->rt_flags & RTF_STATIC))) {
/*
* We need to disable the automatic prune that happens
* in this case in rtrequest() because it will blow
* away the pointers that rn_walktree() needs in order to
* continue our descent. We will end up deleting all
* the routes that rtrequest() would have in any case,
* so that behavior is not needed there.
*/
rt->rt_flags &= ~RTF_CLONING;
rtexpunge(rt);
}
RT_UNLOCK(rt);
return 0;
}
int
in_ifadown(struct ifaddr *ifa, int delete)
{
+ INIT_VNET_NET(curvnet);
struct in_ifadown_arg arg;
struct radix_node_head *rnh;
int fibnum;
if (ifa->ifa_addr->sa_family != AF_INET)
return 1;
for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
rnh = V_rt_tables[fibnum][AF_INET];
arg.ifa = ifa;
arg.del = delete;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */
}
return 0;
}
/*
* inet versions of rt functions. These have fib extensions and
* for now will just reference the _fib variants.
* Eventually this order will be reversed.
*/
void
in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum)
{
rtalloc_ign_fib(ro, ignflags, fibnum);
}
int
in_rtrequest( int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt,
u_int fibnum)
{
return (rtrequest_fib(req, dst, gateway, netmask,
flags, ret_nrt, fibnum));
}
struct rtentry *
in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum)
{
return (rtalloc1_fib(dst, report, ignflags, fibnum));
}
void
in_rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src,
u_int fibnum)
{
rtredirect_fib(dst, gateway, netmask, flags, src, fibnum);
}
void
in_rtalloc(struct route *ro, u_int fibnum)
{
rtalloc_ign_fib(ro, 0UL, fibnum);
}
#if 0
int in_rt_getifa(struct rt_addrinfo *, u_int fibnum);
int in_rtioctl(u_long, caddr_t, u_int);
int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int);
#endif
Index: head/sys/netinet/ip6.h
===================================================================
--- head/sys/netinet/ip6.h (revision 183549)
+++ head/sys/netinet/ip6.h (revision 183550)
@@ -1,351 +1,354 @@
/* $FreeBSD$ */
/* $KAME: ip6.h,v 1.18 2001/03/29 05:34:30 itojun Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET_IP6_H_
#define _NETINET_IP6_H_
/*
* Definition for internet protocol version 6.
* RFC 2460
*/
struct ip6_hdr {
union {
struct ip6_hdrctl {
u_int32_t ip6_un1_flow; /* 20 bits of flow-ID */
u_int16_t ip6_un1_plen; /* payload length */
u_int8_t ip6_un1_nxt; /* next header */
u_int8_t ip6_un1_hlim; /* hop limit */
} ip6_un1;
u_int8_t ip6_un2_vfc; /* 4 bits version, top 4 bits class */
} ip6_ctlun;
struct in6_addr ip6_src; /* source address */
struct in6_addr ip6_dst; /* destination address */
} __packed;
#define ip6_vfc ip6_ctlun.ip6_un2_vfc
#define ip6_flow ip6_ctlun.ip6_un1.ip6_un1_flow
#define ip6_plen ip6_ctlun.ip6_un1.ip6_un1_plen
#define ip6_nxt ip6_ctlun.ip6_un1.ip6_un1_nxt
#define ip6_hlim ip6_ctlun.ip6_un1.ip6_un1_hlim
#define ip6_hops ip6_ctlun.ip6_un1.ip6_un1_hlim
#define IPV6_VERSION 0x60
#define IPV6_VERSION_MASK 0xf0
#if BYTE_ORDER == BIG_ENDIAN
#define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */
#define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */
#else
#if BYTE_ORDER == LITTLE_ENDIAN
#define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */
#define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */
#endif /* LITTLE_ENDIAN */
#endif
#if 1
/* ECN bits proposed by Sally Floyd */
#define IP6TOS_CE 0x01 /* congestion experienced */
#define IP6TOS_ECT 0x02 /* ECN-capable transport */
#endif
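/*
 * Sketch of the usual version check built from the definitions above
 * (illustrative):
 *
 *	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 *
 *	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
 *		goto bad;
 */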
/*
* Extension Headers
*/
struct ip6_ext {
u_int8_t ip6e_nxt;
u_int8_t ip6e_len;
} __packed;
/* Hop-by-Hop options header */
/* XXX should we pad it to force alignment on an 8-byte boundary? */
struct ip6_hbh {
u_int8_t ip6h_nxt; /* next header */
u_int8_t ip6h_len; /* length in units of 8 octets */
/* followed by options */
} __packed;
/* Destination options header */
/* XXX should we pad it to force alignment on an 8-byte boundary? */
struct ip6_dest {
u_int8_t ip6d_nxt; /* next header */
u_int8_t ip6d_len; /* length in units of 8 octets */
/* followed by options */
} __packed;
/* Option types and related macros */
#define IP6OPT_PAD1 0x00 /* 00 0 00000 */
#define IP6OPT_PADN 0x01 /* 00 0 00001 */
#define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */
#define IP6OPT_NSAP_ADDR 0xC3 /* 11 0 00011 */
#define IP6OPT_TUNNEL_LIMIT 0x04 /* 00 0 00100 */
#ifndef _KERNEL
#define IP6OPT_RTALERT 0x05 /* 00 0 00101 (KAME definition) */
#endif
#define IP6OPT_ROUTER_ALERT 0x05 /* 00 0 00101 (RFC3542, recommended) */
#define IP6OPT_RTALERT_LEN 4
#define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */
#define IP6OPT_RTALERT_RSVP 1 /* Datagram contains an RSVP message */
#define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */
#define IP6OPT_MINLEN 2
#define IP6OPT_EID 0x8a /* 10 0 01010 */
#define IP6OPT_TYPE(o) ((o) & 0xC0)
#define IP6OPT_TYPE_SKIP 0x00
#define IP6OPT_TYPE_DISCARD 0x40
#define IP6OPT_TYPE_FORCEICMP 0x80
#define IP6OPT_TYPE_ICMP 0xC0
#define IP6OPT_MUTABLE 0x20
/* IPv6 options: common part */
struct ip6_opt {
u_int8_t ip6o_type;
u_int8_t ip6o_len;
} __packed;
/* Jumbo Payload Option */
struct ip6_opt_jumbo {
u_int8_t ip6oj_type;
u_int8_t ip6oj_len;
u_int8_t ip6oj_jumbo_len[4];
} __packed;
#define IP6OPT_JUMBO_LEN 6
/* NSAP Address Option */
struct ip6_opt_nsap {
u_int8_t ip6on_type;
u_int8_t ip6on_len;
u_int8_t ip6on_src_nsap_len;
u_int8_t ip6on_dst_nsap_len;
/* followed by source NSAP */
/* followed by destination NSAP */
} __packed;
/* Tunnel Limit Option */
struct ip6_opt_tunnel {
u_int8_t ip6ot_type;
u_int8_t ip6ot_len;
u_int8_t ip6ot_encap_limit;
} __packed;
/* Router Alert Option */
struct ip6_opt_router {
u_int8_t ip6or_type;
u_int8_t ip6or_len;
u_int8_t ip6or_value[2];
} __packed;
/* Router alert values (in network byte order) */
#if BYTE_ORDER == BIG_ENDIAN
#define IP6_ALERT_MLD 0x0000
#define IP6_ALERT_RSVP 0x0001
#define IP6_ALERT_AN 0x0002
#else
#if BYTE_ORDER == LITTLE_ENDIAN
#define IP6_ALERT_MLD 0x0000
#define IP6_ALERT_RSVP 0x0100
#define IP6_ALERT_AN 0x0200
#endif /* LITTLE_ENDIAN */
#endif
/* Routing header */
struct ip6_rthdr {
u_int8_t ip6r_nxt; /* next header */
u_int8_t ip6r_len; /* length in units of 8 octets */
u_int8_t ip6r_type; /* routing type */
u_int8_t ip6r_segleft; /* segments left */
/* followed by routing type specific data */
} __packed;
/* Type 0 Routing header */
struct ip6_rthdr0 {
u_int8_t ip6r0_nxt; /* next header */
u_int8_t ip6r0_len; /* length in units of 8 octets */
u_int8_t ip6r0_type; /* always zero */
u_int8_t ip6r0_segleft; /* segments left */
u_int32_t ip6r0_reserved; /* reserved field */
/* followed by up to 127 struct in6_addr */
} __packed;
/* Fragment header */
struct ip6_frag {
u_int8_t ip6f_nxt; /* next header */
u_int8_t ip6f_reserved; /* reserved field */
u_int16_t ip6f_offlg; /* offset, reserved, and flag */
u_int32_t ip6f_ident; /* identification */
} __packed;
#if BYTE_ORDER == BIG_ENDIAN
#define IP6F_OFF_MASK 0xfff8 /* mask out offset from _offlg */
#define IP6F_RESERVED_MASK 0x0006 /* reserved bits in ip6f_offlg */
#define IP6F_MORE_FRAG 0x0001 /* more-fragments flag */
#else /* BYTE_ORDER == LITTLE_ENDIAN */
#define IP6F_OFF_MASK 0xf8ff /* mask out offset from _offlg */
#define IP6F_RESERVED_MASK 0x0600 /* reserved bits in ip6f_offlg */
#define IP6F_MORE_FRAG 0x0100 /* more-fragments flag */
#endif /* BYTE_ORDER == LITTLE_ENDIAN */
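/*
 * Sketch of fragment field extraction (illustrative): the masks above
 * are byte-order adjusted, so they apply directly to the network-order
 * ip6f_offlg field:
 *
 *	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
 *	morefrag = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) != 0;
 */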
/*
* Internet implementation parameters.
*/
#define IPV6_MAXHLIM 255 /* maximum hoplimit */
#define IPV6_DEFHLIM 64 /* default hlim */
#define IPV6_FRAGTTL 120 /* ttl for fragment packets, in slowtimo tick */
#define IPV6_HLIMDEC 1 /* subtracted when forwarding */
#define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */
#define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/
#define IPV6_MAXOPTHDR 2048 /* max option header size, 256 64-bit words */
#ifdef _KERNEL
/*
* IP6_EXTHDR_CHECK ensures that the region between the IP6 header and the
* target header (including IPv6 itself, extension headers and
* TCP/UDP/ICMP6 headers) is contiguous. KAME requires drivers
* to store incoming data into one internal mbuf or one or more external
* mbufs (never into two or more internal mbufs). Thus, the third case is
* supposed never to be matched but is prepared just in case.
*/
#define IP6_EXTHDR_CHECK(m, off, hlen, ret) \
do { \
if ((m)->m_next != NULL) { \
if (((m)->m_flags & M_LOOP) && \
((m)->m_len < (off) + (hlen)) && \
(((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \
V_ip6stat.ip6s_exthdrtoolong++; \
return ret; \
} else if ((m)->m_flags & M_EXT) { \
if ((m)->m_len < (off) + (hlen)) { \
V_ip6stat.ip6s_exthdrtoolong++; \
m_freem(m); \
return ret; \
} \
} else { \
if ((m)->m_len < (off) + (hlen)) { \
V_ip6stat.ip6s_exthdrtoolong++; \
m_freem(m); \
return ret; \
} \
} \
} else { \
if ((m)->m_len < (off) + (hlen)) { \
V_ip6stat.ip6s_tooshort++; \
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); \
m_freem(m); \
return ret; \
} \
} \
} while (/*CONSTCOND*/ 0)
/*
* IP6_EXTHDR_GET ensures that the intermediate protocol header (from
* "off" for "len" bytes) is located in a single mbuf, in a contiguous
* memory region. A pointer to the region is returned in the pointer
* variable "val", with type "typ".
* IP6_EXTHDR_GET0 does the same, except that it aligns the structure at
* the very top of the mbuf. GET0 is more likely to make a memory copy
* than GET.
*
* XXX we're now testing this, needs m_pulldown()
*/
#define IP6_EXTHDR_GET(val, typ, m, off, len) \
do { \
struct mbuf *t; \
int tmp; \
if ((m)->m_len >= (off) + (len)) \
(val) = (typ)(mtod((m), caddr_t) + (off)); \
else { \
t = m_pulldown((m), (off), (len), &tmp); \
if (t) { \
if (t->m_len < tmp + (len)) \
panic("m_pulldown malfunction"); \
(val) = (typ)(mtod(t, caddr_t) + tmp); \
} else { \
(val) = (typ)NULL; \
(m) = NULL; \
} \
} \
} while (/*CONSTCOND*/ 0)
#define IP6_EXTHDR_GET0(val, typ, m, off, len) \
do { \
struct mbuf *t; \
if ((off) == 0) \
(val) = (typ)mtod(m, caddr_t); \
else { \
t = m_pulldown((m), (off), (len), NULL); \
if (t) { \
if (t->m_len < (len)) \
panic("m_pulldown malfunction"); \
(val) = (typ)mtod(t, caddr_t); \
} else { \
(val) = (typ)NULL; \
(m) = NULL; \
} \
} \
} while (/*CONSTCOND*/ 0)
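/*
 * Typical IP6_EXTHDR_GET usage sketch (illustrative; compare the
 * carp6_input() caller later in this change): on failure both "val"
 * and "m" are set to NULL and the mbuf chain has been freed:
 *
 *	struct carp_header *ch;
 *
 *	IP6_EXTHDR_GET(ch, struct carp_header *, m, off, sizeof(*ch));
 *	if (ch == NULL)
 *		return (IPPROTO_DONE);
 */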
+
+#include <netinet6/vinet6.h>
+
#endif /*_KERNEL*/
#endif /* not _NETINET_IP6_H_ */
Index: head/sys/netinet/ip_carp.c
===================================================================
--- head/sys/netinet/ip_carp.c (revision 183549)
+++ head/sys/netinet/ip_carp.c (revision 183550)
@@ -1,2256 +1,2259 @@
/*
* Copyright (c) 2002 Michael Shalayeff. All rights reserved.
* Copyright (c) 2003 Ryan McBride. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_carp.h"
#include "opt_bpf.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/time.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/signalvar.h>
#include <sys/filio.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/vimage.h>
#include <machine/stdarg.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/fddi.h>
#include <net/iso88025.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/if_ether.h>
#include <machine/in_cksum.h>
#endif
#ifdef INET6
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif
#include <crypto/sha1.h>
#include <netinet/ip_carp.h>
#define CARP_IFNAME "carp"
static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
SYSCTL_DECL(_net_inet_carp);
struct carp_softc {
struct ifnet *sc_ifp; /* Interface clue */
struct ifnet *sc_carpdev; /* Pointer to parent interface */
struct in_ifaddr *sc_ia; /* primary iface address */
struct ip_moptions sc_imo;
#ifdef INET6
struct in6_ifaddr *sc_ia6; /* primary iface address v6 */
struct ip6_moptions sc_im6o;
#endif /* INET6 */
TAILQ_ENTRY(carp_softc) sc_list;
enum { INIT = 0, BACKUP, MASTER } sc_state;
int sc_flags_backup;
int sc_suppress;
int sc_sendad_errors;
#define CARP_SENDAD_MAX_ERRORS 3
int sc_sendad_success;
#define CARP_SENDAD_MIN_SUCCESS 3
int sc_vhid;
int sc_advskew;
int sc_naddrs;
int sc_naddrs6;
int sc_advbase; /* seconds */
int sc_init_counter;
u_int64_t sc_counter;
/* authentication */
#define CARP_HMAC_PAD 64
unsigned char sc_key[CARP_KEY_LEN];
unsigned char sc_pad[CARP_HMAC_PAD];
SHA1_CTX sc_sha1;
struct callout sc_ad_tmo; /* advertisement timeout */
struct callout sc_md_tmo; /* master down timeout */
struct callout sc_md6_tmo; /* master down timeout */
LIST_ENTRY(carp_softc) sc_next; /* Interface clue */
};
#define SC2IFP(sc) ((sc)->sc_ifp)
int carp_suppress_preempt = 0;
int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */
SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
&carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
&carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
&carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
&carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
&carp_suppress_preempt, 0, "Preemption is suppressed");
struct carpstats carpstats;
SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
&carpstats, carpstats,
"CARP statistics (struct carpstats, netinet/ip_carp.h)");
struct carp_if {
TAILQ_HEAD(, carp_softc) vhif_vrs;
int vhif_nvrs;
struct ifnet *vhif_ifp;
struct mtx vhif_mtx;
};
/* Get carp_if from softc. Valid after carp_set_addr{,6}. */
#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp)
/* lock per carp_if queue */
#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \
NULL, MTX_DEF)
#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx)
#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED)
#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx)
#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx)
#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx)
#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx)
#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED)
#define CARP_LOG(...) do { \
if (carp_opts[CARPCTL_LOG] > 0) \
log(LOG_INFO, __VA_ARGS__); \
} while (0)
#define CARP_DEBUG(...) do { \
if (carp_opts[CARPCTL_LOG] > 1) \
log(LOG_DEBUG, __VA_ARGS__); \
} while (0)
static void carp_hmac_prepare(struct carp_softc *);
static void carp_hmac_generate(struct carp_softc *, u_int32_t *,
unsigned char *);
static int carp_hmac_verify(struct carp_softc *, u_int32_t *,
unsigned char *);
static void carp_setroute(struct carp_softc *, int);
static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
static int carp_clone_create(struct if_clone *, int, caddr_t);
static void carp_clone_destroy(struct ifnet *);
static void carpdetach(struct carp_softc *, int);
static int carp_prepare_ad(struct mbuf *, struct carp_softc *,
struct carp_header *);
static void carp_send_ad_all(void);
static void carp_send_ad(void *);
static void carp_send_ad_locked(struct carp_softc *);
static void carp_send_arp(struct carp_softc *);
static void carp_master_down(void *);
static void carp_master_down_locked(struct carp_softc *);
static int carp_ioctl(struct ifnet *, u_long, caddr_t);
static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
static void carp_start(struct ifnet *);
static void carp_setrun(struct carp_softc *, sa_family_t);
static void carp_set_state(struct carp_softc *, int);
static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
static void carp_multicast_cleanup(struct carp_softc *);
static int carp_set_addr(struct carp_softc *, struct sockaddr_in *);
static int carp_del_addr(struct carp_softc *, struct sockaddr_in *);
static void carp_carpdev_state_locked(struct carp_if *);
static void carp_sc_state_locked(struct carp_softc *);
#ifdef INET6
static void carp_send_na(struct carp_softc *);
static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
static void carp_multicast6_cleanup(struct carp_softc *);
#endif
static LIST_HEAD(, carp_softc) carpif_list;
static struct mtx carp_mtx;
IFC_SIMPLE_DECLARE(carp, 0);
static eventhandler_tag if_detach_event_tag;
static __inline u_int16_t
carp_cksum(struct mbuf *m, int len)
{
return (in_cksum(m, len));
}
static void
carp_hmac_prepare(struct carp_softc *sc)
{
u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
u_int8_t vhid = sc->sc_vhid & 0xff;
struct ifaddr *ifa;
int i, found;
#ifdef INET
struct in_addr last, cur, in;
#endif
#ifdef INET6
struct in6_addr last6, cur6, in6;
#endif
if (sc->sc_carpdev)
CARP_SCLOCK(sc);
/* XXX: possible race here */
/* compute ipad from key */
bzero(sc->sc_pad, sizeof(sc->sc_pad));
bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
for (i = 0; i < sizeof(sc->sc_pad); i++)
sc->sc_pad[i] ^= 0x36;
/* precompute first part of inner hash */
SHA1Init(&sc->sc_sha1);
SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
#ifdef INET
cur.s_addr = 0;
do {
found = 0;
last = cur;
cur.s_addr = 0xffffffff;
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
if (ifa->ifa_addr->sa_family == AF_INET &&
ntohl(in.s_addr) > ntohl(last.s_addr) &&
ntohl(in.s_addr) < ntohl(cur.s_addr)) {
cur.s_addr = in.s_addr;
found++;
}
}
if (found)
SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
} while (found);
#endif /* INET */
#ifdef INET6
memset(&cur6, 0, sizeof(cur6));
do {
found = 0;
last6 = cur6;
memset(&cur6, 0xff, sizeof(cur6));
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
if (IN6_IS_SCOPE_EMBED(&in6))
in6.s6_addr16[1] = 0;
if (ifa->ifa_addr->sa_family == AF_INET6 &&
memcmp(&in6, &last6, sizeof(in6)) > 0 &&
memcmp(&in6, &cur6, sizeof(in6)) < 0) {
cur6 = in6;
found++;
}
}
if (found)
SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
} while (found);
#endif /* INET6 */
/* convert ipad to opad */
for (i = 0; i < sizeof(sc->sc_pad); i++)
sc->sc_pad[i] ^= 0x36 ^ 0x5c;
if (sc->sc_carpdev)
CARP_SCUNLOCK(sc);
}
static void
carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2],
unsigned char md[20])
{
SHA1_CTX sha1ctx;
/* fetch first half of inner hash */
bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
SHA1Final(md, &sha1ctx);
/* outer hash */
SHA1Init(&sha1ctx);
SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
SHA1Update(&sha1ctx, md, 20);
SHA1Final(md, &sha1ctx);
}
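/*
 * The two routines above implement the standard HMAC-SHA1 construction
 * (sketch):
 *
 *	ipad = key ^ 0x36..36, opad = key ^ 0x5c..5c
 *	md = SHA1(opad || SHA1(ipad || version || type || vhid ||
 *	    sorted vhost addresses || counter))
 *
 * carp_hmac_prepare() precomputes the inner hash up to (but excluding)
 * the counter and derives opad from ipad; carp_hmac_generate() finishes
 * both hashes for each advertisement.
 */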
static int
carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2],
unsigned char md[20])
{
unsigned char md2[20];
CARP_SCLOCK_ASSERT(sc);
carp_hmac_generate(sc, counter, md2);
return (bcmp(md, md2, sizeof(md2)));
}
static void
carp_setroute(struct carp_softc *sc, int cmd)
{
struct ifaddr *ifa;
int s;
if (sc->sc_carpdev)
CARP_SCLOCK_ASSERT(sc);
s = splnet();
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family == AF_INET &&
sc->sc_carpdev != NULL) {
int count = carp_addrcount(
(struct carp_if *)sc->sc_carpdev->if_carp,
ifatoia(ifa), CARP_COUNT_MASTER);
if ((cmd == RTM_ADD && count == 1) ||
(cmd == RTM_DELETE && count == 0))
rtinit(ifa, cmd, RTF_UP | RTF_HOST);
}
#ifdef INET6
if (ifa->ifa_addr->sa_family == AF_INET6) {
if (cmd == RTM_ADD)
in6_ifaddloop(ifa);
else
in6_ifremloop(ifa);
}
#endif /* INET6 */
}
splx(s);
}
static int
carp_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct carp_softc *sc;
struct ifnet *ifp;
MALLOC(sc, struct carp_softc *, sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
ifp = SC2IFP(sc) = if_alloc(IFT_ETHER);
if (ifp == NULL) {
FREE(sc, M_CARP);
return (ENOSPC);
}
sc->sc_flags_backup = 0;
sc->sc_suppress = 0;
sc->sc_advbase = CARP_DFLTINTV;
sc->sc_vhid = -1; /* required setting */
sc->sc_advskew = 0;
sc->sc_init_counter = 1;
sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */
#ifdef INET6
sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
#endif
sc->sc_imo.imo_membership = (struct in_multi **)malloc(
(sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
M_WAITOK);
sc->sc_imo.imo_mfilters = NULL;
sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
sc->sc_imo.imo_multicast_vif = -1;
callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE);
callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE);
callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE);
ifp->if_softc = sc;
if_initname(ifp, CARP_IFNAME, unit);
ifp->if_mtu = ETHERMTU;
ifp->if_flags = IFF_LOOPBACK;
ifp->if_ioctl = carp_ioctl;
ifp->if_output = carp_looutput;
ifp->if_start = carp_start;
ifp->if_type = IFT_CARP;
ifp->if_snd.ifq_maxlen = ifqmaxlen;
ifp->if_hdrlen = 0;
if_attach(ifp);
bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t));
mtx_lock(&carp_mtx);
LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
mtx_unlock(&carp_mtx);
return (0);
}
static void
carp_clone_destroy(struct ifnet *ifp)
{
struct carp_softc *sc = ifp->if_softc;
if (sc->sc_carpdev)
CARP_SCLOCK(sc);
carpdetach(sc, 1); /* Returns unlocked. */
mtx_lock(&carp_mtx);
LIST_REMOVE(sc, sc_next);
mtx_unlock(&carp_mtx);
bpfdetach(ifp);
if_detach(ifp);
if_free_type(ifp, IFT_ETHER);
free(sc->sc_imo.imo_membership, M_CARP);
free(sc, M_CARP);
}
/*
* This function can be called on the CARP interface destroy path,
* and in case of the removal of the underlying interface as
* well. We differentiate these two cases. In the latter case
* we do not cleanup our multicast memberships, since they
* are already freed. Also, in the latter case we do not
* release the lock on return, because the function will be
* called once more, for another CARP instance on the same
* interface.
*/
static void
carpdetach(struct carp_softc *sc, int unlock)
{
struct carp_if *cif;
callout_stop(&sc->sc_ad_tmo);
callout_stop(&sc->sc_md_tmo);
callout_stop(&sc->sc_md6_tmo);
if (sc->sc_suppress)
carp_suppress_preempt--;
sc->sc_suppress = 0;
if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
carp_suppress_preempt--;
sc->sc_sendad_errors = 0;
carp_set_state(sc, INIT);
SC2IFP(sc)->if_flags &= ~IFF_UP;
carp_setrun(sc, 0);
if (unlock)
carp_multicast_cleanup(sc);
#ifdef INET6
carp_multicast6_cleanup(sc);
#endif
if (sc->sc_carpdev != NULL) {
cif = (struct carp_if *)sc->sc_carpdev->if_carp;
CARP_LOCK_ASSERT(cif);
TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
if (!--cif->vhif_nvrs) {
ifpromisc(sc->sc_carpdev, 0);
sc->sc_carpdev->if_carp = NULL;
CARP_LOCK_DESTROY(cif);
FREE(cif, M_IFADDR);
} else if (unlock)
CARP_UNLOCK(cif);
sc->sc_carpdev = NULL;
}
}
/* Detach an interface from the carp. */
static void
carp_ifdetach(void *arg __unused, struct ifnet *ifp)
{
struct carp_if *cif = (struct carp_if *)ifp->if_carp;
struct carp_softc *sc, *nextsc;
if (cif == NULL)
return;
/*
* XXX: At the end of the for() cycle the lock will be destroyed.
*/
CARP_LOCK(cif);
for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) {
nextsc = TAILQ_NEXT(sc, sc_list);
carpdetach(sc, 0);
}
}
/*
* Process an input packet.
* We have rearranged the order of checks compared to the RFC; it seems
* more efficient this way, or is not possible otherwise.
*/
void
carp_input(struct mbuf *m, int hlen)
{
struct ip *ip = mtod(m, struct ip *);
struct carp_header *ch;
int iplen, len;
carpstats.carps_ipackets++;
if (!carp_opts[CARPCTL_ALLOW]) {
m_freem(m);
return;
}
/* check if received on a valid carp interface */
if (m->m_pkthdr.rcvif->if_carp == NULL) {
carpstats.carps_badif++;
CARP_LOG("carp_input: packet received on non-carp "
"interface: %s\n",
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return;
}
/* verify that the IP TTL is 255. */
if (ip->ip_ttl != CARP_DFLTTL) {
carpstats.carps_badttl++;
CARP_LOG("carp_input: received ttl %d != 255i on %s\n",
ip->ip_ttl,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return;
}
iplen = ip->ip_hl << 2;
if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
carpstats.carps_badlen++;
CARP_LOG("carp_input: received len %zd < "
"sizeof(struct carp_header)\n",
m->m_len - sizeof(struct ip));
m_freem(m);
return;
}
if (iplen + sizeof(*ch) < m->m_len) {
if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
carpstats.carps_hdrops++;
CARP_LOG("carp_input: pullup failed\n");
return;
}
ip = mtod(m, struct ip *);
}
ch = (struct carp_header *)((char *)ip + iplen);
/*
* verify that the received packet length is
* at least the length of the IP header plus the CARP header
*/
len = iplen + sizeof(*ch);
if (len > m->m_pkthdr.len) {
carpstats.carps_badlen++;
CARP_LOG("carp_input: packet too short %d on %s\n",
m->m_pkthdr.len,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return;
}
if ((m = m_pullup(m, len)) == NULL) {
carpstats.carps_hdrops++;
return;
}
ip = mtod(m, struct ip *);
ch = (struct carp_header *)((char *)ip + iplen);
/* verify the CARP checksum */
m->m_data += iplen;
if (carp_cksum(m, len - iplen)) {
carpstats.carps_badsum++;
CARP_LOG("carp_input: checksum failed on %s\n",
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return;
}
m->m_data -= iplen;
carp_input_c(m, ch, AF_INET);
}
#ifdef INET6
int
carp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct carp_header *ch;
u_int len;
carpstats.carps_ipackets6++;
if (!carp_opts[CARPCTL_ALLOW]) {
m_freem(m);
return (IPPROTO_DONE);
}
/* check if received on a valid carp interface */
if (m->m_pkthdr.rcvif->if_carp == NULL) {
carpstats.carps_badif++;
CARP_LOG("carp6_input: packet received on non-carp "
"interface: %s\n",
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return (IPPROTO_DONE);
}
/* verify that the IP TTL is 255 */
if (ip6->ip6_hlim != CARP_DFLTTL) {
carpstats.carps_badttl++;
CARP_LOG("carp6_input: received ttl %d != 255 on %s\n",
ip6->ip6_hlim,
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return (IPPROTO_DONE);
}
/* verify that we have a complete carp packet */
len = m->m_len;
IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
if (ch == NULL) {
carpstats.carps_badlen++;
CARP_LOG("carp6_input: packet size %u too small\n", len);
return (IPPROTO_DONE);
}
/* verify the CARP checksum */
m->m_data += *offp;
if (carp_cksum(m, sizeof(*ch))) {
carpstats.carps_badsum++;
CARP_LOG("carp6_input: checksum failed, on %s\n",
m->m_pkthdr.rcvif->if_xname);
m_freem(m);
return (IPPROTO_DONE);
}
m->m_data -= *offp;
carp_input_c(m, ch, AF_INET6);
return (IPPROTO_DONE);
}
#endif /* INET6 */
static void
carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
{
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct carp_softc *sc;
u_int64_t tmp_counter;
struct timeval sc_tv, ch_tv;
/* verify that the VHID is valid on the receiving interface */
CARP_LOCK(ifp->if_carp);
TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list)
if (sc->sc_vhid == ch->carp_vhid)
break;
if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) &&
(SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
carpstats.carps_badvhid++;
CARP_UNLOCK(ifp->if_carp);
m_freem(m);
return;
}
getmicrotime(&SC2IFP(sc)->if_lastchange);
SC2IFP(sc)->if_ipackets++;
SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
if (bpf_peers_present(SC2IFP(sc)->if_bpf)) {
struct ip *ip = mtod(m, struct ip *);
uint32_t af1 = af;
/* BPF wants net byte order */
ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
ip->ip_off = htons(ip->ip_off);
bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m);
}
/* verify the CARP version. */
if (ch->carp_version != CARP_VERSION) {
carpstats.carps_badver++;
SC2IFP(sc)->if_ierrors++;
CARP_UNLOCK(ifp->if_carp);
CARP_LOG("%s; invalid version %d\n",
SC2IFP(sc)->if_xname,
ch->carp_version);
m_freem(m);
return;
}
/* verify the hash */
if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
carpstats.carps_badauth++;
SC2IFP(sc)->if_ierrors++;
CARP_UNLOCK(ifp->if_carp);
CARP_LOG("%s: incorrect hash\n", SC2IFP(sc)->if_xname);
m_freem(m);
return;
}
tmp_counter = ntohl(ch->carp_counter[0]);
tmp_counter = tmp_counter<<32;
tmp_counter += ntohl(ch->carp_counter[1]);
/* XXX Replay protection goes here */
sc->sc_init_counter = 0;
sc->sc_counter = tmp_counter;
sc_tv.tv_sec = sc->sc_advbase;
if (carp_suppress_preempt && sc->sc_advskew < 240)
sc_tv.tv_usec = 240 * 1000000 / 256;
else
sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
ch_tv.tv_sec = ch->carp_advbase;
ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
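/*
 * Illustrative arithmetic (not in the original source): with
 * advbase = 1 and advskew = 100, the advertised interval works out
 * to 1 s + 100 * 1000000 / 256 us = 1.390625 s, so a larger skew
 * means slower advertisements and a less preferred master.
 */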
switch (sc->sc_state) {
case INIT:
break;
case MASTER:
/*
* If we receive an advertisement from a master who's going to
* be more frequent than us, go into BACKUP state.
*/
if (timevalcmp(&sc_tv, &ch_tv, >) ||
timevalcmp(&sc_tv, &ch_tv, ==)) {
callout_stop(&sc->sc_ad_tmo);
CARP_DEBUG("%s: MASTER -> BACKUP "
"(more frequent advertisement received)\n",
SC2IFP(sc)->if_xname);
carp_set_state(sc, BACKUP);
carp_setrun(sc, 0);
carp_setroute(sc, RTM_DELETE);
}
break;
case BACKUP:
/*
* If we're pre-empting masters who advertise slower than us,
* and this one claims to be slower, treat him as down.
*/
if (carp_opts[CARPCTL_PREEMPT] &&
timevalcmp(&sc_tv, &ch_tv, <)) {
CARP_DEBUG("%s: BACKUP -> MASTER "
"(preempting a slower master)\n",
SC2IFP(sc)->if_xname);
carp_master_down_locked(sc);
break;
}
/*
* If the master is going to advertise at such a low frequency
* that he's guaranteed to time out, we might as well just
* treat him as timed out now.
*/
sc_tv.tv_sec = sc->sc_advbase * 3;
if (timevalcmp(&sc_tv, &ch_tv, <)) {
CARP_DEBUG("%s: BACKUP -> MASTER "
"(master timed out)\n",
SC2IFP(sc)->if_xname);
carp_master_down_locked(sc);
break;
}
/*
* Otherwise, we reset the counter and wait for the next
* advertisement.
*/
carp_setrun(sc, af);
break;
}
CARP_UNLOCK(ifp->if_carp);
m_freem(m);
return;
}
static int
carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
{
struct m_tag *mtag;
struct ifnet *ifp = SC2IFP(sc);
if (sc->sc_init_counter) {
/* this could also be seconds since unix epoch */
sc->sc_counter = arc4random();
sc->sc_counter = sc->sc_counter << 32;
sc->sc_counter += arc4random();
} else
sc->sc_counter++;
ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
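/*
 * Note (added for clarity): the 64-bit counter is carried as two
 * 32-bit words in network byte order; carp_input_c() reassembles
 * them symmetrically before storing the value in sc_counter.
 */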
carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
/* Tag packet for carp_output */
mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT);
if (mtag == NULL) {
m_freem(m);
SC2IFP(sc)->if_oerrors++;
return (ENOMEM);
}
bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
m_tag_prepend(m, mtag);
return (0);
}
static void
carp_send_ad_all(void)
{
struct carp_softc *sc;
mtx_lock(&carp_mtx);
LIST_FOREACH(sc, &carpif_list, sc_next) {
if (sc->sc_carpdev == NULL)
continue;
CARP_SCLOCK(sc);
if ((SC2IFP(sc)->if_flags & IFF_UP) &&
(SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) &&
sc->sc_state == MASTER)
carp_send_ad_locked(sc);
CARP_SCUNLOCK(sc);
}
mtx_unlock(&carp_mtx);
}
static void
carp_send_ad(void *v)
{
struct carp_softc *sc = v;
CARP_SCLOCK(sc);
carp_send_ad_locked(sc);
CARP_SCUNLOCK(sc);
}
static void
carp_send_ad_locked(struct carp_softc *sc)
{
struct carp_header ch;
struct timeval tv;
struct carp_header *ch_ptr;
struct mbuf *m;
int len, advbase, advskew;
CARP_SCLOCK_ASSERT(sc);
/* bow out if we've lost our UPness or RUNNINGuiness */
if (!((SC2IFP(sc)->if_flags & IFF_UP) &&
(SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
advbase = 255;
advskew = 255;
} else {
advbase = sc->sc_advbase;
if (!carp_suppress_preempt || sc->sc_advskew > 240)
advskew = sc->sc_advskew;
else
advskew = 240;
tv.tv_sec = advbase;
tv.tv_usec = advskew * 1000000 / 256;
}
ch.carp_version = CARP_VERSION;
ch.carp_type = CARP_ADVERTISEMENT;
ch.carp_vhid = sc->sc_vhid;
ch.carp_advbase = advbase;
ch.carp_advskew = advskew;
ch.carp_authlen = 7; /* XXX DEFINE */
ch.carp_pad1 = 0; /* must be zero */
ch.carp_cksum = 0;
#ifdef INET
+ INIT_VNET_INET(curvnet);
if (sc->sc_ia) {
struct ip *ip;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
SC2IFP(sc)->if_oerrors++;
carpstats.carps_onomem++;
/* XXX maybe less ? */
if (advbase != 255 || advskew != 255)
callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
carp_send_ad, sc);
return;
}
len = sizeof(*ip) + sizeof(ch);
m->m_pkthdr.len = len;
m->m_pkthdr.rcvif = NULL;
m->m_len = len;
MH_ALIGN(m, m->m_len);
m->m_flags |= M_MCAST;
ip = mtod(m, struct ip *);
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(*ip) >> 2;
ip->ip_tos = IPTOS_LOWDELAY;
ip->ip_len = len;
ip->ip_id = ip_newid();
ip->ip_off = IP_DF;
ip->ip_ttl = CARP_DFLTTL;
ip->ip_p = IPPROTO_CARP;
ip->ip_sum = 0;
ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr;
ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
ch_ptr = (struct carp_header *)(&ip[1]);
bcopy(&ch, ch_ptr, sizeof(ch));
if (carp_prepare_ad(m, sc, ch_ptr))
return;
m->m_data += sizeof(*ip);
ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
m->m_data -= sizeof(*ip);
getmicrotime(&SC2IFP(sc)->if_lastchange);
SC2IFP(sc)->if_opackets++;
SC2IFP(sc)->if_obytes += len;
carpstats.carps_opackets++;
if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
SC2IFP(sc)->if_oerrors++;
if (sc->sc_sendad_errors < INT_MAX)
sc->sc_sendad_errors++;
if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
carp_suppress_preempt++;
if (carp_suppress_preempt == 1) {
CARP_SCUNLOCK(sc);
carp_send_ad_all();
CARP_SCLOCK(sc);
}
}
sc->sc_sendad_success = 0;
} else {
if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
if (++sc->sc_sendad_success >=
CARP_SENDAD_MIN_SUCCESS) {
carp_suppress_preempt--;
sc->sc_sendad_errors = 0;
}
} else
sc->sc_sendad_errors = 0;
}
}
#endif /* INET */
#ifdef INET6
if (sc->sc_ia6) {
struct ip6_hdr *ip6;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
SC2IFP(sc)->if_oerrors++;
carpstats.carps_onomem++;
/* XXX maybe less ? */
if (advbase != 255 || advskew != 255)
callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
carp_send_ad, sc);
return;
}
len = sizeof(*ip6) + sizeof(ch);
m->m_pkthdr.len = len;
m->m_pkthdr.rcvif = NULL;
m->m_len = len;
MH_ALIGN(m, m->m_len);
m->m_flags |= M_MCAST;
ip6 = mtod(m, struct ip6_hdr *);
bzero(ip6, sizeof(*ip6));
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_hlim = CARP_DFLTTL;
ip6->ip6_nxt = IPPROTO_CARP;
bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
sizeof(struct in6_addr));
/* set the multicast destination */
ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
ip6->ip6_dst.s6_addr8[15] = 0x12;
if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
SC2IFP(sc)->if_oerrors++;
m_freem(m);
CARP_LOG("%s: in6_setscope failed\n", __func__);
return;
}
ch_ptr = (struct carp_header *)(&ip6[1]);
bcopy(&ch, ch_ptr, sizeof(ch));
if (carp_prepare_ad(m, sc, ch_ptr))
return;
m->m_data += sizeof(*ip6);
ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
m->m_data -= sizeof(*ip6);
getmicrotime(&SC2IFP(sc)->if_lastchange);
SC2IFP(sc)->if_opackets++;
SC2IFP(sc)->if_obytes += len;
carpstats.carps_opackets6++;
if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
SC2IFP(sc)->if_oerrors++;
if (sc->sc_sendad_errors < INT_MAX)
sc->sc_sendad_errors++;
if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
carp_suppress_preempt++;
if (carp_suppress_preempt == 1) {
CARP_SCUNLOCK(sc);
carp_send_ad_all();
CARP_SCLOCK(sc);
}
}
sc->sc_sendad_success = 0;
} else {
if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
if (++sc->sc_sendad_success >=
CARP_SENDAD_MIN_SUCCESS) {
carp_suppress_preempt--;
sc->sc_sendad_errors = 0;
}
} else
sc->sc_sendad_errors = 0;
}
}
#endif /* INET6 */
if (advbase != 255 || advskew != 255)
callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
carp_send_ad, sc);
}
/*
* Broadcast a gratuitous ARP request containing
* the virtual router MAC address for each IP address
* associated with the virtual router.
*/
static void
carp_send_arp(struct carp_softc *sc)
{
struct ifaddr *ifa;
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */
arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp));
DELAY(1000); /* XXX */
}
}
#ifdef INET6
static void
carp_send_na(struct carp_softc *sc)
{
struct ifaddr *ifa;
struct in6_addr *in6;
static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
nd6_na_output(sc->sc_carpdev, &mcast, in6,
ND_NA_FLAG_OVERRIDE, 1, NULL);
DELAY(1000); /* XXX */
}
}
#endif /* INET6 */
static int
carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
{
struct carp_softc *vh;
struct ifaddr *ifa;
int count = 0;
CARP_LOCK_ASSERT(cif);
TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
if ((type == CARP_COUNT_RUNNING &&
(SC2IFP(vh)->if_flags & IFF_UP) &&
(SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) ||
(type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) {
TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
ifa_list) {
if (ifa->ifa_addr->sa_family == AF_INET &&
ia->ia_addr.sin_addr.s_addr ==
ifatoia(ifa)->ia_addr.sin_addr.s_addr)
count++;
}
}
}
return (count);
}
int
carp_iamatch(void *v, struct in_ifaddr *ia,
struct in_addr *isaddr, u_int8_t **enaddr)
{
struct carp_if *cif = v;
struct carp_softc *vh;
int index, count = 0;
struct ifaddr *ifa;
CARP_LOCK(cif);
if (carp_opts[CARPCTL_ARPBALANCE]) {
/*
* XXX proof of concept implementation.
* We use the source ip to decide which virtual host should
* handle the request. If we're master of that virtual host,
* then we respond, otherwise, just drop the arp packet on
* the floor.
*/
count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING);
if (count == 0) {
/* should never reach this */
CARP_UNLOCK(cif);
return (0);
}
/* this should be a hash, like pf_hash() */
index = ntohl(isaddr->s_addr) % count;
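/*
 * Illustrative example (not in the original source): for source
 * address 192.0.2.55, ntohl() yields 0xc0000237; with count = 2
 * running addresses, 0xc0000237 % 2 selects index 1, so the second
 * matching address answers if its vhost is MASTER.
 */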
count = 0;
TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
if ((SC2IFP(vh)->if_flags & IFF_UP) &&
(SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) {
TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
ifa_list) {
if (ifa->ifa_addr->sa_family ==
AF_INET &&
ia->ia_addr.sin_addr.s_addr ==
ifatoia(ifa)->ia_addr.sin_addr.s_addr) {
if (count == index) {
if (vh->sc_state ==
MASTER) {
*enaddr = IF_LLADDR(vh->sc_ifp);
CARP_UNLOCK(cif);
return (1);
} else {
CARP_UNLOCK(cif);
return (0);
}
}
count++;
}
}
}
}
} else {
TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
if ((SC2IFP(vh)->if_flags & IFF_UP) &&
(SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
ia->ia_ifp == SC2IFP(vh) &&
vh->sc_state == MASTER) {
*enaddr = IF_LLADDR(vh->sc_ifp);
CARP_UNLOCK(cif);
return (1);
}
}
}
CARP_UNLOCK(cif);
return (0);
}
#ifdef INET6
struct ifaddr *
carp_iamatch6(void *v, struct in6_addr *taddr)
{
struct carp_if *cif = v;
struct carp_softc *vh;
struct ifaddr *ifa;
CARP_LOCK(cif);
TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
if (IN6_ARE_ADDR_EQUAL(taddr,
&ifatoia6(ifa)->ia_addr.sin6_addr) &&
(SC2IFP(vh)->if_flags & IFF_UP) &&
(SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
vh->sc_state == MASTER) {
CARP_UNLOCK(cif);
return (ifa);
}
}
}
CARP_UNLOCK(cif);
return (NULL);
}
void *
carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr)
{
struct m_tag *mtag;
struct carp_if *cif = v;
struct carp_softc *sc;
struct ifaddr *ifa;
CARP_LOCK(cif);
TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
if (IN6_ARE_ADDR_EQUAL(taddr,
&ifatoia6(ifa)->ia_addr.sin6_addr) &&
(SC2IFP(sc)->if_flags & IFF_UP) &&
(SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) {
struct ifnet *ifp = SC2IFP(sc);
mtag = m_tag_get(PACKET_TAG_CARP,
sizeof(struct ifnet *), M_NOWAIT);
if (mtag == NULL) {
/* Better to answer without the tag than not at all. */
CARP_UNLOCK(cif);
return (IF_LLADDR(sc->sc_ifp));
}
bcopy(&ifp, (caddr_t)(mtag + 1),
sizeof(struct ifnet *));
m_tag_prepend(m, mtag);
CARP_UNLOCK(cif);
return (IF_LLADDR(sc->sc_ifp));
}
}
}
CARP_UNLOCK(cif);
return (NULL);
}
#endif
struct ifnet *
carp_forus(void *v, void *dhost)
{
struct carp_if *cif = v;
struct carp_softc *vh;
u_int8_t *ena = dhost;
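/*
 * Note (added for clarity): the byte checks below reject anything
 * that is not a CARP virtual MAC, i.e. anything outside
 * 00:00:5e:00:01:xx (the same block VRRP uses for its virtual MACs).
 */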
if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
return (NULL);
CARP_LOCK(cif);
TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list)
if ((SC2IFP(vh)->if_flags & IFF_UP) &&
(SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
vh->sc_state == MASTER &&
!bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) {
CARP_UNLOCK(cif);
return (SC2IFP(vh));
}
CARP_UNLOCK(cif);
return (NULL);
}
static void
carp_master_down(void *v)
{
struct carp_softc *sc = v;
CARP_SCLOCK(sc);
carp_master_down_locked(sc);
CARP_SCUNLOCK(sc);
}
static void
carp_master_down_locked(struct carp_softc *sc)
{
if (sc->sc_carpdev)
CARP_SCLOCK_ASSERT(sc);
switch (sc->sc_state) {
case INIT:
printf("%s: master_down event in INIT state\n",
SC2IFP(sc)->if_xname);
break;
case MASTER:
break;
case BACKUP:
carp_set_state(sc, MASTER);
carp_send_ad_locked(sc);
carp_send_arp(sc);
#ifdef INET6
carp_send_na(sc);
#endif /* INET6 */
carp_setrun(sc, 0);
carp_setroute(sc, RTM_ADD);
break;
}
}
/*
* When in backup state, af indicates whether to reset the master down timer
* for v4 or v6. If it's set to zero, reset the ones which are already pending.
*/
static void
carp_setrun(struct carp_softc *sc, sa_family_t af)
{
struct timeval tv;
if (sc->sc_carpdev == NULL) {
SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
carp_set_state(sc, INIT);
return;
} else
CARP_SCLOCK_ASSERT(sc);
if (SC2IFP(sc)->if_flags & IFF_UP &&
sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6))
SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
else {
SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
carp_setroute(sc, RTM_DELETE);
return;
}
switch (sc->sc_state) {
case INIT:
if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) {
carp_send_ad_locked(sc);
carp_send_arp(sc);
#ifdef INET6
carp_send_na(sc);
#endif /* INET6 */
CARP_DEBUG("%s: INIT -> MASTER (preempting)\n",
SC2IFP(sc)->if_xname);
carp_set_state(sc, MASTER);
carp_setroute(sc, RTM_ADD);
} else {
CARP_DEBUG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname);
carp_set_state(sc, BACKUP);
carp_setroute(sc, RTM_DELETE);
carp_setrun(sc, 0);
}
break;
case BACKUP:
callout_stop(&sc->sc_ad_tmo);
tv.tv_sec = 3 * sc->sc_advbase;
tv.tv_usec = sc->sc_advskew * 1000000 / 256;
switch (af) {
#ifdef INET
case AF_INET:
callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
carp_master_down, sc);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
carp_master_down, sc);
break;
#endif /* INET6 */
default:
if (sc->sc_naddrs)
callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
carp_master_down, sc);
if (sc->sc_naddrs6)
callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
carp_master_down, sc);
break;
}
break;
case MASTER:
tv.tv_sec = sc->sc_advbase;
tv.tv_usec = sc->sc_advskew * 1000000 / 256;
callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
carp_send_ad, sc);
break;
}
}
static void
carp_multicast_cleanup(struct carp_softc *sc)
{
struct ip_moptions *imo = &sc->sc_imo;
u_int16_t n = imo->imo_num_memberships;
/* Clean up our own multicast memberships */
while (n-- > 0) {
if (imo->imo_membership[n] != NULL) {
in_delmulti(imo->imo_membership[n]);
imo->imo_membership[n] = NULL;
}
}
KASSERT(imo->imo_mfilters == NULL,
("%s: imo_mfilters != NULL", __func__));
imo->imo_num_memberships = 0;
imo->imo_multicast_ifp = NULL;
}
#ifdef INET6
static void
carp_multicast6_cleanup(struct carp_softc *sc)
{
struct ip6_moptions *im6o = &sc->sc_im6o;
while (!LIST_EMPTY(&im6o->im6o_memberships)) {
struct in6_multi_mship *imm =
LIST_FIRST(&im6o->im6o_memberships);
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
im6o->im6o_multicast_ifp = NULL;
}
#endif
static int
carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
{
+ INIT_VNET_INET(curvnet);
struct ifnet *ifp;
struct carp_if *cif;
struct in_ifaddr *ia, *ia_if;
struct ip_moptions *imo = &sc->sc_imo;
struct in_addr addr;
u_long iaddr = htonl(sin->sin_addr.s_addr);
int own, error;
if (sin->sin_addr.s_addr == 0) {
if (!(SC2IFP(sc)->if_flags & IFF_UP))
carp_set_state(sc, INIT);
if (sc->sc_naddrs)
SC2IFP(sc)->if_flags |= IFF_UP;
if (sc->sc_carpdev)
CARP_SCLOCK(sc);
carp_setrun(sc, 0);
if (sc->sc_carpdev)
CARP_SCUNLOCK(sc);
return (0);
}
/* we have to do it by hand to make sure we won't match ourselves */
ia_if = NULL; own = 0;
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
/* and, yeah, we need a multicast-capable iface too */
if (ia->ia_ifp != SC2IFP(sc) &&
(ia->ia_ifp->if_flags & IFF_MULTICAST) &&
(iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
if (!ia_if)
ia_if = ia;
if (sin->sin_addr.s_addr ==
ia->ia_addr.sin_addr.s_addr)
own++;
}
}
if (!ia_if)
return (EADDRNOTAVAIL);
ia = ia_if;
ifp = ia->ia_ifp;
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
(imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp))
return (EADDRNOTAVAIL);
if (imo->imo_num_memberships == 0) {
addr.s_addr = htonl(INADDR_CARP_GROUP);
if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == NULL)
return (ENOBUFS);
imo->imo_num_memberships++;
imo->imo_multicast_ifp = ifp;
imo->imo_multicast_ttl = CARP_DFLTTL;
imo->imo_multicast_loop = 0;
}
if (!ifp->if_carp) {
MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP,
M_WAITOK|M_ZERO);
if (!cif) {
error = ENOBUFS;
goto cleanup;
}
if ((error = ifpromisc(ifp, 1))) {
FREE(cif, M_CARP);
goto cleanup;
}
CARP_LOCK_INIT(cif);
CARP_LOCK(cif);
cif->vhif_ifp = ifp;
TAILQ_INIT(&cif->vhif_vrs);
ifp->if_carp = cif;
} else {
struct carp_softc *vr;
cif = (struct carp_if *)ifp->if_carp;
CARP_LOCK(cif);
TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
CARP_UNLOCK(cif);
error = EEXIST;
goto cleanup;
}
}
sc->sc_ia = ia;
sc->sc_carpdev = ifp;
{ /* XXX prevent endless loop if already in queue */
struct carp_softc *vr, *after = NULL;
int myself = 0;
cif = (struct carp_if *)ifp->if_carp;
/* XXX: cif should not change, right? So we still hold the lock */
CARP_LOCK_ASSERT(cif);
TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
if (vr == sc)
myself = 1;
if (vr->sc_vhid < sc->sc_vhid)
after = vr;
}
if (!myself) {
/* We're trying to keep things in order */
if (after == NULL) {
TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
} else {
TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
}
cif->vhif_nvrs++;
}
}
sc->sc_naddrs++;
SC2IFP(sc)->if_flags |= IFF_UP;
if (own)
sc->sc_advskew = 0;
carp_sc_state_locked(sc);
carp_setrun(sc, 0);
CARP_UNLOCK(cif);
return (0);
cleanup:
in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
return (error);
}
static int
carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
{
int error = 0;
if (!--sc->sc_naddrs) {
struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
struct ip_moptions *imo = &sc->sc_imo;
CARP_LOCK(cif);
callout_stop(&sc->sc_ad_tmo);
SC2IFP(sc)->if_flags &= ~IFF_UP;
SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
sc->sc_vhid = -1;
in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
imo->imo_multicast_ifp = NULL;
TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
if (!--cif->vhif_nvrs) {
sc->sc_carpdev->if_carp = NULL;
CARP_LOCK_DESTROY(cif);
FREE(cif, M_IFADDR);
} else {
CARP_UNLOCK(cif);
}
}
return (error);
}
#ifdef INET6
static int
carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp;
struct carp_if *cif;
struct in6_ifaddr *ia, *ia_if;
struct ip6_moptions *im6o = &sc->sc_im6o;
struct in6_multi_mship *imm;
struct in6_addr in6;
int own, error;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
if (!(SC2IFP(sc)->if_flags & IFF_UP))
carp_set_state(sc, INIT);
if (sc->sc_naddrs6)
SC2IFP(sc)->if_flags |= IFF_UP;
if (sc->sc_carpdev)
CARP_SCLOCK(sc);
carp_setrun(sc, 0);
if (sc->sc_carpdev)
CARP_SCUNLOCK(sc);
return (0);
}
/* we have to do it by hand to make sure we won't match ourselves */
ia_if = NULL; own = 0;
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
int i;
for (i = 0; i < 4; i++) {
if ((sin6->sin6_addr.s6_addr32[i] &
ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
(ia->ia_addr.sin6_addr.s6_addr32[i] &
ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
break;
}
/* and, yeah, we need a multicast-capable iface too */
if (ia->ia_ifp != SC2IFP(sc) &&
(ia->ia_ifp->if_flags & IFF_MULTICAST) &&
(i == 4)) {
if (!ia_if)
ia_if = ia;
if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
&ia->ia_addr.sin6_addr))
own++;
}
}
if (!ia_if)
return (EADDRNOTAVAIL);
ia = ia_if;
ifp = ia->ia_ifp;
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
(im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp))
return (EADDRNOTAVAIL);
if (!sc->sc_naddrs6) {
im6o->im6o_multicast_ifp = ifp;
/* join CARP multicast address */
bzero(&in6, sizeof(in6));
in6.s6_addr16[0] = htons(0xff02);
in6.s6_addr8[15] = 0x12;
if (in6_setscope(&in6, ifp, NULL) != 0)
goto cleanup;
if ((imm = in6_joingroup(ifp, &in6, &error, 0)) == NULL)
goto cleanup;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
/* join solicited multicast address */
bzero(&in6, sizeof(in6));
in6.s6_addr16[0] = htons(0xff02);
in6.s6_addr32[1] = 0;
in6.s6_addr32[2] = htonl(1);
in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
in6.s6_addr8[12] = 0xff;
if (in6_setscope(&in6, ifp, NULL) != 0)
goto cleanup;
if ((imm = in6_joingroup(ifp, &in6, &error, 0)) == NULL)
goto cleanup;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
}
if (!ifp->if_carp) {
MALLOC(cif, struct carp_if *, sizeof(*cif), M_CARP,
M_WAITOK|M_ZERO);
if (!cif) {
error = ENOBUFS;
goto cleanup;
}
if ((error = ifpromisc(ifp, 1))) {
FREE(cif, M_CARP);
goto cleanup;
}
CARP_LOCK_INIT(cif);
CARP_LOCK(cif);
cif->vhif_ifp = ifp;
TAILQ_INIT(&cif->vhif_vrs);
ifp->if_carp = cif;
} else {
struct carp_softc *vr;
cif = (struct carp_if *)ifp->if_carp;
CARP_LOCK(cif);
TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
CARP_UNLOCK(cif);
error = EINVAL;
goto cleanup;
}
}
sc->sc_ia6 = ia;
sc->sc_carpdev = ifp;
{ /* XXX prevent endless loop if already in queue */
struct carp_softc *vr, *after = NULL;
int myself = 0;
cif = (struct carp_if *)ifp->if_carp;
CARP_LOCK_ASSERT(cif);
TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
if (vr == sc)
myself = 1;
if (vr->sc_vhid < sc->sc_vhid)
after = vr;
}
if (!myself) {
/* We're trying to keep things in order */
if (after == NULL) {
TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
} else {
TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
}
cif->vhif_nvrs++;
}
}
sc->sc_naddrs6++;
SC2IFP(sc)->if_flags |= IFF_UP;
if (own)
sc->sc_advskew = 0;
carp_sc_state_locked(sc);
carp_setrun(sc, 0);
CARP_UNLOCK(cif);
return (0);
cleanup:
/* clean up multicast memberships */
if (!sc->sc_naddrs6) {
while (!LIST_EMPTY(&im6o->im6o_memberships)) {
imm = LIST_FIRST(&im6o->im6o_memberships);
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
}
return (error);
}
static int
carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
{
int error = 0;
if (!--sc->sc_naddrs6) {
struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
struct ip6_moptions *im6o = &sc->sc_im6o;
CARP_LOCK(cif);
callout_stop(&sc->sc_ad_tmo);
SC2IFP(sc)->if_flags &= ~IFF_UP;
SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
sc->sc_vhid = -1;
while (!LIST_EMPTY(&im6o->im6o_memberships)) {
struct in6_multi_mship *imm =
LIST_FIRST(&im6o->im6o_memberships);
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
im6o->im6o_multicast_ifp = NULL;
TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
if (!--cif->vhif_nvrs) {
CARP_LOCK_DESTROY(cif);
sc->sc_carpdev->if_carp = NULL;
FREE(cif, M_IFADDR);
} else
CARP_UNLOCK(cif);
}
return (error);
}
#endif /* INET6 */
static int
carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
{
struct carp_softc *sc = ifp->if_softc, *vr;
struct carpreq carpr;
struct ifaddr *ifa;
struct ifreq *ifr;
struct ifaliasreq *ifra;
int locked = 0, error = 0;
ifa = (struct ifaddr *)addr;
ifra = (struct ifaliasreq *)addr;
ifr = (struct ifreq *)addr;
switch (cmd) {
case SIOCSIFADDR:
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
SC2IFP(sc)->if_flags |= IFF_UP;
bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
sizeof(struct sockaddr));
error = carp_set_addr(sc, satosin(ifa->ifa_addr));
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
SC2IFP(sc)->if_flags |= IFF_UP;
error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
break;
#endif /* INET6 */
default:
error = EAFNOSUPPORT;
break;
}
break;
case SIOCAIFADDR:
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
SC2IFP(sc)->if_flags |= IFF_UP;
bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
sizeof(struct sockaddr));
error = carp_set_addr(sc, satosin(&ifra->ifra_addr));
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
SC2IFP(sc)->if_flags |= IFF_UP;
error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr));
break;
#endif /* INET6 */
default:
error = EAFNOSUPPORT;
break;
}
break;
case SIOCDIFADDR:
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
error = carp_del_addr(sc, satosin(&ifra->ifra_addr));
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr));
break;
#endif /* INET6 */
default:
error = EAFNOSUPPORT;
break;
}
break;
case SIOCSIFFLAGS:
if (sc->sc_carpdev) {
locked = 1;
CARP_SCLOCK(sc);
}
if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) {
callout_stop(&sc->sc_ad_tmo);
callout_stop(&sc->sc_md_tmo);
callout_stop(&sc->sc_md6_tmo);
if (sc->sc_state == MASTER)
carp_send_ad_locked(sc);
carp_set_state(sc, INIT);
carp_setrun(sc, 0);
} else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) {
SC2IFP(sc)->if_flags |= IFF_UP;
carp_setrun(sc, 0);
}
break;
case SIOCSVH:
error = priv_check(curthread, PRIV_NETINET_CARP);
if (error)
break;
if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
break;
error = 1;
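/*
 * Note (added for clarity): from here on, error is used as a
 * counter. It starts at 1 and is decremented when a vhid or an
 * advbase/advskew setting is accepted; if it is still positive
 * afterwards, no setting matched and EINVAL is returned below.
 */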
if (sc->sc_carpdev) {
locked = 1;
CARP_SCLOCK(sc);
}
if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
switch (carpr.carpr_state) {
case BACKUP:
callout_stop(&sc->sc_ad_tmo);
carp_set_state(sc, BACKUP);
carp_setrun(sc, 0);
carp_setroute(sc, RTM_DELETE);
break;
case MASTER:
carp_master_down_locked(sc);
break;
default:
break;
}
}
if (carpr.carpr_vhid > 0) {
if (carpr.carpr_vhid > 255) {
error = EINVAL;
break;
}
if (sc->sc_carpdev) {
struct carp_if *cif;
cif = (struct carp_if *)sc->sc_carpdev->if_carp;
TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
if (vr != sc &&
vr->sc_vhid == carpr.carpr_vhid) {
error = EEXIST;
break;
}
if (error == EEXIST)
break;
}
sc->sc_vhid = carpr.carpr_vhid;
IF_LLADDR(sc->sc_ifp)[0] = 0;
IF_LLADDR(sc->sc_ifp)[1] = 0;
IF_LLADDR(sc->sc_ifp)[2] = 0x5e;
IF_LLADDR(sc->sc_ifp)[3] = 0;
IF_LLADDR(sc->sc_ifp)[4] = 1;
IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid;
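/* The bytes above form the virtual router MAC 00:00:5e:00:01:<vhid>. */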
error--;
}
if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) {
if (carpr.carpr_advskew >= 255) {
error = EINVAL;
break;
}
if (carpr.carpr_advbase > 255) {
error = EINVAL;
break;
}
sc->sc_advbase = carpr.carpr_advbase;
sc->sc_advskew = carpr.carpr_advskew;
error--;
}
bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
if (error > 0)
error = EINVAL;
else {
error = 0;
carp_setrun(sc, 0);
}
break;
case SIOCGVH:
/* XXX: lockless read */
bzero(&carpr, sizeof(carpr));
carpr.carpr_state = sc->sc_state;
carpr.carpr_vhid = sc->sc_vhid;
carpr.carpr_advbase = sc->sc_advbase;
carpr.carpr_advskew = sc->sc_advskew;
error = priv_check(curthread, PRIV_NETINET_CARP);
if (error == 0)
bcopy(sc->sc_key, carpr.carpr_key,
sizeof(carpr.carpr_key));
error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
break;
default:
error = EINVAL;
}
if (locked)
CARP_SCUNLOCK(sc);
carp_hmac_prepare(sc);
return (error);
}
/*
* XXX: this is looutput. We should eventually use it from there.
*/
static int
carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
struct rtentry *rt)
{
u_int32_t af;
M_ASSERTPKTHDR(m); /* check if we have the packet header */
if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
m_freem(m);
return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
}
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
/* BPF writes need to be handled specially. */
if (dst->sa_family == AF_UNSPEC) {
bcopy(dst->sa_data, &af, sizeof(af));
dst->sa_family = af;
}
#if 1 /* XXX */
switch (dst->sa_family) {
case AF_INET:
case AF_INET6:
case AF_IPX:
case AF_APPLETALK:
break;
default:
printf("carp_looutput: af=%d unexpected\n", dst->sa_family);
m_freem(m);
return (EAFNOSUPPORT);
}
#endif
return (if_simloop(ifp, m, dst->sa_family, 0));
}
/*
* Start output on carp interface. This function should never be called.
*/
static void
carp_start(struct ifnet *ifp)
{
#ifdef DEBUG
printf("%s: start called\n", ifp->if_xname);
#endif
}
int
carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
struct rtentry *rt)
{
struct m_tag *mtag;
struct carp_softc *sc;
struct ifnet *carp_ifp;
if (!sa)
return (0);
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
break;
#endif /* INET6 */
default:
return (0);
}
mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
if (mtag == NULL)
return (0);
bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *));
sc = carp_ifp->if_softc;
/* Set the source MAC address to Virtual Router MAC Address */
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_L2VLAN: {
struct ether_header *eh;
eh = mtod(m, struct ether_header *);
eh->ether_shost[0] = 0;
eh->ether_shost[1] = 0;
eh->ether_shost[2] = 0x5e;
eh->ether_shost[3] = 0;
eh->ether_shost[4] = 1;
eh->ether_shost[5] = sc->sc_vhid;
}
break;
case IFT_FDDI: {
struct fddi_header *fh;
fh = mtod(m, struct fddi_header *);
fh->fddi_shost[0] = 0;
fh->fddi_shost[1] = 0;
fh->fddi_shost[2] = 0x5e;
fh->fddi_shost[3] = 0;
fh->fddi_shost[4] = 1;
fh->fddi_shost[5] = sc->sc_vhid;
}
break;
case IFT_ISO88025: {
struct iso88025_header *th;
th = mtod(m, struct iso88025_header *);
th->iso88025_shost[0] = 3;
th->iso88025_shost[1] = 0;
th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
th->iso88025_shost[4] = 0;
th->iso88025_shost[5] = 0;
}
break;
default:
printf("%s: carp is not supported for this interface type\n",
ifp->if_xname);
return (EOPNOTSUPP);
}
return (0);
}
static void
carp_set_state(struct carp_softc *sc, int state)
{
if (sc->sc_carpdev)
CARP_SCLOCK_ASSERT(sc);
if (sc->sc_state == state)
return;
sc->sc_state = state;
switch (state) {
case BACKUP:
SC2IFP(sc)->if_link_state = LINK_STATE_DOWN;
break;
case MASTER:
SC2IFP(sc)->if_link_state = LINK_STATE_UP;
break;
default:
SC2IFP(sc)->if_link_state = LINK_STATE_UNKNOWN;
break;
}
rt_ifmsg(SC2IFP(sc));
}
void
carp_carpdev_state(void *v)
{
struct carp_if *cif = v;
CARP_LOCK(cif);
carp_carpdev_state_locked(cif);
CARP_UNLOCK(cif);
}
static void
carp_carpdev_state_locked(struct carp_if *cif)
{
struct carp_softc *sc;
TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
carp_sc_state_locked(sc);
}
static void
carp_sc_state_locked(struct carp_softc *sc)
{
CARP_SCLOCK_ASSERT(sc);
if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
!(sc->sc_carpdev->if_flags & IFF_UP)) {
sc->sc_flags_backup = SC2IFP(sc)->if_flags;
SC2IFP(sc)->if_flags &= ~IFF_UP;
SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
callout_stop(&sc->sc_ad_tmo);
callout_stop(&sc->sc_md_tmo);
callout_stop(&sc->sc_md6_tmo);
carp_set_state(sc, INIT);
carp_setrun(sc, 0);
if (!sc->sc_suppress) {
carp_suppress_preempt++;
if (carp_suppress_preempt == 1) {
CARP_SCUNLOCK(sc);
carp_send_ad_all();
CARP_SCLOCK(sc);
}
}
sc->sc_suppress = 1;
} else {
SC2IFP(sc)->if_flags |= sc->sc_flags_backup;
carp_set_state(sc, INIT);
carp_setrun(sc, 0);
if (sc->sc_suppress)
carp_suppress_preempt--;
sc->sc_suppress = 0;
}
return;
}
static int
carp_modevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
if (if_detach_event_tag == NULL)
return (ENOMEM);
mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
LIST_INIT(&carpif_list);
if_clone_attach(&carp_cloner);
break;
case MOD_UNLOAD:
EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
if_clone_detach(&carp_cloner);
mtx_destroy(&carp_mtx);
break;
default:
return (EINVAL);
}
return (0);
}
static moduledata_t carp_mod = {
"carp",
carp_modevent,
0
};
DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet/ip_divert.c
===================================================================
--- head/sys/netinet/ip_divert.c (revision 183549)
+++ head/sys/netinet/ip_divert.c (revision 183550)
@@ -1,756 +1,766 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#if !defined(KLD_MODULE)
#include "opt_inet.h"
#include "opt_ipfw.h"
#include "opt_mac.h"
#ifndef INET
#error "IPDIVERT requires INET."
#endif
#ifndef IPFIREWALL
#error "IPDIVERT requires IPFIREWALL"
#endif
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_divert.h>
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <security/mac/mac_framework.h>
/*
* Divert sockets
*/
/*
* Allocate enough space to hold a full IP packet
*/
#define DIVSNDQ (65536 + 100)
#define DIVRCVQ (65536 + 100)
/*
* Divert sockets work in conjunction with ipfw, see the divert(4)
* manpage for features.
* Internally, packets selected by ipfw in ip_input() or ip_output(),
* and never diverted before, are passed to the input queue of the
* divert socket with a given 'divert_port' number (as specified in
* the matching ipfw rule), and they are tagged with a 16 bit cookie
* (representing the rule number of the matching ipfw rule), which
* is passed to the process reading from the socket.
*
* Packets written to the divert socket are again tagged with a cookie
* (usually the same as above) and a destination address.
* If the destination address is INADDR_ANY then the packet is
* treated as outgoing and sent to ip_output(), otherwise it is
* treated as incoming and sent to ip_input().
* In both cases, the packet is tagged with the cookie.
*
* On reinjection, processing in ip_input() and ip_output()
* will be exactly the same as for the original packet, except that
* ipfw processing will start at the rule number after the one
* written in the cookie (so, tagging a packet with a cookie of 0
* will cause it to be effectively considered as a standard packet).
*/
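/*
 * A minimal userland sketch of the cycle described above. This is
 * illustrative only and compiled out; it assumes an ipfw rule such as
 * "ipfw add divert 8668 ip from any to any" is installed, and the
 * port number 8668 is arbitrary.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <err.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in sin;
	socklen_t sl;
	char buf[65535];
	ssize_t n;
	int s;

	if ((s = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT)) == -1)
		err(1, "socket");
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8668);	/* divert port from the ipfw rule */
	sin.sin_addr.s_addr = INADDR_ANY;
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
		err(1, "bind");
	for (;;) {
		sl = sizeof(sin);
		/* sin comes back carrying the rule cookie in sin_port. */
		n = recvfrom(s, buf, sizeof(buf), 0,
		    (struct sockaddr *)&sin, &sl);
		if (n == -1)
			err(1, "recvfrom");
		/* ...inspect or modify the packet here... */
		/* Reinject with the same sockaddr to resume processing. */
		if (sendto(s, buf, n, 0,
		    (struct sockaddr *)&sin, sizeof(sin)) == -1)
			err(1, "sendto");
	}
}
#endif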
/* Internal variables. */
static struct inpcbhead divcb;
static struct inpcbinfo divcbinfo;
static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */
static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */
/*
* Initialize divert connection block queue.
*/
static void
div_zone_change(void *tag)
{
uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
}
static int
div_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp = mem;
INP_LOCK_INIT(inp, "inp", "divinp");
return (0);
}
static void
div_inpcb_fini(void *mem, int size)
{
struct inpcb *inp = mem;
INP_LOCK_DESTROY(inp);
}
void
div_init(void)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_LOCK_INIT(&V_divcbinfo, "div");
LIST_INIT(&V_divcb);
V_divcbinfo.ipi_listhead = &V_divcb;
/*
* XXX We don't use the hash list for divert IP, but it's easier
* to allocate a one entry hash list than it is to check all
* over the place for hashbase == NULL.
*/
V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask);
V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB,
&V_divcbinfo.ipi_porthashmask);
V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb),
NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR,
UMA_ZONE_NOFREE);
uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
EVENTHANDLER_REGISTER(maxsockets_change, div_zone_change,
NULL, EVENTHANDLER_PRI_ANY);
}
/*
* IPPROTO_DIVERT is not in the real IP protocol number space; this
* function should never be called. Just in case, drop any packets.
*/
void
div_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET(curvnet);
+
V_ipstat.ips_noproto++;
m_freem(m);
}
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
*
* Setup generic address and protocol structures for div_input routine,
* then pass them along with mbuf chain.
*/
static void
divert_packet(struct mbuf *m, int incoming)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip;
struct inpcb *inp;
struct socket *sa;
u_int16_t nport;
struct sockaddr_in divsrc;
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
if (mtag == NULL) {
printf("%s: no divert tag\n", __func__);
m_freem(m);
return;
}
/* Assure header */
if (m->m_len < sizeof(struct ip) &&
(m = m_pullup(m, sizeof(struct ip))) == 0)
return;
ip = mtod(m, struct ip *);
/* Delayed checksums are currently not compatible with divert. */
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
ip->ip_len = ntohs(ip->ip_len);
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
ip->ip_len = htons(ip->ip_len);
}
/*
* Record receive interface address, if any.
* But only for incoming packets.
*/
bzero(&divsrc, sizeof(divsrc));
divsrc.sin_len = sizeof(divsrc);
divsrc.sin_family = AF_INET;
divsrc.sin_port = divert_cookie(mtag); /* record matching rule */
if (incoming) {
struct ifaddr *ifa;
/* Sanity check */
M_ASSERTPKTHDR(m);
/* Find IP address for receive interface */
TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
divsrc.sin_addr =
((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
break;
}
}
/*
* Record the incoming interface name whenever we have one.
*/
if (m->m_pkthdr.rcvif) {
/*
* Hide the actual interface name in there in the
* sin_zero array. XXX This needs to be moved to a
* different sockaddr type for divert, e.g.
* sockaddr_div with multiple fields like
* sockaddr_dl. Presently we have only 7 bytes
* but that will do for now, as most interface names
* are 4 characters or fewer plus 2 or fewer for the unit number.
* There is probably a faster way of doing this,
* possibly taking it from the sockaddr_dl on the iface.
* This solves the problem of a P2P link and a LAN interface
* having the same address, which can result in the wrong
* interface being assigned to the packet when fed back
* into the divert socket. Theoretically if the daemon saves
* and re-uses the sockaddr_in as suggested in the man pages,
* this iface name will come along for the ride.
* (see div_output for the other half of this.)
*/
strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
sizeof(divsrc.sin_zero));
}
/* Put packet on socket queue, if any */
sa = NULL;
nport = htons((u_int16_t)divert_info(mtag));
INP_INFO_RLOCK(&V_divcbinfo);
LIST_FOREACH(inp, &V_divcb, inp_list) {
/* XXX why does only one socket match? */
if (inp->inp_lport == nport) {
INP_RLOCK(inp);
sa = inp->inp_socket;
SOCKBUF_LOCK(&sa->so_rcv);
if (sbappendaddr_locked(&sa->so_rcv,
(struct sockaddr *)&divsrc, m,
(struct mbuf *)0) == 0) {
SOCKBUF_UNLOCK(&sa->so_rcv);
sa = NULL; /* force mbuf reclaim below */
} else
sorwakeup_locked(sa);
INP_RUNLOCK(inp);
break;
}
}
INP_INFO_RUNLOCK(&V_divcbinfo);
if (sa == NULL) {
m_freem(m);
V_ipstat.ips_noproto++;
V_ipstat.ips_delivered--;
}
}
/*
* Deliver packet back into the IP processing machinery.
*
* If no address specified, or address is 0.0.0.0, send to ip_output();
* otherwise, send to ip_input() and mark as having been received on
* the interface with that address.
*/
static int
div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
struct mbuf *control)
{
+ INIT_VNET_INET(curvnet);
struct m_tag *mtag;
struct divert_tag *dt;
int error = 0;
struct mbuf *options;
/*
* The mbuf may not have come from userland, but we pretend
* that it has.
*/
m->m_pkthdr.rcvif = NULL;
m->m_nextpkt = NULL;
if (control)
m_freem(control); /* XXX */
if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) {
mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag),
M_NOWAIT | M_ZERO);
if (mtag == NULL) {
error = ENOBUFS;
goto cantsend;
}
dt = (struct divert_tag *)(mtag+1);
m_tag_prepend(m, mtag);
} else
dt = (struct divert_tag *)(mtag+1);
/* Loopback avoidance and state recovery */
if (sin) {
int i;
dt->cookie = sin->sin_port;
/*
* Find receive interface with the given name, stuffed
* (if it exists) in the sin_zero[] field.
* The name is user supplied data so don't trust its size
* or that it is zero terminated.
*/
for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
;
if (i > 0 && i < sizeof(sin->sin_zero))
m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
}
/* Reinject packet into the system as incoming or outgoing */
if (!sin || sin->sin_addr.s_addr == 0) {
struct ip *const ip = mtod(m, struct ip *);
struct inpcb *inp;
dt->info |= IP_FW_DIVERT_OUTPUT_FLAG;
INP_INFO_WLOCK(&V_divcbinfo);
inp = sotoinpcb(so);
INP_RLOCK(inp);
/*
* Don't allow both user specified and setsockopt options,
* and don't allow packet length sizes that will crash
*/
if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
error = EINVAL;
INP_RUNLOCK(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
m_freem(m);
} else {
/* Convert fields to host order for ip_output() */
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
/* Send packet to output processing */
V_ipstat.ips_rawout++; /* XXX */
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
/*
* Get ready to inject the packet into ip_output().
* Just in case socket options were specified on the
* divert socket, we duplicate them. This is done
* to avoid having to hold the PCB locks over the call
* to ip_output(), as doing this results in a number of
* lock ordering complexities.
*
* Note that we set the multicast options argument for
* ip_output() to NULL since it should be invariant that
* they are not present.
*/
KASSERT(inp->inp_moptions == NULL,
("multicast options set on a divert socket"));
options = NULL;
/*
* XXXCSJP: It is unclear to me whether or not it makes
* sense for divert sockets to have options. However,
* for now we will duplicate them with the INP locks
* held so we can use them in ip_output() without
* requiring a reference to the pcb.
*/
if (inp->inp_options != NULL) {
options = m_dup(inp->inp_options, M_DONTWAIT);
if (options == NULL)
error = ENOBUFS;
}
INP_RUNLOCK(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
if (error == ENOBUFS) {
m_freem(m);
return (error);
}
error = ip_output(m, options, NULL,
((so->so_options & SO_DONTROUTE) ?
IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST |
IP_RAWOUTPUT, NULL, NULL);
if (options != NULL)
m_freem(options);
}
} else {
dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG;
if (m->m_pkthdr.rcvif == NULL) {
/*
* No luck with the name, check by IP address.
* Clear the port and the ifname to make sure
* there are no distractions for ifa_ifwithaddr.
*/
struct ifaddr *ifa;
bzero(sin->sin_zero, sizeof(sin->sin_zero));
sin->sin_port = 0;
ifa = ifa_ifwithaddr((struct sockaddr *) sin);
if (ifa == NULL) {
error = EADDRNOTAVAIL;
goto cantsend;
}
m->m_pkthdr.rcvif = ifa->ifa_ifp;
}
#ifdef MAC
SOCK_LOCK(so);
mac_socket_create_mbuf(so, m);
SOCK_UNLOCK(so);
#endif
/* Send packet to input processing via netisr */
netisr_queue(NETISR_IP, m);
}
return error;
cantsend:
m_freem(m);
return error;
}
static int
div_attach(struct socket *so, int proto, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("div_attach: inp != NULL"));
if (td != NULL) {
error = priv_check(td, PRIV_NETINET_DIVERT);
if (error)
return (error);
}
error = soreserve(so, div_sendspace, div_recvspace);
if (error)
return error;
INP_INFO_WLOCK(&V_divcbinfo);
error = in_pcballoc(so, &V_divcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_divcbinfo);
return error;
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_divcbinfo);
inp->inp_ip_p = proto;
inp->inp_vflag |= INP_IPV4;
inp->inp_flags |= INP_HDRINCL;
INP_WUNLOCK(inp);
return 0;
}
static void
div_detach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("div_detach: inp == NULL"));
INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
}
static int
div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("div_bind: inp == NULL"));
/* in_pcbbind assumes that nam is a sockaddr_in
* and requires a valid address. Since divert
* sockets don't have one, we need to make sure the address is
* filled in properly.
* XXX -- divert should not be abusing in_pcbbind
* and should probably have its own family.
*/
if (nam->sa_family != AF_INET)
return EAFNOSUPPORT;
((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
error = in_pcbbind(inp, nam, td->td_ucred);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
return error;
}
static int
div_shutdown(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
INP_WLOCK(inp);
socantsendmore(so);
INP_WUNLOCK(inp);
return 0;
}
static int
div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
+
/* Packet must have a header (but that's about it) */
if (m->m_len < sizeof (struct ip) &&
(m = m_pullup(m, sizeof (struct ip))) == 0) {
V_ipstat.ips_toosmall++;
m_freem(m);
return EINVAL;
}
/* Send packet */
return div_output(so, m, (struct sockaddr_in *)nam, control);
}
void
div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
struct in_addr faddr;
faddr = ((struct sockaddr_in *)sa)->sin_addr;
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
if (PRC_IS_REDIRECT(cmd))
return;
}
static int
div_pcblist(SYSCTL_HANDLER_ARGS)
{
int error, i, n;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
/*
* The process of preparing the TCB list is too time-consuming and
* resource-intensive to repeat twice on every request.
*/
if (req->oldptr == 0) {
n = V_divcbinfo.ipi_count;
req->oldidx = 2 * (sizeof xig)
+ (n + n/8) * sizeof(struct xinpcb);
return 0;
}
if (req->newptr != 0)
return EPERM;
/*
* OK, now we're committed to doing something.
*/
INP_INFO_RLOCK(&V_divcbinfo);
gencnt = V_divcbinfo.ipi_gencnt;
n = V_divcbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_divcbinfo);
error = sysctl_wire_old_buffer(req,
2 * sizeof(xig) + n*sizeof(struct xinpcb));
if (error != 0)
return (error);
xig.xig_len = sizeof xig;
xig.xig_count = n;
xig.xig_gen = gencnt;
xig.xig_sogen = so_gencnt;
error = SYSCTL_OUT(req, &xig, sizeof xig);
if (error)
return error;
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
if (inp_list == 0)
return ENOMEM;
INP_INFO_RLOCK(&V_divcbinfo);
for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n;
inp = LIST_NEXT(inp, inp_list)) {
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt &&
cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
inp_list[i++] = inp;
INP_RUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_divcbinfo);
n = i;
error = 0;
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt) {
struct xinpcb xi;
bzero(&xi, sizeof(xi));
xi.xi_len = sizeof xi;
/* XXX should avoid extra copy */
bcopy(inp, &xi.xi_inp, sizeof *inp);
if (inp->inp_socket)
sotoxsocket(inp->inp_socket, &xi.xi_socket);
INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
} else
INP_RUNLOCK(inp);
}
if (!error) {
/*
* Give the user an updated idea of our state.
* If the generation differs from what we told
* her before, she knows that something happened
* while we were processing this request, and it
* might be necessary to retry.
*/
INP_INFO_RLOCK(&V_divcbinfo);
xig.xig_gen = V_divcbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_divcbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_divcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
free(inp_list, M_TEMP);
return error;
}
#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT");
SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0,
div_pcblist, "S,xinpcb", "List of active divert sockets");
#endif
struct pr_usrreqs div_usrreqs = {
.pru_attach = div_attach,
.pru_bind = div_bind,
.pru_control = in_control,
.pru_detach = div_detach,
.pru_peeraddr = in_getpeeraddr,
.pru_send = div_send,
.pru_shutdown = div_shutdown,
.pru_sockaddr = in_getsockaddr,
.pru_sosetlabel = in_pcbsosetlabel
};
struct protosw div_protosw = {
.pr_type = SOCK_RAW,
.pr_protocol = IPPROTO_DIVERT,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = div_input,
.pr_ctlinput = div_ctlinput,
.pr_ctloutput = ip_ctloutput,
.pr_init = div_init,
.pr_usrreqs = &div_usrreqs
};
static int
div_modevent(module_t mod, int type, void *unused)
{
int err = 0;
int n;
switch (type) {
case MOD_LOAD:
/*
* Protocol will be initialized by pf_proto_register().
* We don't have to register ip_protox because we are not
* a true IP protocol that goes over the wire.
*/
err = pf_proto_register(PF_INET, &div_protosw);
ip_divert_ptr = divert_packet;
break;
case MOD_QUIESCE:
/*
* IPDIVERT may normally not be unloaded because of the
* potential race conditions. Tell kldunload we can't be
* unloaded unless the unload is forced.
*/
err = EPERM;
break;
case MOD_UNLOAD:
/*
* Forced unload.
*
* Module ipdivert can only be unloaded if no sockets are
* connected. Maybe this can be changed later to forcefully
* disconnect any open sockets.
*
* XXXRW: Note that there is a slight race here, as a new
* socket open request could be spinning on the lock and then
* we destroy the lock.
*/
INP_INFO_WLOCK(&V_divcbinfo);
n = V_divcbinfo.ipi_count;
if (n != 0) {
err = EBUSY;
INP_INFO_WUNLOCK(&V_divcbinfo);
break;
}
ip_divert_ptr = NULL;
err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
INP_INFO_WUNLOCK(&V_divcbinfo);
INP_INFO_LOCK_DESTROY(&V_divcbinfo);
uma_zdestroy(V_divcbinfo.ipi_zone);
break;
default:
err = EOPNOTSUPP;
break;
}
return err;
}
static moduledata_t ipdivertmod = {
"ipdivert",
div_modevent,
0
};
DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2);
MODULE_VERSION(ipdivert, 1);
Index: head/sys/netinet/ip_fastfwd.c
===================================================================
--- head/sys/netinet/ip_fastfwd.c (revision 183549)
+++ head/sys/netinet/ip_fastfwd.c (revision 183550)
@@ -1,614 +1,616 @@
/*-
* Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* ip_fastforward gets its speed from processing the forwarded packet to
* completion (if_output on the other side) without any queues or netisr's.
* The receiving interface DMAs the packet into memory, the upper half of
* the driver calls ip_fastforward, we do our routing table lookup and directly
* send it off to the outgoing interface, which DMAs the packet to the
* network card. The only part of the packet we touch with the CPU is the
* IP header (unless there are complex firewall rules touching other parts
* of the packet, but that is up to you). We are essentially limited by bus
* bandwidth and how fast the network card/driver can set up receives and
* transmits.
*
* We handle basic errors, IP header errors, checksum errors,
* destination unreachable, fragmentation and fragmentation needed and
* report them via ICMP to the sender.
*
* Otherwise, if the packet is not pure IPv4 unicast forwarding material,
* we fall back to the normal ip_input processing path. We should only be
* called from interfaces connected to the outside world.
*
* Firewalling is fully supported, including divert, ipfw fwd, and
* ipfilter/ipnat address rewriting.
*
* IPSEC is not supported if this host is a tunnel broker. IPSEC is
* supported for connections to/from local host.
*
* We try to do the least expensive (in CPU ops) checks and operations
* first to catch junk with as little overhead as possible.
*
* We take full advantage of hardware support for IP checksum and
* fragmentation offloading.
*
* We don't do ICMP redirect in the fast forwarding path. I have had my own
* cases where two core routers running the Zebra routing suite would send
* millions of ICMP redirects to connected hosts if the destination router
* was not the default gateway. In one case it filled the routing table of
* a host with approximately 300,000 cloned redirect entries until it ran
* out of kernel memory. However, the networking code proved very robust;
* it didn't crash or fail in other ways.
*/
/*
* Many thanks to Matt Thomas of NetBSD for the basic structure of
* ip_flow.c, which is followed here.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipfw.h"
#include "opt_ipstealth.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/pfil.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_options.h>
#include <machine/in_cksum.h>
static int ipfastforward_active = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW,
- &ipfastforward_active, 0, "Enable fast IP forwarding");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fastforwarding,
+ CTLFLAG_RW, ipfastforward_active, 0, "Enable fast IP forwarding");
static struct sockaddr_in *
ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct sockaddr_in *dst;
struct rtentry *rt;
/*
* Find route to destination.
*/
bzero(ro, sizeof(*ro));
dst = (struct sockaddr_in *)&ro->ro_dst;
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr.s_addr = dest.s_addr;
in_rtalloc_ign(ro, RTF_CLONING, M_GETFIB(m));
/*
* Route there and interface still up?
*/
rt = ro->ro_rt;
if (rt && (rt->rt_flags & RTF_UP) &&
(rt->rt_ifp->if_flags & IFF_UP) &&
(rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
if (rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)rt->rt_gateway;
} else {
V_ipstat.ips_noroute++;
V_ipstat.ips_cantforward++;
if (rt)
RTFREE(rt);
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
return NULL;
}
return dst;
}
/*
* Try to forward a packet based on the destination address.
* This is a fast path optimized for the plain forwarding case.
* If the packet is handled (and consumed) here then we return NULL;
* otherwise the mbuf is returned and the packet should be delivered
* to ip_input for full processing.
*/
struct mbuf *
ip_fastforward(struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip;
struct mbuf *m0 = NULL;
struct route ro;
struct sockaddr_in *dst = NULL;
struct ifnet *ifp;
struct in_addr odest, dest;
u_short sum, ip_len;
int error = 0;
int hlen, mtu;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
#endif
/*
* Are we active and forwarding packets?
*/
if (!V_ipfastforward_active || !V_ipforwarding)
return m;
M_ASSERTVALID(m);
M_ASSERTPKTHDR(m);
ro.ro_rt = NULL;
/*
* Step 1: check for packet drop conditions (and sanity checks)
*/
/*
* Is entire packet big enough?
*/
if (m->m_pkthdr.len < sizeof(struct ip)) {
V_ipstat.ips_tooshort++;
goto drop;
}
/*
* Is first mbuf large enough for ip header and is header present?
*/
if (m->m_len < sizeof (struct ip) &&
(m = m_pullup(m, sizeof (struct ip))) == NULL) {
V_ipstat.ips_toosmall++;
return NULL; /* mbuf already free'd */
}
ip = mtod(m, struct ip *);
/*
* Is it IPv4?
*/
if (ip->ip_v != IPVERSION) {
V_ipstat.ips_badvers++;
goto drop;
}
/*
* Is IP header length correct and is it in first mbuf?
*/
hlen = ip->ip_hl << 2;
if (hlen < sizeof(struct ip)) { /* minimum header length */
V_ipstat.ips_badlen++;
goto drop;
}
if (hlen > m->m_len) {
if ((m = m_pullup(m, hlen)) == NULL) {
V_ipstat.ips_badhlen++;
return NULL; /* mbuf already free'd */
}
ip = mtod(m, struct ip *);
}
/*
* Checksum correct?
*/
if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)
sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
else {
if (hlen == sizeof(struct ip))
sum = in_cksum_hdr(ip);
else
sum = in_cksum(m, hlen);
}
if (sum) {
V_ipstat.ips_badsum++;
goto drop;
}
/*
* Remember that we have checked the IP header and found it valid.
*/
m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
ip_len = ntohs(ip->ip_len);
/*
* Is IP length longer than packet we have got?
*/
if (m->m_pkthdr.len < ip_len) {
V_ipstat.ips_tooshort++;
goto drop;
}
/*
* Is packet longer than IP header tells us? If yes, truncate packet.
*/
if (m->m_pkthdr.len > ip_len) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = ip_len;
m->m_pkthdr.len = ip_len;
} else
m_adj(m, ip_len - m->m_pkthdr.len);
}
/*
* Is packet from or to 127/8?
*/
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
V_ipstat.ips_badaddr++;
goto drop;
}
#ifdef ALTQ
/*
* Is packet dropped by traffic conditioner?
*/
if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
goto drop;
#endif
/*
* Step 2: fallback conditions to normal ip_input path processing
*/
/*
* Only IP packets without options
*/
if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
if (ip_doopts == 1)
return m;
else if (ip_doopts == 2) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
0, 0);
return NULL; /* mbuf already free'd */
}
/* else ignore IP options and continue */
}
/*
* Only unicast IP, not from loopback, no L2 or IP broadcast,
* no multicast, no INADDR_ANY
*
* XXX: Probably some of these checks could be direct drop
* conditions. However it is not clear whether there are some
* hacks or obscure behaviours which make it necessary to
* let ip_input handle it. We play it safe here and let ip_input
* deal with it until it is proven that we can directly drop it.
*/
if ((m->m_flags & (M_BCAST|M_MCAST)) ||
(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST ||
ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST ||
IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
ip->ip_src.s_addr == INADDR_ANY ||
ip->ip_dst.s_addr == INADDR_ANY )
return m;
/*
* Is it for a local address on this host?
*/
if (in_localip(ip->ip_dst))
return m;
V_ipstat.ips_total++;
/*
* Step 3: incoming packet firewall processing
*/
/*
* Convert to host representation
*/
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
/*
* Run through list of ipfilter hooks for input packets
*/
if (!PFIL_HOOKED(&inet_pfil_hook))
goto passin;
if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) ||
m == NULL)
goto drop;
M_ASSERTVALID(m);
M_ASSERTPKTHDR(m);
ip = mtod(m, struct ip *); /* m may have changed by pfil hook */
dest.s_addr = ip->ip_dst.s_addr;
/*
* Destination address changed?
*/
if (odest.s_addr != dest.s_addr) {
/*
* Is it now for a local address on this host?
*/
if (in_localip(dest))
goto forwardlocal;
/*
* Go on with new destination address
*/
}
#ifdef IPFIREWALL_FORWARD
if (m->m_flags & M_FASTFWD_OURS) {
/*
* ipfw changed it for a local address on this host.
*/
goto forwardlocal;
}
#endif /* IPFIREWALL_FORWARD */
passin:
/*
* Step 4: decrement TTL and look up route
*/
/*
* Check TTL
*/
#ifdef IPSTEALTH
if (!V_ipstealth) {
#endif
if (ip->ip_ttl <= IPTTLDEC) {
icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
return NULL; /* mbuf already free'd */
}
/*
* Decrement the TTL and incrementally change the IP header checksum.
* Don't bother doing this with hw checksum offloading, it's faster
* doing it right here.
*/
ip->ip_ttl -= IPTTLDEC;
if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
ip->ip_sum -= ~htons(IPTTLDEC << 8);
else
ip->ip_sum += htons(IPTTLDEC << 8);
#ifdef IPSTEALTH
}
#endif
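/*
* Worked form of the incremental update above (cf. RFC 1141/1624):
* lowering the TTL by IPTTLDEC decreases the 16-bit header word that
* carries it by htons(IPTTLDEC << 8), so the checksum, which is the
* one's complement of the header sum, must grow by the same amount
* with end-around carry. Subtracting ~htons(IPTTLDEC << 8) in the
* overflow branch is the one's complement way of adding that amount
* while folding the carry back in.
*/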
/*
* Find route to destination.
*/
if ((dst = ip_findroute(&ro, dest, m)) == NULL)
return NULL; /* icmp unreach already sent */
ifp = ro.ro_rt->rt_ifp;
/*
* Immediately drop blackholed traffic, and directed broadcasts
* for either the all-ones or all-zero subnet addresses on
* locally attached networks.
*/
if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0)
goto drop;
/*
* Step 5: outgoing firewall packet processing
*/
/*
* Run through list of hooks for output packets.
*/
if (!PFIL_HOOKED(&inet_pfil_hook))
goto passout;
if (pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) {
goto drop;
}
M_ASSERTVALID(m);
M_ASSERTPKTHDR(m);
ip = mtod(m, struct ip *);
dest.s_addr = ip->ip_dst.s_addr;
/*
* Destination address changed?
*/
#ifndef IPFIREWALL_FORWARD
if (odest.s_addr != dest.s_addr) {
#else
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
#endif /* IPFIREWALL_FORWARD */
/*
* Is it now for a local address on this host?
*/
#ifndef IPFIREWALL_FORWARD
if (in_localip(dest)) {
#else
if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
#endif /* IPFIREWALL_FORWARD */
forwardlocal:
/*
* Return packet for processing by ip_input().
* Keep host byte order as expected at ip_input's
* "ours"-label.
*/
m->m_flags |= M_FASTFWD_OURS;
if (ro.ro_rt)
RTFREE(ro.ro_rt);
return m;
}
/*
* Redo route lookup with new destination address
*/
#ifdef IPFIREWALL_FORWARD
if (fwd_tag) {
dest.s_addr = ((struct sockaddr_in *)
(fwd_tag + 1))->sin_addr.s_addr;
m_tag_delete(m, fwd_tag);
}
#endif /* IPFIREWALL_FORWARD */
RTFREE(ro.ro_rt);
if ((dst = ip_findroute(&ro, dest, m)) == NULL)
return NULL; /* icmp unreach already sent */
ifp = ro.ro_rt->rt_ifp;
}
passout:
/*
* Step 6: send off the packet
*/
/*
* Check if the route is dampened (when ARP is unable to resolve)
*/
if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
(ro.ro_rt->rt_rmx.rmx_expire == 0 ||
time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
goto consumed;
}
#ifndef ALTQ
/*
* Check if there is enough space in the interface queue
*/
if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
ifp->if_snd.ifq_maxlen) {
V_ipstat.ips_odropped++;
/* would send source quench here but that is deprecated */
goto drop;
}
#endif
/*
* Check that the media link state of the interface is not down
*/
if (ifp->if_link_state == LINK_STATE_DOWN) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
goto consumed;
}
/*
* Check if packet fits MTU or if hardware will fragment for us
*/
if (ro.ro_rt->rt_rmx.rmx_mtu)
mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
else
mtu = ifp->if_mtu;
if (ip->ip_len <= mtu ||
(ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) {
/*
* Restore packet header fields to original values
*/
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
/*
* Send off the packet via outgoing interface
*/
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro.ro_rt);
} else {
/*
* Handle EMSGSIZE with an ICMP "needfrag" reply, for TCP path MTU discovery
*/
if (ip->ip_off & IP_DF) {
V_ipstat.ips_cantfrag++;
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
0, mtu);
goto consumed;
} else {
/*
* We have to fragment the packet
*/
m->m_pkthdr.csum_flags |= CSUM_IP;
/*
* ip_fragment expects ip_len and ip_off in host byte
* order but returns all packets in network byte order
*/
if (ip_fragment(ip, &m, mtu, ifp->if_hwassist,
(~ifp->if_hwassist & CSUM_DELAY_IP))) {
goto drop;
}
KASSERT(m != NULL, ("null mbuf and no error"));
/*
* Send off the fragments via outgoing interface
*/
error = 0;
do {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro.ro_rt);
if (error)
break;
} while ((m = m0) != NULL);
if (error) {
/* Reclaim remaining fragments */
for (m = m0; m; m = m0) {
m0 = m->m_nextpkt;
m_freem(m);
}
} else
V_ipstat.ips_fragmented++;
}
}
if (error != 0)
V_ipstat.ips_odropped++;
else {
ro.ro_rt->rt_rmx.rmx_pksent++;
V_ipstat.ips_forward++;
V_ipstat.ips_fastforward++;
}
consumed:
RTFREE(ro.ro_rt);
return NULL;
drop:
if (m)
m_freem(m);
if (ro.ro_rt)
RTFREE(ro.ro_rt);
return NULL;
}
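/*
* Illustrative caller sketch (assumed, not part of this file): a link
* layer input path hands the packet to ip_fastforward() first and only
* falls back to the slow path when the mbuf comes back unconsumed:
*
*	if ((m = ip_fastforward(m)) == NULL)
*		return;			(forwarded or dropped here)
*	... continue to the normal netisr/ip_input path ...
*/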
Index: head/sys/netinet/ip_fw.h
===================================================================
--- head/sys/netinet/ip_fw.h (revision 183549)
+++ head/sys/netinet/ip_fw.h (revision 183550)
@@ -1,679 +1,760 @@
/*-
* Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _IPFW2_H
#define _IPFW2_H
/*
* The default rule number. By the design of ip_fw, the default rule
* is the last one, so its number can also serve as the highest number
* allowed for a rule. The ip_fw code relies on both meanings of this
* constant.
*/
#define IPFW_DEFAULT_RULE 65535
/*
* The number of ipfw tables. The maximum allowed table number is
* (IPFW_TABLES_MAX - 1).
*/
#define IPFW_TABLES_MAX 128
/*
* The kernel representation of ipfw rules is made of a list of
* 'instructions' (for all practical purposes equivalent to BPF
* instructions), which specify which fields of the packet
* (or its metadata) should be analysed.
*
* Each instruction is stored in a structure which begins with
* "ipfw_insn", and can contain extra fields depending on the
* instruction type (listed below).
* Note that the code is written so that individual instructions
* have a size which is a multiple of 32 bits. This means that, if
* such structures contain pointers or other 64-bit entities,
* (there is just one instance now) they may end up unaligned on
* 64-bit architectures, so they must be handled with care.
*
* "enum ipfw_opcodes" are the opcodes supported. We can have up
* to 256 different opcodes. When adding new opcodes, they should
* be appended to the end of the opcode list before O_LAST_OPCODE,
* this will prevent the ABI from being broken, otherwise users
* will have to recompile ipfw(8) when they update the kernel.
*/
enum ipfw_opcodes { /* arguments (4 byte each) */
O_NOP,
O_IP_SRC, /* u32 = IP */
O_IP_SRC_MASK, /* ip = IP/mask */
O_IP_SRC_ME, /* none */
O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */
O_IP_DST, /* u32 = IP */
O_IP_DST_MASK, /* ip = IP/mask */
O_IP_DST_ME, /* none */
O_IP_DST_SET, /* u32=base, arg1=len, bitmap */
O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */
O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */
O_PROTO, /* arg1=protocol */
O_MACADDR2, /* 2 mac addr:mask */
O_MAC_TYPE, /* same as srcport */
O_LAYER2, /* none */
O_IN, /* none */
O_FRAG, /* none */
O_RECV, /* none */
O_XMIT, /* none */
O_VIA, /* none */
O_IPOPT, /* arg1 = 2*u8 bitmap */
O_IPLEN, /* arg1 = len */
O_IPID, /* arg1 = id */
O_IPTOS, /* arg1 = id */
O_IPPRECEDENCE, /* arg1 = precedence << 5 */
O_IPTTL, /* arg1 = TTL */
O_IPVER, /* arg1 = version */
O_UID, /* u32 = id */
O_GID, /* u32 = id */
O_ESTAB, /* none (tcp established) */
O_TCPFLAGS, /* arg1 = 2*u8 bitmap */
O_TCPWIN, /* arg1 = desired win */
O_TCPSEQ, /* u32 = desired seq. */
O_TCPACK, /* u32 = desired seq. */
O_ICMPTYPE, /* u32 = icmp bitmap */
O_TCPOPTS, /* arg1 = 2*u8 bitmap */
O_VERREVPATH, /* none */
O_VERSRCREACH, /* none */
O_PROBE_STATE, /* none */
O_KEEP_STATE, /* none */
O_LIMIT, /* ipfw_insn_limit */
O_LIMIT_PARENT, /* dyn_type, not an opcode. */
/*
* These are really 'actions'.
*/
O_LOG, /* ipfw_insn_log */
O_PROB, /* u32 = match probability */
O_CHECK_STATE, /* none */
O_ACCEPT, /* none */
O_DENY, /* none */
O_REJECT, /* arg1=icmp arg (same as deny) */
O_COUNT, /* none */
O_SKIPTO, /* arg1=next rule number */
O_PIPE, /* arg1=pipe number */
O_QUEUE, /* arg1=queue number */
O_DIVERT, /* arg1=port number */
O_TEE, /* arg1=port number */
O_FORWARD_IP, /* fwd sockaddr */
O_FORWARD_MAC, /* fwd mac */
O_NAT, /* nope */
/*
* More opcodes.
*/
O_IPSEC, /* has ipsec history */
O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */
O_IP_DST_LOOKUP, /* arg1=table number, u32=value */
O_ANTISPOOF, /* none */
O_JAIL, /* u32 = id */
O_ALTQ, /* u32 = altq classif. qid */
O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */
O_TCPDATALEN, /* arg1 = tcp data len */
O_IP6_SRC, /* address without mask */
O_IP6_SRC_ME, /* my addresses */
O_IP6_SRC_MASK, /* address with the mask */
O_IP6_DST,
O_IP6_DST_ME,
O_IP6_DST_MASK,
O_FLOW6ID, /* for flow id tag in the ipv6 pkt */
O_ICMP6TYPE, /* icmp6 packet type filtering */
O_EXT_HDR, /* filtering for ipv6 extension header */
O_IP6,
/*
* actions for ng_ipfw
*/
O_NETGRAPH, /* send to ng_ipfw */
O_NGTEE, /* copy to ng_ipfw */
O_IP4,
O_UNREACH6, /* arg1=icmpv6 code arg (deny) */
O_TAG, /* arg1=tag number */
O_TAGGED, /* arg1=tag number */
O_SETFIB, /* arg1=FIB number */
O_FIB, /* arg1=FIB desired fib number */
O_LAST_OPCODE /* not an opcode! */
};
/*
* The extension headers are filtered only for presence using a bit
* vector with a flag for each header.
*/
#define EXT_FRAGMENT 0x1
#define EXT_HOPOPTS 0x2
#define EXT_ROUTING 0x4
#define EXT_AH 0x8
#define EXT_ESP 0x10
#define EXT_DSTOPTS 0x20
#define EXT_RTHDR0 0x40
#define EXT_RTHDR2 0x80
/*
* Template for instructions.
*
* ipfw_insn is used for all instructions which require no operands,
* a single 16-bit value (arg1), or a couple of 8-bit values.
*
* For other instructions which require different/larger arguments
* we have derived structures, ipfw_insn_*.
*
* The size of the instruction (in 32-bit words) is in the low
* 6 bits of "len". The 2 remaining bits are used to implement
* NOT and OR on individual instructions. Given a type, you can
* compute the length to be put in "len" using F_INSN_SIZE(t)
*
* F_NOT negates the match result of the instruction.
*
* F_OR is used to build or blocks. By default, instructions
* are evaluated as part of a logical AND. An "or" block
* { X or Y or Z } contains F_OR set in all but the last
* instruction of the block. A match will cause the code
* to skip past the last instruction of the block.
*
* NOTA BENE: in a couple of places we assume that
* sizeof(ipfw_insn) == sizeof(u_int32_t)
* this needs to be fixed.
*
*/
typedef struct _ipfw_insn { /* template for instructions */
enum ipfw_opcodes opcode:8;
u_int8_t len; /* number of 32-bit words */
#define F_NOT 0x80
#define F_OR 0x40
#define F_LEN_MASK 0x3f
#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK)
u_int16_t arg1;
} ipfw_insn;
/*
* The F_INSN_SIZE(type) computes the size, in 4-byte words, of
* a given type.
*/
#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t))
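/*
* Example (illustrative): a match on a single source address is carried
* in an ipfw_insn_u32, so the instruction would be filled in as
*
*	cmd->o.opcode = O_IP_SRC;
*	cmd->o.len = F_INSN_SIZE(ipfw_insn_u32);	(two 32-bit words)
*	cmd->d[0] = addr.s_addr;
*
* where "cmd" and "addr" are hypothetical local variables.
*/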
#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */
/*
* This is used to store an array of 16-bit entries (ports etc.)
*/
typedef struct _ipfw_insn_u16 {
ipfw_insn o;
u_int16_t ports[2]; /* there may be more */
} ipfw_insn_u16;
/*
* This is used to store an array of 32-bit entries
* (uid, single IPv4 addresses etc.)
*/
typedef struct _ipfw_insn_u32 {
ipfw_insn o;
u_int32_t d[1]; /* one or more */
} ipfw_insn_u32;
/*
* This is used to store IP addr-mask pairs.
*/
typedef struct _ipfw_insn_ip {
ipfw_insn o;
struct in_addr addr;
struct in_addr mask;
} ipfw_insn_ip;
/*
* This is used to forward to a given address (ip).
*/
typedef struct _ipfw_insn_sa {
ipfw_insn o;
struct sockaddr_in sa;
} ipfw_insn_sa;
/*
* This is used for MAC addr-mask pairs.
*/
typedef struct _ipfw_insn_mac {
ipfw_insn o;
u_char addr[12]; /* dst[6] + src[6] */
u_char mask[12]; /* dst[6] + src[6] */
} ipfw_insn_mac;
/*
* This is used for interface match rules (recv xx, xmit xx).
*/
typedef struct _ipfw_insn_if {
ipfw_insn o;
union {
struct in_addr ip;
int glob;
} p;
char name[IFNAMSIZ];
} ipfw_insn_if;
/*
* This is used for storing an altq queue id number.
*/
typedef struct _ipfw_insn_altq {
ipfw_insn o;
u_int32_t qid;
} ipfw_insn_altq;
/*
* This is used for limit rules.
*/
typedef struct _ipfw_insn_limit {
ipfw_insn o;
u_int8_t _pad;
u_int8_t limit_mask; /* combination of DYN_* below */
#define DYN_SRC_ADDR 0x1
#define DYN_SRC_PORT 0x2
#define DYN_DST_ADDR 0x4
#define DYN_DST_PORT 0x8
u_int16_t conn_limit;
} ipfw_insn_limit;
/*
* This is used for log instructions.
*/
typedef struct _ipfw_insn_log {
ipfw_insn o;
u_int32_t max_log; /* how many do we log -- 0 = all */
u_int32_t log_left; /* how many left to log */
} ipfw_insn_log;
/*
* Data structures required by both ipfw(8) and ipfw(4) but not part of the
* management API are protected by IPFW_INTERNAL.
*/
#ifdef IPFW_INTERNAL
/* Server pool support (LSNAT). */
struct cfg_spool {
LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */
struct in_addr addr;
u_short port;
};
#endif
/* Redirect modes id. */
#define REDIR_ADDR 0x01
#define REDIR_PORT 0x02
#define REDIR_PROTO 0x04
#ifdef IPFW_INTERNAL
/* Nat redirect configuration. */
struct cfg_redir {
LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */
u_int16_t mode; /* type of redirect mode */
struct in_addr laddr; /* local ip address */
struct in_addr paddr; /* public ip address */
struct in_addr raddr; /* remote ip address */
u_short lport; /* local port */
u_short pport; /* public port */
u_short rport; /* remote port */
u_short pport_cnt; /* number of public ports */
u_short rport_cnt; /* number of remote ports */
int proto; /* protocol: tcp/udp */
struct alias_link **alink;
/* number of entries in the spool chain */
u_int16_t spool_cnt;
/* chain of spool instances */
LIST_HEAD(spool_chain, cfg_spool) spool_chain;
};
#endif
#define NAT_BUF_LEN 1024
#ifdef IPFW_INTERNAL
/* Nat configuration data struct. */
struct cfg_nat {
/* chain of nat instances */
LIST_ENTRY(cfg_nat) _next;
int id; /* nat id */
struct in_addr ip; /* nat ip address */
char if_name[IF_NAMESIZE]; /* interface name */
int mode; /* aliasing mode */
struct libalias *lib; /* libalias instance */
/* number of entries in the redir chain */
int redir_cnt;
/* chain of redir instances */
LIST_HEAD(redir_chain, cfg_redir) redir_chain;
};
#endif
#define SOF_NAT sizeof(struct cfg_nat)
#define SOF_REDIR sizeof(struct cfg_redir)
#define SOF_SPOOL sizeof(struct cfg_spool)
/* Nat command. */
typedef struct _ipfw_insn_nat {
ipfw_insn o;
struct cfg_nat *nat;
} ipfw_insn_nat;
/* Apply ipv6 mask on ipv6 addr */
#define APPLY_MASK(addr,mask) \
(addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
(addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
(addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
(addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
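/*
* Note: APPLY_MASK expands to four separate statements and is not
* wrapped in a do { } while (0) block, so it must not be used as the
* single statement of an unbraced if, else or loop body.
*/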
/* Structure for ipv6 */
typedef struct _ipfw_insn_ip6 {
ipfw_insn o;
struct in6_addr addr6;
struct in6_addr mask6;
} ipfw_insn_ip6;
/* Used to support icmp6 types */
typedef struct _ipfw_insn_icmp6 {
ipfw_insn o;
uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h
* define ICMP6_MAXTYPE as follows:
* n = ICMP6_MAXTYPE/32 + 1
* (ICMP6_MAXTYPE is 203 at the time of writing)
*/
} ipfw_insn_icmp6;
/*
* Here we have the structure representing an ipfw rule.
*
* It starts with a general area (with link fields and counters)
* followed by an array of one or more instructions, which the code
* accesses as an array of 32-bit values.
*
* Given a rule pointer r:
*
* r->cmd is the start of the first instruction.
* ACTION_PTR(r) is the start of the first action (things to do
* once a rule matched).
*
* When assembling instruction, remember the following:
*
* + if a rule has a "keep-state" (or "limit") option, then the
* first instruction (at r->cmd) MUST BE an O_PROBE_STATE
* + if a rule has a "log" option, then the first action
* (at ACTION_PTR(r)) MUST be O_LOG
* + if a rule has an "altq" option, it comes after "log"
* + if a rule has an O_TAG option, it comes after "log" and "altq"
*
* NOTE: we use a simple linked list of rules because we never need
* to delete a rule without scanning the list. We do not use
* queue(3) macros for portability and readability.
*/
struct ip_fw {
struct ip_fw *next; /* linked list of rules */
struct ip_fw *next_rule; /* ptr to next [skipto] rule */
/* 'next_rule' is used to pass up 'set_disable' status */
u_int16_t act_ofs; /* offset of action in 32-bit units */
u_int16_t cmd_len; /* # of 32-bit words in cmd */
u_int16_t rulenum; /* rule number */
u_int8_t set; /* rule set (0..31) */
#define RESVD_SET 31 /* set for default and persistent rules */
u_int8_t _pad; /* padding */
/* These fields are present in all rules. */
u_int64_t pcnt; /* Packet counter */
u_int64_t bcnt; /* Byte counter */
u_int32_t timestamp; /* tv_sec of last match */
ipfw_insn cmd[1]; /* storage for commands */
};
#define ACTION_PTR(rule) \
(ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) )
#define RULESIZE(rule) (sizeof(struct ip_fw) + \
((struct ip_fw *)(rule))->cmd_len * 4 - 4)
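/*
* Worked example (illustrative): a rule body consisting of a one-word
* O_PROTO match followed by a one-word O_ACCEPT action has cmd_len = 2
* and act_ofs = 1, so ACTION_PTR() points at the second word of cmd[]
* and RULESIZE() is sizeof(struct ip_fw) plus one 32-bit word beyond
* the cmd[1] already counted in the structure.
*/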
/*
* This structure is used as a flow mask and a flow id for various
* parts of the code.
*/
struct ipfw_flow_id {
u_int32_t dst_ip;
u_int32_t src_ip;
u_int16_t dst_port;
u_int16_t src_port;
u_int8_t fib;
u_int8_t proto;
u_int8_t flags; /* protocol-specific flags */
uint8_t addr_type; /* 4 = ipv4, 6 = ipv6, 1=ether ? */
struct in6_addr dst_ip6; /* could also store MAC addr! */
struct in6_addr src_ip6;
u_int32_t flow_id6;
u_int32_t frag_id6;
};
#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6)
/*
* Dynamic ipfw rule.
*/
typedef struct _ipfw_dyn_rule ipfw_dyn_rule;
struct _ipfw_dyn_rule {
ipfw_dyn_rule *next; /* linked list of rules. */
struct ip_fw *rule; /* pointer to rule */
/* 'rule' is used to pass up the rule number (from the parent) */
ipfw_dyn_rule *parent; /* pointer to parent rule */
u_int64_t pcnt; /* packet match counter */
u_int64_t bcnt; /* byte match counter */
struct ipfw_flow_id id; /* (masked) flow id */
u_int32_t expire; /* expire time */
u_int32_t bucket; /* which bucket in hash table */
u_int32_t state; /* state of this rule (typically a
* combination of TCP flags)
*/
u_int32_t ack_fwd; /* most recent ACKs in forward */
u_int32_t ack_rev; /* and reverse directions (used */
/* to generate keepalives) */
u_int16_t dyn_type; /* rule type */
u_int16_t count; /* refcount */
};
/*
* Definitions for IP option names.
*/
#define IP_FW_IPOPT_LSRR 0x01
#define IP_FW_IPOPT_SSRR 0x02
#define IP_FW_IPOPT_RR 0x04
#define IP_FW_IPOPT_TS 0x08
/*
* Definitions for TCP option names.
*/
#define IP_FW_TCPOPT_MSS 0x01
#define IP_FW_TCPOPT_WINDOW 0x02
#define IP_FW_TCPOPT_SACK 0x04
#define IP_FW_TCPOPT_TS 0x08
#define IP_FW_TCPOPT_CC 0x10
#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */
#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */
/*
* These are used for lookup tables.
*/
typedef struct _ipfw_table_entry {
in_addr_t addr; /* network address */
u_int32_t value; /* value */
u_int16_t tbl; /* table number */
u_int8_t masklen; /* mask length */
} ipfw_table_entry;
typedef struct _ipfw_table {
u_int32_t size; /* size of entries in bytes */
u_int32_t cnt; /* # of entries */
u_int16_t tbl; /* table number */
ipfw_table_entry ent[0]; /* entries */
} ipfw_table;
#define IP_FW_TABLEARG 65535
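/*
* Illustrative only: an entry adding 10.0.0.0/24 with value 99 to
* table 1 would be filled in as below before being handed to the
* kernel through the ipfw socket-option interface.
*
*	ipfw_table_entry ent;
*
*	ent.addr = inet_addr("10.0.0.0");
*	ent.masklen = 24;
*	ent.tbl = 1;
*	ent.value = 99;
*/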
/*
* Main firewall chain definitions and global variable definitions.
*/
#ifdef _KERNEL
/* Return values from ipfw_chk() */
enum {
IP_FW_PASS = 0,
IP_FW_DENY,
IP_FW_DIVERT,
IP_FW_TEE,
IP_FW_DUMMYNET,
IP_FW_NETGRAPH,
IP_FW_NGTEE,
IP_FW_NAT,
};
/* flags for divert mtag */
#define IP_FW_DIVERT_LOOPBACK_FLAG 0x00080000
#define IP_FW_DIVERT_OUTPUT_FLAG 0x00100000
/*
* Structure for collecting parameters to dummynet for ip6_output forwarding
*/
struct _ip6dn_args {
struct ip6_pktopts *opt_or;
struct route_in6 ro_or;
int flags_or;
struct ip6_moptions *im6o_or;
struct ifnet *origifp_or;
struct ifnet *ifp_or;
struct sockaddr_in6 dst_or;
u_long mtu_or;
struct route_in6 ro_pmtu_or;
};
/*
* Arguments for calling ipfw_chk() and dummynet_io(). We put them
* all into a structure because this way it is easier and more
* efficient to pass variables around and extend the interface.
*/
struct ip_fw_args {
struct mbuf *m; /* the mbuf chain */
struct ifnet *oif; /* output interface */
struct sockaddr_in *next_hop; /* forward address */
struct ip_fw *rule; /* matching rule */
struct ether_header *eh; /* for bridged packets */
struct ipfw_flow_id f_id; /* grabbed from IP header */
u_int32_t cookie; /* a cookie depending on rule action */
struct inpcb *inp;
struct _ip6dn_args dummypar; /* dummynet->ip6_output */
struct sockaddr_in hopstore; /* store here if cannot use a pointer */
};
/*
* Function definitions.
*/
/* Firewall hooks */
struct sockopt;
struct dn_flow_set;
int ipfw_check_in(void *, struct mbuf **, struct ifnet *, int, struct inpcb *inp);
int ipfw_check_out(void *, struct mbuf **, struct ifnet *, int, struct inpcb *inp);
int ipfw_chk(struct ip_fw_args *);
int ipfw_init(void);
void ipfw_destroy(void);
+#ifdef NOTYET
+void ipfw_nat_destroy(void);
+#endif
typedef int ip_fw_ctl_t(struct sockopt *);
extern ip_fw_ctl_t *ip_fw_ctl_ptr;
+
+#ifndef VIMAGE
extern int fw_one_pass;
extern int fw_enable;
#ifdef INET6
extern int fw6_enable;
#endif
+#endif
/* For kernel ipfw_ether and ipfw_bridge. */
typedef int ip_fw_chk_t(struct ip_fw_args *args);
extern ip_fw_chk_t *ip_fw_chk_ptr;
#define IPFW_LOADED (ip_fw_chk_ptr != NULL)
#ifdef IPFW_INTERNAL
struct ip_fw_chain {
struct ip_fw *rules; /* list of rules */
struct ip_fw *reap; /* list of rules to reap */
LIST_HEAD(, cfg_nat) nat; /* list of nat entries */
struct radix_node_head *tables[IPFW_TABLES_MAX];
struct rwlock rwmtx;
};
#define IPFW_LOCK_INIT(_chain) \
rw_init(&(_chain)->rwmtx, "IPFW static rules")
#define IPFW_LOCK_DESTROY(_chain) rw_destroy(&(_chain)->rwmtx)
#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
#define LOOKUP_NAT(l, i, p) do { \
LIST_FOREACH((p), &(l.nat), _next) { \
if ((p)->id == (i)) { \
break; \
} \
} \
} while (0)
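/*
* Usage sketch (illustrative): look up nat instance 5 in a chain "ch"
* (a struct ip_fw_chain); LIST_FOREACH() leaves the iterator NULL when
* nothing matched.
*
*	struct cfg_nat *ptr;
*
*	LOOKUP_NAT(ch, 5, ptr);
*	if (ptr == NULL)
*		return (EINVAL);	(no such instance)
*/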
typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
typedef int ipfw_nat_cfg_t(struct sockopt *);
#endif
+
+/*
+ * Stack virtualization support.
+ */
+#ifdef VIMAGE
+struct vnet_ipfw {
+ int _fw_one_pass;
+ int _fw_enable;
+ int _fw6_enable;
+ u_int32_t _set_disable;
+ int _fw_deny_unknown_exthdrs;
+ int _fw_verbose;
+ int _verbose_limit;
+ int _fw_debug;
+ int _autoinc_step;
+ ipfw_dyn_rule **_ipfw_dyn_v;
+ struct ip_fw_chain _layer3_chain;
+ u_int32_t _dyn_buckets;
+ u_int32_t _curr_dyn_buckets;
+ u_int32_t _dyn_ack_lifetime;
+ u_int32_t _dyn_syn_lifetime;
+ u_int32_t _dyn_fin_lifetime;
+ u_int32_t _dyn_rst_lifetime;
+ u_int32_t _dyn_udp_lifetime;
+ u_int32_t _dyn_short_lifetime;
+ u_int32_t _dyn_keepalive_interval;
+ u_int32_t _dyn_keepalive_period;
+ u_int32_t _dyn_keepalive;
+ u_int32_t _static_count;
+ u_int32_t _static_len;
+ u_int32_t _dyn_count;
+ u_int32_t _dyn_max;
+ u_int64_t _norule_counter;
+ struct callout _ipfw_timeout;
+ eventhandler_tag _ifaddr_event_tag;
+};
+#endif
+
+/*
+ * Symbol translation macros
+ */
+#define INIT_VNET_IPFW(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_IPFW, struct vnet_ipfw, vnet_ipfw)
+
+#define VNET_IPFW(sym) VSYM(vnet_ipfw, sym)
+
+#define V_fw_one_pass VNET_IPFW(fw_one_pass)
+#define V_fw_enable VNET_IPFW(fw_enable)
+#define V_fw6_enable VNET_IPFW(fw6_enable)
+#define V_set_disable VNET_IPFW(set_disable)
+#define V_fw_deny_unknown_exthdrs VNET_IPFW(fw_deny_unknown_exthdrs)
+#define V_fw_verbose VNET_IPFW(fw_verbose)
+#define V_verbose_limit VNET_IPFW(verbose_limit)
+#define V_fw_debug VNET_IPFW(fw_debug)
+#define V_autoinc_step VNET_IPFW(autoinc_step)
+#define V_ipfw_dyn_v VNET_IPFW(ipfw_dyn_v)
+#define V_layer3_chain VNET_IPFW(layer3_chain)
+#define V_dyn_buckets VNET_IPFW(dyn_buckets)
+#define V_curr_dyn_buckets VNET_IPFW(curr_dyn_buckets)
+#define V_dyn_ack_lifetime VNET_IPFW(dyn_ack_lifetime)
+#define V_dyn_syn_lifetime VNET_IPFW(dyn_syn_lifetime)
+#define V_dyn_fin_lifetime VNET_IPFW(dyn_fin_lifetime)
+#define V_dyn_rst_lifetime VNET_IPFW(dyn_rst_lifetime)
+#define V_dyn_udp_lifetime VNET_IPFW(dyn_udp_lifetime)
+#define V_dyn_short_lifetime VNET_IPFW(dyn_short_lifetime)
+#define V_dyn_keepalive_interval VNET_IPFW(dyn_keepalive_interval)
+#define V_dyn_keepalive_period VNET_IPFW(dyn_keepalive_period)
+#define V_dyn_keepalive VNET_IPFW(dyn_keepalive)
+#define V_static_count VNET_IPFW(static_count)
+#define V_static_len VNET_IPFW(static_len)
+#define V_dyn_count VNET_IPFW(dyn_count)
+#define V_dyn_max VNET_IPFW(dyn_max)
+#define V_norule_counter VNET_IPFW(norule_counter)
+#define V_ipfw_timeout VNET_IPFW(ipfw_timeout)
+#define V_ifaddr_event_tag VNET_IPFW(ifaddr_event_tag)
#endif /* _KERNEL */
#endif /* _IPFW2_H */
Index: head/sys/netinet/ip_fw2.c
===================================================================
--- head/sys/netinet/ip_fw2.c (revision 183549)
+++ head/sys/netinet/ip_fw2.c (revision 183550)
@@ -1,4630 +1,4652 @@
/*-
* Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#define DEB(x)
#define DDB(x) x
/*
* Implement IP packet firewall (new version)
*/
#if !defined(KLD_MODULE)
#include "opt_ipfw.h"
#include "opt_ipdivert.h"
#include "opt_ipdn.h"
#include "opt_inet.h"
#ifndef INET
#error IPFIREWALL requires INET.
#endif /* INET */
#endif
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/jail.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/radix.h>
#include <net/route.h>
#include <net/pf_mtag.h>
#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_divert.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_carp.h>
#include <netinet/pim.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/sctp.h>
#include <netgraph/ng_ipfw.h>
#include <altq/if_altq.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#ifdef INET6
#include <netinet6/scope6_var.h>
#endif
#include <netinet/if_ether.h> /* XXX for ETHERTYPE_IP */
#include <machine/in_cksum.h> /* XXX for in_cksum */
#include <security/mac/mac_framework.h>
/*
* set_disable contains one bit per set value (0..31).
* If the bit is set, all rules with the corresponding set
* are disabled. Set RESVD_SET(31) is reserved for the default rule
* and rules that are not deleted by the flush command,
* and CANNOT be disabled.
* Rules in set RESVD_SET can only be deleted explicitly.
*/
static u_int32_t set_disable;
static int fw_verbose;
static int verbose_limit;
static struct callout ipfw_timeout;
static uma_zone_t ipfw_dyn_rule_zone;
/*
* Data structure to cache our ucred related
* information. This structure only gets used if
* the user specified UID/GID based constraints in
* a firewall rule.
*/
struct ip_fw_ugid {
gid_t fw_groups[NGROUPS];
int fw_ngroups;
uid_t fw_uid;
int fw_prid;
};
/*
* list of rules for layer 3
*/
struct ip_fw_chain layer3_chain;
MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chains");
MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables");
#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL)
ipfw_nat_t *ipfw_nat_ptr = NULL;
ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_del_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
struct table_entry {
struct radix_node rn[2];
struct sockaddr_in addr, mask;
u_int32_t value;
};
static int fw_debug = 1;
static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
extern int ipfw_chg_hook(SYSCTL_HANDLER_ARGS);
#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
-SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable,
- CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &fw_enable, 0,
+SYSCTL_V_PROC(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, enable,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, fw_enable, 0,
ipfw_chg_hook, "I", "Enable ipfw");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW,
- &autoinc_step, 0, "Rule number autoincrement step");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
- CTLFLAG_RW | CTLFLAG_SECURE3,
- &fw_one_pass, 0,
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, autoinc_step,
+ CTLFLAG_RW, autoinc_step, 0, "Rule number autoincrement step");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, one_pass,
+ CTLFLAG_RW | CTLFLAG_SECURE3, fw_one_pass, 0,
"Only do a single pass through ipfw when using dummynet(4)");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
- &fw_debug, 0, "Enable printing of debug ip_fw statements");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, debug, CTLFLAG_RW,
+ fw_debug, 0, "Enable printing of debug ip_fw statements");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, verbose,
CTLFLAG_RW | CTLFLAG_SECURE3,
- &fw_verbose, 0, "Log matches to ipfw rules");
+ fw_verbose, 0, "Log matches to ipfw rules");
SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW,
&verbose_limit, 0, "Set upper limit of matches of ipfw rules logged");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
NULL, IPFW_DEFAULT_RULE, "The default/max possible rule number.");
SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD,
NULL, IPFW_TABLES_MAX, "The maximum number of tables.");
/*
* Description of dynamic rules.
*
* Dynamic rules are stored in lists accessed through a hash table
* (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can
* be modified through the sysctl variable dyn_buckets which is
* updated when the table becomes empty.
*
* XXX currently there is only one list, ipfw_dyn.
*
* When a packet is received, its address fields are first masked
* with the mask defined for the rule, then hashed, then matched
* against the entries in the corresponding list.
* Dynamic rules can be used for different purposes:
* + stateful rules;
* + enforcing limits on the number of sessions;
* + in-kernel NAT (not implemented yet)
*
* The lifetime of dynamic rules is regulated by dyn_*_lifetime,
* measured in seconds and depending on the flags.
*
* The total number of dynamic rules is stored in dyn_count.
* The max number of dynamic rules is dyn_max. When we reach
* the maximum number of rules we do not create any more. This is
* done to avoid consuming too much memory, but also too much
* time when searching on each packet (ideally, we should try instead
* to put a limit on the length of the list on each bucket...).
*
* Each dynamic rule holds a pointer to the parent ipfw rule so
* we know what action to perform. Dynamic rules are removed when
* the parent rule is deleted. XXX we should make them survive.
*
* There are some limitations with dynamic rules -- we do not
* obey the 'randomized match', and we do not do multiple
* passes through the firewall. XXX check the latter!!!
*/
static ipfw_dyn_rule **ipfw_dyn_v = NULL;
static u_int32_t dyn_buckets = 256; /* must be power of 2 */
static u_int32_t curr_dyn_buckets = 256; /* must be power of 2 */
static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */
#define IPFW_DYN_LOCK_INIT() \
mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
#define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx)
#define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx)
#define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx)
#define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
/*
* Timeouts for various events in handing dynamic rules.
*/
static u_int32_t dyn_ack_lifetime = 300;
static u_int32_t dyn_syn_lifetime = 20;
static u_int32_t dyn_fin_lifetime = 1;
static u_int32_t dyn_rst_lifetime = 1;
static u_int32_t dyn_udp_lifetime = 10;
static u_int32_t dyn_short_lifetime = 5;
/*
* Keepalives are sent if dyn_keepalive is set. They are sent every
* dyn_keepalive_period seconds, in the last dyn_keepalive_interval
* seconds of lifetime of a rule.
* dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower
* than dyn_keepalive_period.
*/
static u_int32_t dyn_keepalive_interval = 20;
static u_int32_t dyn_keepalive_period = 5;
static u_int32_t dyn_keepalive = 1; /* do send keepalives */
static u_int32_t static_count; /* # of static rules */
static u_int32_t static_len; /* size in bytes of static rules */
static u_int32_t dyn_count; /* # of dynamic rules */
static u_int32_t dyn_max = 4096; /* max # of dynamic rules */
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW,
- &dyn_buckets, 0, "Number of dyn. buckets");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD,
- &curr_dyn_buckets, 0, "Current Number of dyn. buckets");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD,
- &dyn_count, 0, "Number of dyn. rules");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW,
- &dyn_max, 0, "Max number of dyn. rules");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD,
- &static_count, 0, "Number of static rules");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW,
- &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW,
- &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW,
- &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW,
- &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW,
- &dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW,
- &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations");
-SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW,
- &dyn_keepalive, 0, "Enable keepalives for dyn. rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_buckets,
+ CTLFLAG_RW, dyn_buckets, 0, "Number of dyn. buckets");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
+ CTLFLAG_RD, curr_dyn_buckets, 0, "Current Number of dyn. buckets");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_count,
+ CTLFLAG_RD, dyn_count, 0, "Number of dyn. rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_max,
+ CTLFLAG_RW, dyn_max, 0, "Max number of dyn. rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, static_count,
+ CTLFLAG_RD, static_count, 0, "Number of static rules");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
+ CTLFLAG_RW, dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime,
+ CTLFLAG_RW, dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime,
+ CTLFLAG_RW, dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime,
+ CTLFLAG_RW, dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime,
+ CTLFLAG_RW, dyn_udp_lifetime, 0, "Lifetime of dyn. rules for UDP");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_short_lifetime,
+ CTLFLAG_RW, dyn_short_lifetime, 0,
+ "Lifetime of dyn. rules for other situations");
+SYSCTL_V_INT(V_NET, vnet_ipfw, _net_inet_ip_fw, OID_AUTO, dyn_keepalive,
+ CTLFLAG_RW, dyn_keepalive, 0, "Enable keepalives for dyn. rules");
+
#ifdef INET6
/*
* IPv6 specific variables
*/
SYSCTL_DECL(_net_inet6_ip6);
static struct sysctl_ctx_list ip6_fw_sysctl_ctx;
static struct sysctl_oid *ip6_fw_sysctl_tree;
#endif /* INET6 */
#endif /* SYSCTL_NODE */
static int fw_deny_unknown_exthdrs = 1;
/*
* L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
* Other macros just cast void * into the appropriate type
*/
#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
#define TCP(p) ((struct tcphdr *)(p))
#define SCTP(p) ((struct sctphdr *)(p))
#define UDP(p) ((struct udphdr *)(p))
#define ICMP(p) ((struct icmphdr *)(p))
#define ICMP6(p) ((struct icmp6_hdr *)(p))
static __inline int
icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
{
int type = icmp->icmp_type;
return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<<type)) );
}
#define TT ( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
(1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
static int
is_icmp_query(struct icmphdr *icmp)
{
int type = icmp->icmp_type;
return (type <= ICMP_MAXTYPE && (TT & (1<<type)) );
}
#undef TT
/*
* The following checks use two arrays of 8 or 16 bits to store the
* bits that we want set or clear, respectively. They are in the
* low and high half of cmd->arg1 or cmd->d[0].
*
* We scan options and store the bits we find set. We succeed if
*
* (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
*
* The code is sometimes optimized not to store additional variables.
*/
static int
flags_match(ipfw_insn *cmd, u_int8_t bits)
{
u_char want_clear;
bits = ~bits;
if ( ((cmd->arg1 & 0xff) & bits) != 0)
return 0; /* some bits we want set were clear */
want_clear = (cmd->arg1 >> 8) & 0xff;
if ( (want_clear & bits) != want_clear)
return 0; /* some bits we want clear were set */
return 1;
}
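/*
* Worked example (illustrative): a "tcpflags syn,!ack" option puts
* TH_SYN (0x02) in the low byte of arg1 (bits that must be set) and
* TH_ACK (0x10) in the high byte (bits that must be clear). For a pure
* SYN segment bits == 0x02, so after the complement bits == 0xfd: the
* want-set test (0x02 & 0xfd) yields 0 and the want-clear test
* (0x10 & 0xfd) returns 0x10 unchanged, so the match succeeds.
*/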
static int
ipopts_match(struct ip *ip, ipfw_insn *cmd)
{
int optlen, bits = 0;
u_char *cp = (u_char *)(ip + 1);
int x = (ip->ip_hl << 2) - sizeof (struct ip);
for (; x > 0; x -= optlen, cp += optlen) {
int opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
optlen = cp[IPOPT_OLEN];
if (optlen <= 0 || optlen > x)
return 0; /* invalid or truncated */
}
switch (opt) {
default:
break;
case IPOPT_LSRR:
bits |= IP_FW_IPOPT_LSRR;
break;
case IPOPT_SSRR:
bits |= IP_FW_IPOPT_SSRR;
break;
case IPOPT_RR:
bits |= IP_FW_IPOPT_RR;
break;
case IPOPT_TS:
bits |= IP_FW_IPOPT_TS;
break;
}
}
return (flags_match(cmd, bits));
}
static int
tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
{
int optlen, bits = 0;
u_char *cp = (u_char *)(tcp + 1);
int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
for (; x > 0; x -= optlen, cp += optlen) {
int opt = cp[0];
if (opt == TCPOPT_EOL)
break;
if (opt == TCPOPT_NOP)
optlen = 1;
else {
optlen = cp[1];
if (optlen <= 0)
break;
}
switch (opt) {
default:
break;
case TCPOPT_MAXSEG:
bits |= IP_FW_TCPOPT_MSS;
break;
case TCPOPT_WINDOW:
bits |= IP_FW_TCPOPT_WINDOW;
break;
case TCPOPT_SACK_PERMITTED:
case TCPOPT_SACK:
bits |= IP_FW_TCPOPT_SACK;
break;
case TCPOPT_TIMESTAMP:
bits |= IP_FW_TCPOPT_TS;
break;
}
}
return (flags_match(cmd, bits));
}
static int
iface_match(struct ifnet *ifp, ipfw_insn_if *cmd)
{
if (ifp == NULL) /* no iface with this packet, match fails */
return 0;
/* Check by name or by IP address */
if (cmd->name[0] != '\0') { /* match by name */
/* Check name */
if (cmd->p.glob) {
if (fnmatch(cmd->name, ifp->if_xname, 0) == 0)
return(1);
} else {
if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0)
return(1);
}
} else {
struct ifaddr *ia;
/* XXX lock? */
TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
if (ia->ifa_addr->sa_family != AF_INET)
continue;
if (cmd->p.ip.s_addr == ((struct sockaddr_in *)
(ia->ifa_addr))->sin_addr.s_addr)
return(1); /* match */
}
}
return(0); /* no match, fail ... */
}
/*
* The verify_path function checks if a route to the src exists and
* if it is reachable via ifp (when provided).
*
* The 'verrevpath' option checks that the interface that an IP packet
* arrives on is the same interface that traffic destined for the
* packet's source address would be routed out of. The 'versrcreach'
* option just checks that the source address is reachable via any route
* (except default) in the routing table. These two are a measure to block
* forged packets. This is also commonly known as "anti-spoofing" or Unicast
* Reverse Path Forwarding (Unicast RPF) in Cisco-ese. The names of the knobs
* are purposely reminiscent of the Cisco IOS commands,
*
* ip verify unicast reverse-path
* ip verify unicast source reachable-via any
*
* which implement the same functionality. But note that the syntax is
* misleading. The check may be performed on all IP packets, whether unicast,
* multicast, or broadcast.
*/
static int
verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
{
struct route ro;
struct sockaddr_in *dst;
bzero(&ro, sizeof(ro));
dst = (struct sockaddr_in *)&(ro.ro_dst);
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = src;
in_rtalloc_ign(&ro, RTF_CLONING, fib);
if (ro.ro_rt == NULL)
return 0;
/*
* If ifp is provided, check for equality with rtentry.
* We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
* in order to pass packets injected back by if_simloop():
* if useloopback == 1 a routing entry (via lo0) for our own address
* may exist, so we need to handle routing asymmetry.
*/
if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
RTFREE(ro.ro_rt);
return 0;
}
/* if no ifp provided, check if rtentry is not default route */
if (ifp == NULL &&
satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) {
RTFREE(ro.ro_rt);
return 0;
}
/* or if this is a blackhole/reject route */
if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
RTFREE(ro.ro_rt);
return 0;
}
/* found valid route */
RTFREE(ro.ro_rt);
return 1;
}
#ifdef INET6
/*
* ipv6 specific rules here...
*/
static __inline int
icmp6type_match (int type, ipfw_insn_u32 *cmd)
{
return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) );
}
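/*
* Example (illustrative): ICMP6_ECHO_REQUEST is type 128, so it is
* tested as bit 128 % 32 == 0 of word 128 / 32 == 4, i.e.
* cmd->d[4] & 0x00000001.
*/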
static int
flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
{
int i;
for (i=0; i <= cmd->o.arg1; ++i )
if (curr_flow == cmd->d[i] )
return 1;
return 0;
}
/* support for IP6_*_ME opcodes */
static int
search_ip6_addr_net (struct in6_addr * ip6_addr)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *mdc;
struct ifaddr *mdc2;
struct in6_ifaddr *fdm;
struct in6_addr copia;
TAILQ_FOREACH(mdc, &V_ifnet, if_link)
TAILQ_FOREACH(mdc2, &mdc->if_addrlist, ifa_list) {
if (mdc2->ifa_addr->sa_family == AF_INET6) {
fdm = (struct in6_ifaddr *)mdc2;
copia = fdm->ia_addr.sin6_addr;
/* need for leaving scope_id in the sock_addr */
in6_clearscope(&copia);
if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia))
return 1;
}
}
return 0;
}
static int
verify_path6(struct in6_addr *src, struct ifnet *ifp)
{
struct route_in6 ro;
struct sockaddr_in6 *dst;
bzero(&ro, sizeof(ro));
dst = (struct sockaddr_in6 * )&(ro.ro_dst);
dst->sin6_family = AF_INET6;
dst->sin6_len = sizeof(*dst);
dst->sin6_addr = *src;
/* XXX MRT 0 for ipv6 at this time */
rtalloc_ign((struct route *)&ro, RTF_CLONING);
if (ro.ro_rt == NULL)
return 0;
/*
* if ifp is provided, check for equality with rtentry
* We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp,
* to support the case of sending packets to an address of our own.
* (where the former interface is the first argument of if_simloop()
* (=ifp), the latter is lo0)
*/
if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
RTFREE(ro.ro_rt);
return 0;
}
/* if no ifp provided, check if rtentry is not default route */
if (ifp == NULL &&
IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
RTFREE(ro.ro_rt);
return 0;
}
/* or if this is a blackhole/reject route */
if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
RTFREE(ro.ro_rt);
return 0;
}
/* found valid route */
RTFREE(ro.ro_rt);
return 1;
}
static __inline int
hash_packet6(struct ipfw_flow_id *id)
{
u_int32_t i;
i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^
(id->dst_ip6.__u6_addr.__u6_addr32[3]) ^
(id->src_ip6.__u6_addr.__u6_addr32[2]) ^
(id->src_ip6.__u6_addr.__u6_addr32[3]) ^
(id->dst_port) ^ (id->src_port);
return i;
}
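/*
* Note: only the low 64 bits of each IPv6 address are hashed, and
* the XOR is unchanged when source and destination are swapped; this
* is what makes the hash commutative, as required for dynamic rules
* (see the comment above hash_packet() below).
*/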
static int
is_icmp6_query(int icmp6_type)
{
if ((icmp6_type <= ICMP6_MAXTYPE) &&
(icmp6_type == ICMP6_ECHO_REQUEST ||
icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
icmp6_type == ICMP6_WRUREQUEST ||
icmp6_type == ICMP6_FQDN_QUERY ||
icmp6_type == ICMP6_NI_QUERY))
return (1);
return (0);
}
static void
send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
{
struct mbuf *m;
m = args->m;
if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
struct tcphdr *tcp;
tcp_seq ack, seq;
int flags;
struct {
struct ip6_hdr ip6;
struct tcphdr th;
} ti;
tcp = (struct tcphdr *)((char *)ip6 + hlen);
if ((tcp->th_flags & TH_RST) != 0) {
m_freem(m);
args->m = NULL;
return;
}
ti.ip6 = *ip6;
ti.th = *tcp;
ti.th.th_seq = ntohl(ti.th.th_seq);
ti.th.th_ack = ntohl(ti.th.th_ack);
ti.ip6.ip6_nxt = IPPROTO_TCP;
if (ti.th.th_flags & TH_ACK) {
ack = 0;
seq = ti.th.th_ack;
flags = TH_RST;
} else {
ack = ti.th.th_seq;
if ((m->m_flags & M_PKTHDR) != 0) {
/*
* total new data to ACK is:
* total packet length,
* minus the header length,
* minus the tcp header length.
*/
ack += m->m_pkthdr.len - hlen
- (ti.th.th_off << 2);
} else if (ip6->ip6_plen) {
ack += ntohs(ip6->ip6_plen) + sizeof(*ip6) -
hlen - (ti.th.th_off << 2);
} else {
m_freem(m);
return;
}
if (tcp->th_flags & TH_SYN)
ack++;
seq = 0;
flags = TH_RST|TH_ACK;
}
bcopy(&ti, ip6, sizeof(ti));
/*
* m is only used to recycle the mbuf
* The data in it is never read so we don't need
* to correct the offsets or anything
*/
tcp_respond(NULL, ip6, tcp, m, ack, seq, flags);
} else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
#if 0
/*
* Unlike above, the mbufs need to line up with the ip6 hdr,
* as the contents are read. We need to m_adj() the
* needed amount.
* The mbuf will however be thrown away so we can adjust it.
* Remember we did an m_pullup on it already so we
* can make some assumptions about contiguousness.
*/
if (args->L3offset)
m_adj(m, args->L3offset);
#endif
icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
} else
m_freem(m);
args->m = NULL;
}
#endif /* INET6 */
static u_int64_t norule_counter; /* counter for ipfw_log(NULL...) */
#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
#define SNP(buf) buf, sizeof(buf)
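/*
* SNPARGS(buf, len) expands to a (pointer, size) pair offset 'len'
* bytes into 'buf', with the size clamped to 0 once the buffer is
* full, supporting the append idiom used below, e.g.:
*	len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
*	snprintf(SNPARGS(proto, len), " %s", dst);
* Since snprintf() with a zero size writes nothing, the appends
* cannot overflow the buffer.
*/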
/*
* We enter here when we have a rule with O_LOG.
* XXX this function alone takes about 2Kbytes of code!
*/
static void
ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args,
struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg,
struct ip *ip)
{
+ INIT_VNET_IPFW(curvnet);
struct ether_header *eh = args->eh;
char *action;
int limit_reached = 0;
char action2[40], proto[128], fragment[32];
fragment[0] = '\0';
proto[0] = '\0';
if (f == NULL) { /* bogus pkt */
if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
return;
V_norule_counter++;
if (V_norule_counter == V_verbose_limit)
limit_reached = V_verbose_limit;
action = "Refuse";
} else { /* O_LOG is the first action, find the real one */
ipfw_insn *cmd = ACTION_PTR(f);
ipfw_insn_log *l = (ipfw_insn_log *)cmd;
if (l->max_log != 0 && l->log_left == 0)
return;
l->log_left--;
if (l->log_left == 0)
limit_reached = l->max_log;
cmd += F_LEN(cmd); /* point to first action */
if (cmd->opcode == O_ALTQ) {
ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
snprintf(SNPARGS(action2, 0), "Altq %d",
altq->qid);
cmd += F_LEN(cmd);
}
if (cmd->opcode == O_PROB)
cmd += F_LEN(cmd);
if (cmd->opcode == O_TAG)
cmd += F_LEN(cmd);
action = action2;
switch (cmd->opcode) {
case O_DENY:
action = "Deny";
break;
case O_REJECT:
if (cmd->arg1==ICMP_REJECT_RST)
action = "Reset";
else if (cmd->arg1==ICMP_UNREACH_HOST)
action = "Reject";
else
snprintf(SNPARGS(action2, 0), "Unreach %d",
cmd->arg1);
break;
case O_UNREACH6:
if (cmd->arg1==ICMP6_UNREACH_RST)
action = "Reset";
else
snprintf(SNPARGS(action2, 0), "Unreach %d",
cmd->arg1);
break;
case O_ACCEPT:
action = "Accept";
break;
case O_COUNT:
action = "Count";
break;
case O_DIVERT:
snprintf(SNPARGS(action2, 0), "Divert %d",
cmd->arg1);
break;
case O_TEE:
snprintf(SNPARGS(action2, 0), "Tee %d",
cmd->arg1);
break;
case O_SETFIB:
snprintf(SNPARGS(action2, 0), "SetFib %d",
cmd->arg1);
break;
case O_SKIPTO:
snprintf(SNPARGS(action2, 0), "SkipTo %d",
cmd->arg1);
break;
case O_PIPE:
snprintf(SNPARGS(action2, 0), "Pipe %d",
cmd->arg1);
break;
case O_QUEUE:
snprintf(SNPARGS(action2, 0), "Queue %d",
cmd->arg1);
break;
case O_FORWARD_IP: {
ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
int len;
struct in_addr dummyaddr;
if (sa->sa.sin_addr.s_addr == INADDR_ANY)
dummyaddr.s_addr = htonl(tablearg);
else
dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
len = snprintf(SNPARGS(action2, 0), "Forward to %s",
inet_ntoa(dummyaddr));
if (sa->sa.sin_port)
snprintf(SNPARGS(action2, len), ":%d",
sa->sa.sin_port);
}
break;
case O_NETGRAPH:
snprintf(SNPARGS(action2, 0), "Netgraph %d",
cmd->arg1);
break;
case O_NGTEE:
snprintf(SNPARGS(action2, 0), "Ngtee %d",
cmd->arg1);
break;
case O_NAT:
action = "Nat";
break;
default:
action = "UNKNOWN";
break;
}
}
if (hlen == 0) { /* non-ip */
snprintf(SNPARGS(proto, 0), "MAC");
} else {
int len;
char src[48], dst[48];
struct icmphdr *icmp;
struct tcphdr *tcp;
struct udphdr *udp;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
struct icmp6_hdr *icmp6;
#endif
src[0] = '\0';
dst[0] = '\0';
#ifdef INET6
if (IS_IP6_FLOW_ID(&(args->f_id))) {
char ip6buf[INET6_ADDRSTRLEN];
snprintf(src, sizeof(src), "[%s]",
ip6_sprintf(ip6buf, &args->f_id.src_ip6));
snprintf(dst, sizeof(dst), "[%s]",
ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
ip6 = (struct ip6_hdr *)ip;
tcp = (struct tcphdr *)(((char *)ip) + hlen);
udp = (struct udphdr *)(((char *)ip) + hlen);
} else
#endif
{
tcp = L3HDR(struct tcphdr, ip);
udp = L3HDR(struct udphdr, ip);
inet_ntoa_r(ip->ip_src, src);
inet_ntoa_r(ip->ip_dst, dst);
}
switch (args->f_id.proto) {
case IPPROTO_TCP:
len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
if (offset == 0)
snprintf(SNPARGS(proto, len), ":%d %s:%d",
ntohs(tcp->th_sport),
dst,
ntohs(tcp->th_dport));
else
snprintf(SNPARGS(proto, len), " %s", dst);
break;
case IPPROTO_UDP:
len = snprintf(SNPARGS(proto, 0), "UDP %s", src);
if (offset == 0)
snprintf(SNPARGS(proto, len), ":%d %s:%d",
ntohs(udp->uh_sport),
dst,
ntohs(udp->uh_dport));
else
snprintf(SNPARGS(proto, len), " %s", dst);
break;
case IPPROTO_ICMP:
icmp = L3HDR(struct icmphdr, ip);
if (offset == 0)
len = snprintf(SNPARGS(proto, 0),
"ICMP:%u.%u ",
icmp->icmp_type, icmp->icmp_code);
else
len = snprintf(SNPARGS(proto, 0), "ICMP ");
len += snprintf(SNPARGS(proto, len), "%s", src);
snprintf(SNPARGS(proto, len), " %s", dst);
break;
#ifdef INET6
case IPPROTO_ICMPV6:
icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
if (offset == 0)
len = snprintf(SNPARGS(proto, 0),
"ICMPv6:%u.%u ",
icmp6->icmp6_type, icmp6->icmp6_code);
else
len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
len += snprintf(SNPARGS(proto, len), "%s", src);
snprintf(SNPARGS(proto, len), " %s", dst);
break;
#endif
default:
len = snprintf(SNPARGS(proto, 0), "P:%d %s",
args->f_id.proto, src);
snprintf(SNPARGS(proto, len), " %s", dst);
break;
}
#ifdef INET6
if (IS_IP6_FLOW_ID(&(args->f_id))) {
if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG))
snprintf(SNPARGS(fragment, 0),
" (frag %08x:%d@%d%s)",
args->f_id.frag_id6,
ntohs(ip6->ip6_plen) - hlen,
ntohs(offset & IP6F_OFF_MASK) << 3,
(offset & IP6F_MORE_FRAG) ? "+" : "");
} else
#endif
{
int ip_off, ip_len;
if (eh != NULL) { /* layer 2 packets are as on the wire */
ip_off = ntohs(ip->ip_off);
ip_len = ntohs(ip->ip_len);
} else {
ip_off = ip->ip_off;
ip_len = ip->ip_len;
}
if (ip_off & (IP_MF | IP_OFFMASK))
snprintf(SNPARGS(fragment, 0),
" (frag %d:%d@%d%s)",
ntohs(ip->ip_id), ip_len - (ip->ip_hl << 2),
offset << 3,
(ip_off & IP_MF) ? "+" : "");
}
}
if (oif || m->m_pkthdr.rcvif)
log(LOG_SECURITY | LOG_INFO,
"ipfw: %d %s %s %s via %s%s\n",
f ? f->rulenum : -1,
action, proto, oif ? "out" : "in",
oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname,
fragment);
else
log(LOG_SECURITY | LOG_INFO,
"ipfw: %d %s %s [no if info]%s\n",
f ? f->rulenum : -1,
action, proto, fragment);
if (limit_reached)
log(LOG_SECURITY | LOG_NOTICE,
"ipfw: limit %d reached on entry %d\n",
limit_reached, f ? f->rulenum : -1);
}
/*
* IMPORTANT: the hash function for dynamic rules must be commutative
* in source and destination (ip,port), because rules are bidirectional
* and we want to find both in the same bucket.
*/
static __inline int
hash_packet(struct ipfw_flow_id *id)
{
+ INIT_VNET_IPFW(curvnet);
u_int32_t i;
#ifdef INET6
if (IS_IP6_FLOW_ID(id))
i = hash_packet6(id);
else
#endif /* INET6 */
i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
i &= (V_curr_dyn_buckets - 1);
return i;
}
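/*
* Note: V_curr_dyn_buckets is kept a power of 2 by
* realloc_dynamic_table() below, so the "& (V_curr_dyn_buckets - 1)"
* above is a cheap modulo that selects the bucket.
*/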
/**
* unlink a dynamic rule from a chain. prev is a pointer to
* the previous one, q is a pointer to the rule to delete,
* head is a pointer to the head of the queue.
* Modifies q and potentially also head.
*/
#define UNLINK_DYN_RULE(prev, head, q) { \
ipfw_dyn_rule *old_q = q; \
\
/* remove a refcount to the parent */ \
if (q->dyn_type == O_LIMIT) \
q->parent->count--; \
DEB(printf("ipfw: unlink entry 0x%08x %d -> 0x%08x %d, %d left\n",\
(q->id.src_ip), (q->id.src_port), \
(q->id.dst_ip), (q->id.dst_port), V_dyn_count-1 ); ) \
if (prev != NULL) \
prev->next = q = q->next; \
else \
head = q = q->next; \
V_dyn_count--; \
uma_zfree(ipfw_dyn_rule_zone, old_q); }
#define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0)
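/*
* TIME_LEQ() compares times in wraparound-safe (modular) arithmetic:
* e.g. with a = 0xfffffffe and b = 2, (int)(a - b) is a small
* negative number, so 'a' is correctly treated as not later than 'b'
* even though the counter has wrapped between them.
*/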
/**
* Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
*
* If keep_me == NULL, rules are deleted even if not expired,
* otherwise only expired rules are removed.
*
* The value of the second parameter is also used to identify
* a rule we absolutely do not want to remove (e.g. because we are
* holding a reference to it -- this is the case with O_LIMIT_PARENT
* rules). The pointer is only used for comparison, so any non-null
* value will do.
*/
static void
remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
{
+ INIT_VNET_IPFW(curvnet);
static u_int32_t last_remove = 0;
#define FORCE (keep_me == NULL)
ipfw_dyn_rule *prev, *q;
int i, pass = 0, max_pass = 0;
IPFW_DYN_LOCK_ASSERT();
if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
return;
/* do not expire more than once per second, it is useless */
if (!FORCE && last_remove == time_uptime)
return;
last_remove = time_uptime;
/*
* Because O_LIMIT rules refer to parent rules, during the first pass
* we only remove child rules and mark any pending O_LIMIT_PARENT
* rules, which are then removed in a second pass.
*/
next_pass:
for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
/*
* Logic can become complex here, so we split tests.
*/
if (q == keep_me)
goto next;
if (rule != NULL && rule != q->rule)
goto next; /* not the one we are looking for */
if (q->dyn_type == O_LIMIT_PARENT) {
/*
* handle parent in the second pass,
* record we need one.
*/
max_pass = 1;
if (pass == 0)
goto next;
if (FORCE && q->count != 0 ) {
/* XXX should not happen! */
printf("ipfw: OUCH! cannot remove rule,"
" count %d\n", q->count);
}
} else {
if (!FORCE &&
!TIME_LEQ( q->expire, time_uptime ))
goto next;
}
if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
continue;
}
next:
prev=q;
q=q->next;
}
}
if (pass++ < max_pass)
goto next_pass;
}
/**
* lookup a dynamic rule.
*/
static ipfw_dyn_rule *
lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
struct tcphdr *tcp)
{
+ INIT_VNET_IPFW(curvnet);
/*
* stateful ipfw extensions.
* Lookup into dynamic session queue
*/
#define MATCH_REVERSE 0
#define MATCH_FORWARD 1
#define MATCH_NONE 2
#define MATCH_UNKNOWN 3
int i, dir = MATCH_NONE;
ipfw_dyn_rule *prev, *q=NULL;
IPFW_DYN_LOCK_ASSERT();
if (V_ipfw_dyn_v == NULL)
goto done; /* not found */
i = hash_packet( pkt );
for (prev=NULL, q = V_ipfw_dyn_v[i] ; q != NULL ; ) {
if (q->dyn_type == O_LIMIT_PARENT && q->count)
goto next;
if (TIME_LEQ( q->expire, time_uptime)) { /* expire entry */
UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
continue;
}
if (pkt->proto == q->id.proto &&
q->dyn_type != O_LIMIT_PARENT) {
if (IS_IP6_FLOW_ID(pkt)) {
if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
&(q->id.src_ip6)) &&
IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
&(q->id.dst_ip6)) &&
pkt->src_port == q->id.src_port &&
pkt->dst_port == q->id.dst_port ) {
dir = MATCH_FORWARD;
break;
}
if (IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
&(q->id.dst_ip6)) &&
IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
&(q->id.src_ip6)) &&
pkt->src_port == q->id.dst_port &&
pkt->dst_port == q->id.src_port ) {
dir = MATCH_REVERSE;
break;
}
} else {
if (pkt->src_ip == q->id.src_ip &&
pkt->dst_ip == q->id.dst_ip &&
pkt->src_port == q->id.src_port &&
pkt->dst_port == q->id.dst_port ) {
dir = MATCH_FORWARD;
break;
}
if (pkt->src_ip == q->id.dst_ip &&
pkt->dst_ip == q->id.src_ip &&
pkt->src_port == q->id.dst_port &&
pkt->dst_port == q->id.src_port ) {
dir = MATCH_REVERSE;
break;
}
}
}
next:
prev = q;
q = q->next;
}
if (q == NULL)
goto done; /* q = NULL, not found */
if ( prev != NULL) { /* found and not in front */
prev->next = q->next;
q->next = V_ipfw_dyn_v[i];
V_ipfw_dyn_v[i] = q;
}
if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
u_char flags = pkt->flags & (TH_FIN|TH_SYN|TH_RST);
#define BOTH_SYN (TH_SYN | (TH_SYN << 8))
#define BOTH_FIN (TH_FIN | (TH_FIN << 8))
q->state |= (dir == MATCH_FORWARD ) ? flags : (flags << 8);
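/*
* q->state keeps forward-direction TCP flags in its low byte and
* reverse-direction flags in the next byte, so BOTH_SYN/BOTH_FIN
* test that a flag has been seen in both directions.
*/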
switch (q->state) {
case TH_SYN: /* opening */
q->expire = time_uptime + V_dyn_syn_lifetime;
break;
case BOTH_SYN: /* move to established */
case BOTH_SYN | TH_FIN : /* one side tries to close */
case BOTH_SYN | (TH_FIN << 8) :
if (tcp) {
#define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0)
u_int32_t ack = ntohl(tcp->th_ack);
if (dir == MATCH_FORWARD) {
if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd))
q->ack_fwd = ack;
else { /* ignore out-of-sequence */
break;
}
} else {
if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev))
q->ack_rev = ack;
else { /* ignore out-of-sequence */
break;
}
}
}
q->expire = time_uptime + V_dyn_ack_lifetime;
break;
case BOTH_SYN | BOTH_FIN: /* both sides closed */
if (V_dyn_fin_lifetime >= V_dyn_keepalive_period)
V_dyn_fin_lifetime = V_dyn_keepalive_period - 1;
q->expire = time_uptime + V_dyn_fin_lifetime;
break;
default:
#if 0
/*
* reset or some invalid combination, but can also
* occur if we use keep-state the wrong way.
*/
if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0)
printf("invalid state: 0x%x\n", q->state);
#endif
if (V_dyn_rst_lifetime >= V_dyn_keepalive_period)
V_dyn_rst_lifetime = V_dyn_keepalive_period - 1;
q->expire = time_uptime + V_dyn_rst_lifetime;
break;
}
} else if (pkt->proto == IPPROTO_UDP) {
q->expire = time_uptime + V_dyn_udp_lifetime;
} else {
/* other protocols */
q->expire = time_uptime + V_dyn_short_lifetime;
}
done:
if (match_direction)
*match_direction = dir;
return q;
}
static ipfw_dyn_rule *
lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction,
struct tcphdr *tcp)
{
ipfw_dyn_rule *q;
IPFW_DYN_LOCK();
q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
if (q == NULL)
IPFW_DYN_UNLOCK();
/* NB: return table locked when q is not NULL */
return q;
}
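/*
* Note the asymmetric locking above: on a hit the dynamic-rule lock
* is returned held so the caller can safely dereference q, and the
* caller must IPFW_DYN_UNLOCK() when done (see the O_PROBE_STATE /
* O_CHECK_STATE handling in ipfw_chk() below).
*/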
static void
realloc_dynamic_table(void)
{
+ INIT_VNET_IPFW(curvnet);
IPFW_DYN_LOCK_ASSERT();
/*
* Try reallocation, make sure we have a power of 2 and do
* not allow more than 64k entries. In case of overflow,
* default to 1024.
*/
if (V_dyn_buckets > 65536)
V_dyn_buckets = 1024;
if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
V_dyn_buckets = V_curr_dyn_buckets; /* reset */
return;
}
V_curr_dyn_buckets = V_dyn_buckets;
if (V_ipfw_dyn_v != NULL)
free(V_ipfw_dyn_v, M_IPFW);
for (;;) {
V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
M_IPFW, M_NOWAIT | M_ZERO);
if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
break;
V_curr_dyn_buckets /= 2;
}
}
/**
* Install state of type 'type' for a dynamic session.
* The hash table contains three types of rules:
* - regular rules (O_KEEP_STATE)
* - rules for sessions with a limited number of sessions per user
* (O_LIMIT). When they are created, the parent's count is
* increased by 1, and decreased on delete. In this case,
* the third parameter is the parent rule and not the chain.
* - "parent" rules for the above (O_LIMIT_PARENT).
*/
static ipfw_dyn_rule *
add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
{
+ INIT_VNET_IPFW(curvnet);
ipfw_dyn_rule *r;
int i;
IPFW_DYN_LOCK_ASSERT();
if (V_ipfw_dyn_v == NULL ||
(V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
realloc_dynamic_table();
if (V_ipfw_dyn_v == NULL)
return NULL; /* failed ! */
}
i = hash_packet(id);
r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
if (r == NULL) {
printf ("ipfw: sorry cannot allocate state\n");
return NULL;
}
/* increase refcount on parent, and set pointer */
if (dyn_type == O_LIMIT) {
ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
if ( parent->dyn_type != O_LIMIT_PARENT)
panic("invalid parent");
parent->count++;
r->parent = parent;
rule = parent->rule;
}
r->id = *id;
r->expire = time_uptime + V_dyn_syn_lifetime;
r->rule = rule;
r->dyn_type = dyn_type;
r->pcnt = r->bcnt = 0;
r->count = 0;
r->bucket = i;
r->next = V_ipfw_dyn_v[i];
V_ipfw_dyn_v[i] = r;
V_dyn_count++;
DEB(printf("ipfw: add dyn entry ty %d 0x%08x %d -> 0x%08x %d, total %d\n",
dyn_type,
(r->id.src_ip), (r->id.src_port),
(r->id.dst_ip), (r->id.dst_port),
V_dyn_count ); )
return r;
}
/**
* lookup dynamic parent rule using pkt and rule as search keys.
* If the lookup fails, then install one.
*/
static ipfw_dyn_rule *
lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
{
+ INIT_VNET_IPFW(curvnet);
ipfw_dyn_rule *q;
int i;
IPFW_DYN_LOCK_ASSERT();
if (V_ipfw_dyn_v) {
int is_v6 = IS_IP6_FLOW_ID(pkt);
i = hash_packet( pkt );
for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next)
if (q->dyn_type == O_LIMIT_PARENT &&
rule== q->rule &&
pkt->proto == q->id.proto &&
pkt->src_port == q->id.src_port &&
pkt->dst_port == q->id.dst_port &&
(
(is_v6 &&
IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
&(q->id.src_ip6)) &&
IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
&(q->id.dst_ip6))) ||
(!is_v6 &&
pkt->src_ip == q->id.src_ip &&
pkt->dst_ip == q->id.dst_ip)
)
) {
q->expire = time_uptime + V_dyn_short_lifetime;
DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);)
return q;
}
}
return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
}
/**
* Install dynamic state for rule type cmd->o.opcode
*
* Returns 1 (failure) if state is not installed because of errors or because
* session limitations are enforced.
*/
static int
install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
struct ip_fw_args *args, uint32_t tablearg)
{
+ INIT_VNET_IPFW(curvnet);
static int last_log;
ipfw_dyn_rule *q;
struct in_addr da;
char src[48], dst[48];
src[0] = '\0';
dst[0] = '\0';
DEB(
printf("ipfw: %s: type %d 0x%08x %u -> 0x%08x %u\n",
__func__, cmd->o.opcode,
(args->f_id.src_ip), (args->f_id.src_port),
(args->f_id.dst_ip), (args->f_id.dst_port));
)
IPFW_DYN_LOCK();
q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
if (q != NULL) { /* should never occur */
if (last_log != time_uptime) {
last_log = time_uptime;
printf("ipfw: %s: entry already present, done\n",
__func__);
}
IPFW_DYN_UNLOCK();
return (0);
}
if (V_dyn_count >= V_dyn_max)
/* Run out of slots, try to remove any expired rule. */
remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
if (V_dyn_count >= V_dyn_max) {
if (last_log != time_uptime) {
last_log = time_uptime;
printf("ipfw: %s: Too many dynamic rules\n", __func__);
}
IPFW_DYN_UNLOCK();
return (1); /* cannot install, notify caller */
}
switch (cmd->o.opcode) {
case O_KEEP_STATE: /* bidir rule */
add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
break;
case O_LIMIT: { /* limit number of sessions */
struct ipfw_flow_id id;
ipfw_dyn_rule *parent;
uint32_t conn_limit;
uint16_t limit_mask = cmd->limit_mask;
conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
tablearg : cmd->conn_limit;
DEB(
if (cmd->conn_limit == IP_FW_TABLEARG)
printf("ipfw: %s: O_LIMIT rule, conn_limit: %u "
"(tablearg)\n", __func__, conn_limit);
else
printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n",
__func__, conn_limit);
)
id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0;
id.proto = args->f_id.proto;
id.addr_type = args->f_id.addr_type;
id.fib = M_GETFIB(args->m);
if (IS_IP6_FLOW_ID (&(args->f_id))) {
if (limit_mask & DYN_SRC_ADDR)
id.src_ip6 = args->f_id.src_ip6;
if (limit_mask & DYN_DST_ADDR)
id.dst_ip6 = args->f_id.dst_ip6;
} else {
if (limit_mask & DYN_SRC_ADDR)
id.src_ip = args->f_id.src_ip;
if (limit_mask & DYN_DST_ADDR)
id.dst_ip = args->f_id.dst_ip;
}
if (limit_mask & DYN_SRC_PORT)
id.src_port = args->f_id.src_port;
if (limit_mask & DYN_DST_PORT)
id.dst_port = args->f_id.dst_port;
if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
printf("ipfw: %s: add parent failed\n", __func__);
IPFW_DYN_UNLOCK();
return (1);
}
if (parent->count >= conn_limit) {
/* See if we can remove some expired rule. */
remove_dyn_rule(rule, parent);
if (parent->count >= conn_limit) {
if (V_fw_verbose && last_log != time_uptime) {
last_log = time_uptime;
#ifdef INET6
/*
* XXX IPv6 flows are not
* supported yet.
*/
if (IS_IP6_FLOW_ID(&(args->f_id))) {
char ip6buf[INET6_ADDRSTRLEN];
snprintf(src, sizeof(src),
"[%s]", ip6_sprintf(ip6buf,
&args->f_id.src_ip6));
snprintf(dst, sizeof(dst),
"[%s]", ip6_sprintf(ip6buf,
&args->f_id.dst_ip6));
} else
#endif
{
da.s_addr =
htonl(args->f_id.src_ip);
inet_ntoa_r(da, src);
da.s_addr =
htonl(args->f_id.dst_ip);
inet_ntoa_r(da, dst);
}
log(LOG_SECURITY | LOG_DEBUG,
"ipfw: %d %s %s:%u -> %s:%u, %s\n",
parent->rule->rulenum,
"drop session",
src, (args->f_id.src_port),
dst, (args->f_id.dst_port),
"too many entries");
}
IPFW_DYN_UNLOCK();
return (1);
}
}
add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
break;
}
default:
printf("ipfw: %s: unknown dynamic rule type %u\n",
__func__, cmd->o.opcode);
IPFW_DYN_UNLOCK();
return (1);
}
/* XXX just set lifetime */
lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
IPFW_DYN_UNLOCK();
return (0);
}
/*
* Generate a TCP packet, containing either a RST or a keepalive.
* When flags & TH_RST, we are sending a RST packet, because a
* "reset" action matched the packet.
* Otherwise we are sending a keepalive, and flags & TH_SYN selects
* the direction (forward if set, reverse if clear).
* The 'replyto' mbuf is the mbuf being replied to, if any, and is required
* so that MAC can label the reply appropriately.
*/
static struct mbuf *
send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
u_int32_t ack, int flags)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *m;
struct ip *ip;
struct tcphdr *tcp;
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == 0)
return (NULL);
m->m_pkthdr.rcvif = (struct ifnet *)0;
M_SETFIB(m, id->fib);
#ifdef MAC
if (replyto != NULL)
mac_netinet_firewall_reply(replyto, m);
else
mac_netinet_firewall_send(m);
#else
(void)replyto; /* don't warn about unused arg */
#endif
m->m_pkthdr.len = m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
m->m_data += max_linkhdr;
ip = mtod(m, struct ip *);
bzero(ip, m->m_len);
tcp = (struct tcphdr *)(ip + 1); /* no IP options */
ip->ip_p = IPPROTO_TCP;
tcp->th_off = 5;
/*
* Assume we are sending a RST (or a keepalive in the reverse
* direction), so swap source and destination addresses and ports.
*/
ip->ip_src.s_addr = htonl(id->dst_ip);
ip->ip_dst.s_addr = htonl(id->src_ip);
tcp->th_sport = htons(id->dst_port);
tcp->th_dport = htons(id->src_port);
if (flags & TH_RST) { /* we are sending a RST */
if (flags & TH_ACK) {
tcp->th_seq = htonl(ack);
tcp->th_ack = htonl(0);
tcp->th_flags = TH_RST;
} else {
if (flags & TH_SYN)
seq++;
tcp->th_seq = htonl(0);
tcp->th_ack = htonl(seq);
tcp->th_flags = TH_RST | TH_ACK;
}
} else {
/*
* We are sending a keepalive. flags & TH_SYN determines
* the direction, forward if set, reverse if clear.
* NOTE: seq and ack are always assumed to be correct
* as set by the caller. This may be confusing...
*/
if (flags & TH_SYN) {
/*
* we have to rewrite the correct addresses!
*/
ip->ip_dst.s_addr = htonl(id->dst_ip);
ip->ip_src.s_addr = htonl(id->src_ip);
tcp->th_dport = htons(id->dst_port);
tcp->th_sport = htons(id->src_port);
}
tcp->th_seq = htonl(seq);
tcp->th_ack = htonl(ack);
tcp->th_flags = TH_ACK;
}
/*
* set ip_len to the payload size so we can compute
* the tcp checksum on the pseudoheader
* XXX check this, could save a couple of words ?
*/
ip->ip_len = htons(sizeof(struct tcphdr));
tcp->th_sum = in_cksum(m, m->m_pkthdr.len);
/*
* now fill fields left out earlier
*/
ip->ip_ttl = V_ip_defttl;
ip->ip_len = m->m_pkthdr.len;
m->m_flags |= M_SKIP_FIREWALL;
return (m);
}
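/*
* Note: packets built by send_pkt() are marked M_SKIP_FIREWALL, so
* the injected RST or keepalive is not filtered again by ipfw_chk()
* when it is later handed to ip_output() (as in send_reject() below).
*/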
/*
* sends a reject message, consuming the mbuf passed as an argument.
*/
static void
send_reject(struct ip_fw_args *args, int code, int ip_len, struct ip *ip)
{
#if 0
/* XXX When ip is not guaranteed to be at mtod() we will
* need to account for this.
* The mbuf will however be thrown away so we can adjust it.
* Remember we did an m_pullup on it already so we
* can make some assumptions about contiguousness.
*/
if (args->L3offset)
m_adj(m, args->L3offset);
#endif
if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
/* We need the IP header in host order for icmp_error(). */
if (args->eh != NULL) {
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
}
icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
} else if (args->f_id.proto == IPPROTO_TCP) {
struct tcphdr *const tcp =
L3HDR(struct tcphdr, mtod(args->m, struct ip *));
if ( (tcp->th_flags & TH_RST) == 0) {
struct mbuf *m;
m = send_pkt(args->m, &(args->f_id),
ntohl(tcp->th_seq), ntohl(tcp->th_ack),
tcp->th_flags | TH_RST);
if (m != NULL)
ip_output(m, NULL, NULL, 0, NULL, NULL);
}
m_freem(args->m);
} else
m_freem(args->m);
args->m = NULL;
}
/**
*
* Given an ip_fw *, lookup_next_rule will return a pointer
* to the next rule, which can be either the jump
* target (for skipto instructions) or the next one in the list (in
* all other cases including a missing jump target).
* The result is also written in the "next_rule" field of the rule.
* Backward jumps are not allowed, so start looking from the next
* rule...
*
* This never returns NULL -- in case we do not have an exact match,
* the next rule is returned. When the ruleset is changed,
* pointers are flushed so we are always correct.
*/
static struct ip_fw *
lookup_next_rule(struct ip_fw *me, u_int32_t tablearg)
{
struct ip_fw *rule = NULL;
ipfw_insn *cmd;
u_int16_t rulenum;
/* look for action, in case it is a skipto */
cmd = ACTION_PTR(me);
if (cmd->opcode == O_LOG)
cmd += F_LEN(cmd);
if (cmd->opcode == O_ALTQ)
cmd += F_LEN(cmd);
if (cmd->opcode == O_TAG)
cmd += F_LEN(cmd);
if (cmd->opcode == O_SKIPTO ) {
if (tablearg != 0) {
rulenum = (u_int16_t)tablearg;
} else {
rulenum = cmd->arg1;
}
for (rule = me->next; rule ; rule = rule->next) {
if (rule->rulenum >= rulenum) {
break;
}
}
}
if (rule == NULL) /* failure or not a skipto */
rule = me->next;
me->next_rule = rule;
return rule;
}
static int
add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint8_t mlen, uint32_t value)
{
+ INIT_VNET_IPFW(curvnet);
struct radix_node_head *rnh;
struct table_entry *ent;
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
rnh = ch->tables[tbl];
ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
if (ent == NULL)
return (ENOMEM);
ent->value = value;
ent->addr.sin_len = ent->mask.sin_len = 8;
ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
IPFW_WLOCK(&V_layer3_chain);
if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) ==
NULL) {
IPFW_WUNLOCK(&V_layer3_chain);
free(ent, M_IPFW_TBL);
return (EEXIST);
}
IPFW_WUNLOCK(&V_layer3_chain);
return (0);
}
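/*
* Worked example: for mlen = 24 the expression above yields
* htonl(~((1 << 8) - 1)) = htonl(0xffffff00), and ANDing the address
* with the mask canonicalizes it, so e.g. 192.0.2.77/24 is stored as
* 192.0.2.0/24 in the radix tree.
*/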
static int
del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint8_t mlen)
{
struct radix_node_head *rnh;
struct table_entry *ent;
struct sockaddr_in sa, mask;
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
rnh = ch->tables[tbl];
sa.sin_len = mask.sin_len = 8;
mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
IPFW_WLOCK(ch);
ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
if (ent == NULL) {
IPFW_WUNLOCK(ch);
return (ESRCH);
}
IPFW_WUNLOCK(ch);
free(ent, M_IPFW_TBL);
return (0);
}
static int
flush_table_entry(struct radix_node *rn, void *arg)
{
struct radix_node_head * const rnh = arg;
struct table_entry *ent;
ent = (struct table_entry *)
rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
if (ent != NULL)
free(ent, M_IPFW_TBL);
return (0);
}
static int
flush_table(struct ip_fw_chain *ch, uint16_t tbl)
{
struct radix_node_head *rnh;
IPFW_WLOCK_ASSERT(ch);
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
rnh = ch->tables[tbl];
KASSERT(rnh != NULL, ("NULL IPFW table"));
rnh->rnh_walktree(rnh, flush_table_entry, rnh);
return (0);
}
static void
flush_tables(struct ip_fw_chain *ch)
{
uint16_t tbl;
IPFW_WLOCK_ASSERT(ch);
for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
flush_table(ch, tbl);
}
static int
init_tables(struct ip_fw_chain *ch)
{
int i;
uint16_t j;
for (i = 0; i < IPFW_TABLES_MAX; i++) {
if (!rn_inithead((void **)&ch->tables[i], 32)) {
for (j = 0; j < i; j++) {
(void) flush_table(ch, j);
}
return (ENOMEM);
}
}
return (0);
}
static int
lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
uint32_t *val)
{
struct radix_node_head *rnh;
struct table_entry *ent;
struct sockaddr_in sa;
if (tbl >= IPFW_TABLES_MAX)
return (0);
rnh = ch->tables[tbl];
sa.sin_len = 8;
sa.sin_addr.s_addr = addr;
ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
if (ent != NULL) {
*val = ent->value;
return (1);
}
return (0);
}
static int
count_table_entry(struct radix_node *rn, void *arg)
{
u_int32_t * const cnt = arg;
(*cnt)++;
return (0);
}
static int
count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
{
struct radix_node_head *rnh;
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
rnh = ch->tables[tbl];
*cnt = 0;
rnh->rnh_walktree(rnh, count_table_entry, cnt);
return (0);
}
static int
dump_table_entry(struct radix_node *rn, void *arg)
{
struct table_entry * const n = (struct table_entry *)rn;
ipfw_table * const tbl = arg;
ipfw_table_entry *ent;
if (tbl->cnt == tbl->size)
return (1);
ent = &tbl->ent[tbl->cnt];
ent->tbl = tbl->tbl;
if (in_nullhost(n->mask.sin_addr))
ent->masklen = 0;
else
ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr));
ent->addr = n->addr.sin_addr.s_addr;
ent->value = n->value;
tbl->cnt++;
return (0);
}
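/*
* Worked example: a /24 entry has mask 0xffffff00 in host order, so
* ffs() returns 9 (the lowest set bit) and 33 - 9 recovers
* masklen = 24; an all-zero mask is reported as masklen 0 via the
* in_nullhost() test above.
*/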
static int
dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
{
struct radix_node_head *rnh;
if (tbl->tbl >= IPFW_TABLES_MAX)
return (EINVAL);
rnh = ch->tables[tbl->tbl];
tbl->cnt = 0;
rnh->rnh_walktree(rnh, dump_table_entry, tbl);
return (0);
}
static void
fill_ugid_cache(struct inpcb *inp, struct ip_fw_ugid *ugp)
{
struct ucred *cr;
if (inp->inp_socket != NULL) {
cr = inp->inp_socket->so_cred;
ugp->fw_prid = jailed(cr) ?
cr->cr_prison->pr_id : -1;
ugp->fw_uid = cr->cr_uid;
ugp->fw_ngroups = cr->cr_ngroups;
bcopy(cr->cr_groups, ugp->fw_groups,
sizeof(ugp->fw_groups));
}
}
static int
check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif,
struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip,
u_int16_t src_port, struct ip_fw_ugid *ugp, int *ugid_lookupp,
struct inpcb *inp)
{
+ INIT_VNET_INET(curvnet);
struct inpcbinfo *pi;
int wildcard;
struct inpcb *pcb;
int match;
gid_t *gp;
/*
* Check to see if the UDP or TCP stack supplied us with
* the PCB. If so, rather than holding a lock and looking
* up the PCB, we can use the one that was supplied.
*/
if (inp && *ugid_lookupp == 0) {
INP_LOCK_ASSERT(inp);
if (inp->inp_socket != NULL) {
fill_ugid_cache(inp, ugp);
*ugid_lookupp = 1;
} else
*ugid_lookupp = -1;
}
/*
* If we have already been here and the packet has no
* PCB entry associated with it, then we can safely
* assume that this is not a match.
*/
if (*ugid_lookupp == -1)
return (0);
if (proto == IPPROTO_TCP) {
wildcard = 0;
pi = &V_tcbinfo;
} else if (proto == IPPROTO_UDP) {
wildcard = INPLOOKUP_WILDCARD;
pi = &V_udbinfo;
} else
return 0;
match = 0;
if (*ugid_lookupp == 0) {
INP_INFO_RLOCK(pi);
pcb = (oif) ?
in_pcblookup_hash(pi,
dst_ip, htons(dst_port),
src_ip, htons(src_port),
wildcard, oif) :
in_pcblookup_hash(pi,
src_ip, htons(src_port),
dst_ip, htons(dst_port),
wildcard, NULL);
if (pcb != NULL) {
INP_RLOCK(pcb);
if (pcb->inp_socket != NULL) {
fill_ugid_cache(pcb, ugp);
*ugid_lookupp = 1;
}
INP_RUNLOCK(pcb);
}
INP_INFO_RUNLOCK(pi);
if (*ugid_lookupp == 0) {
/*
* If the lookup did not yield any results, there
* is no sense in coming back and trying again. So
* we can set lookup to -1 and ensure that we won't
* bother the pcb system again.
*/
*ugid_lookupp = -1;
return (0);
}
}
if (insn->o.opcode == O_UID)
match = (ugp->fw_uid == (uid_t)insn->d[0]);
else if (insn->o.opcode == O_GID) {
for (gp = ugp->fw_groups;
gp < &ugp->fw_groups[ugp->fw_ngroups]; gp++)
if (*gp == (gid_t)insn->d[0]) {
match = 1;
break;
}
} else if (insn->o.opcode == O_JAIL)
match = (ugp->fw_prid == (int)insn->d[0]);
return match;
}
/*
* The main check routine for the firewall.
*
* All arguments are in args so we can modify them and return them
* back to the caller.
*
* Parameters:
*
* args->m (in/out) The packet; we set to NULL when/if we nuke it.
* Starts with the IP header.
* args->eh (in) Mac header if present, or NULL for layer3 packet.
* args->L3offset Number of bytes bypassed if we came from L2.
* e.g. often sizeof(eh) ** NOTYET **
* args->oif Outgoing interface, or NULL if packet is incoming.
* The incoming interface is in the mbuf. (in)
* args->divert_rule (in/out)
* Skip up to the first rule past this rule number;
* upon return, non-zero port number for divert or tee.
*
* args->rule Pointer to the last matching rule (in/out)
* args->next_hop Socket we are forwarding to (out).
* args->f_id Addresses grabbed from the packet (out)
* args->cookie a cookie depending on rule action
*
* Return value:
*
* IP_FW_PASS the packet must be accepted
* IP_FW_DENY the packet must be dropped
* IP_FW_DIVERT divert packet, port in m_tag
* IP_FW_TEE tee packet, port in m_tag
* IP_FW_DUMMYNET to dummynet, pipe in args->cookie
* IP_FW_NETGRAPH into netgraph, cookie args->cookie
*
*/
int
ipfw_chk(struct ip_fw_args *args)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_IPFW(curvnet);
+
/*
* Local variables holding state during the processing of a packet:
*
* IMPORTANT NOTE: to speed up the processing of rules, there
* are some assumptions about the values of the variables, which
* are documented here. Should you change them, please check
* the implementation of the various instructions to make sure
* that they still work.
*
* args->eh The MAC header. It is non-NULL for a layer-2
* packet and NULL for a layer-3 packet.
* **notyet**
* args->L3offset Offset in the packet to the L3 (IP or equiv.) header.
*
* m | args->m Pointer to the mbuf, as received from the caller.
* It may change if ipfw_chk() does an m_pullup, or if it
* consumes the packet because it calls send_reject().
* XXX This has to change, so that ipfw_chk() never modifies
* or consumes the buffer.
* ip is the beginning of the ip(4 or 6) header.
* Calculated by adding the L3offset to the start of data.
* (Until we start using L3offset, the packet is
* supposed to start with the ip header).
*/
struct mbuf *m = args->m;
struct ip *ip = mtod(m, struct ip *);
/*
* For rules which contain uid/gid or jail constraints, cache
* a copy of the users credentials after the pcb lookup has been
* executed. This will speed up the processing of rules with
* these types of constraints, as well as decrease contention
* on pcb related locks.
*/
struct ip_fw_ugid fw_ugid_cache;
int ugid_lookup = 0;
/*
* divinput_flags If non-zero, set to the IP_FW_DIVERT_*_FLAG
* associated with a packet input on a divert socket. This
* allows us to distinguish traffic and its direction when
* it originates from a divert socket.
*/
u_int divinput_flags = 0;
/*
* oif | args->oif If NULL, ipfw_chk has been called on the
* inbound path (ether_input, ip_input).
* If non-NULL, ipfw_chk has been called on the outbound path
* (ether_output, ip_output).
*/
struct ifnet *oif = args->oif;
struct ip_fw *f = NULL; /* matching rule */
int retval = 0;
/*
* hlen The length of the IP header.
*/
u_int hlen = 0; /* hlen >0 means we have an IP pkt */
/*
* offset The offset of a fragment. offset != 0 means that
* we have a fragment at this offset of an IPv4 packet.
* offset == 0 means that (if this is an IPv4 packet)
* this is the first or only fragment.
* For IPv6, offset == 0 means there is no Fragment Header.
* If offset != 0 for IPv6, always apply IP6F_OFF_MASK to
* get the real offset, because we OR in IP6F_MORE_FRAG to
* be able to detect the first fragment, which would
* otherwise have offset = 0.
*/
u_short offset = 0;
/*
* Local copies of addresses. They are only valid if we have
* an IP packet.
*
* proto The protocol. Set to 0 for non-ip packets,
* or to the protocol read from the packet otherwise.
* proto != 0 means that we have an IP packet.
*
* src_port, dst_port port numbers, in HOST format. Only
* valid for TCP and UDP packets.
*
* src_ip, dst_ip ip addresses, in NETWORK format.
* Only valid for IPv4 packets.
*/
u_int8_t proto;
u_int16_t src_port = 0, dst_port = 0; /* NOTE: host format */
struct in_addr src_ip, dst_ip; /* NOTE: network format */
u_int16_t ip_len=0;
int pktlen;
u_int16_t etype = 0; /* Host order stored ether type */
/*
* dyn_dir = MATCH_UNKNOWN when rules unchecked,
* MATCH_NONE when checked and not matched (q = NULL),
* MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL)
*/
int dyn_dir = MATCH_UNKNOWN;
ipfw_dyn_rule *q = NULL;
struct ip_fw_chain *chain = &V_layer3_chain;
struct m_tag *mtag;
/*
* We store in ulp a pointer to the upper layer protocol header.
* In the ipv4 case this is easy to determine from the header,
* but for ipv6 we might have some additional headers in the middle.
* ulp is NULL if not found.
*/
void *ulp = NULL; /* upper layer protocol pointer. */
/* XXX ipv6 variables */
int is_ipv6 = 0;
u_int16_t ext_hd = 0; /* bits vector for extension header filtering */
/* end of ipv6 variables */
int is_ipv4 = 0;
if (m->m_flags & M_SKIP_FIREWALL)
return (IP_FW_PASS); /* accept */
pktlen = m->m_pkthdr.len;
args->f_id.fib = M_GETFIB(m); /* note: mbuf not altered */
proto = args->f_id.proto = 0; /* mark f_id invalid */
/* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */
/*
* PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
* then it sets p to point at the offset "len" in the mbuf. WARNING: the
* pointer might become stale after other pullups (but we never use it
* this way).
*/
#define PULLUP_TO(len, p, T) \
do { \
int x = (len) + sizeof(T); \
if ((m)->m_len < x) { \
args->m = m = m_pullup(m, x); \
if (m == NULL) \
goto pullup_failed; \
} \
p = (mtod(m, char *) + (len)); \
} while (0)
/*
* If we have an ether header, fetch the ether type.
*/
if (args->eh)
etype = ntohs(args->eh->ether_type);
/* Identify IP packets and fill up variables. */
if (pktlen >= sizeof(struct ip6_hdr) &&
(args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) {
struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
is_ipv6 = 1;
args->f_id.addr_type = 6;
hlen = sizeof(struct ip6_hdr);
proto = ip6->ip6_nxt;
/* Search extension headers to find upper layer protocols */
while (ulp == NULL) {
switch (proto) {
case IPPROTO_ICMPV6:
PULLUP_TO(hlen, ulp, struct icmp6_hdr);
args->f_id.flags = ICMP6(ulp)->icmp6_type;
break;
case IPPROTO_TCP:
PULLUP_TO(hlen, ulp, struct tcphdr);
dst_port = TCP(ulp)->th_dport;
src_port = TCP(ulp)->th_sport;
args->f_id.flags = TCP(ulp)->th_flags;
break;
case IPPROTO_SCTP:
PULLUP_TO(hlen, ulp, struct sctphdr);
src_port = SCTP(ulp)->src_port;
dst_port = SCTP(ulp)->dest_port;
break;
case IPPROTO_UDP:
PULLUP_TO(hlen, ulp, struct udphdr);
dst_port = UDP(ulp)->uh_dport;
src_port = UDP(ulp)->uh_sport;
break;
case IPPROTO_HOPOPTS: /* RFC 2460 */
PULLUP_TO(hlen, ulp, struct ip6_hbh);
ext_hd |= EXT_HOPOPTS;
hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
ulp = NULL;
break;
case IPPROTO_ROUTING: /* RFC 2460 */
PULLUP_TO(hlen, ulp, struct ip6_rthdr);
switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
case 0:
ext_hd |= EXT_RTHDR0;
break;
case 2:
ext_hd |= EXT_RTHDR2;
break;
default:
printf("IPFW2: IPV6 - Unknown Routing "
"Header type(%d)\n",
((struct ip6_rthdr *)ulp)->ip6r_type);
if (V_fw_deny_unknown_exthdrs)
return (IP_FW_DENY);
break;
}
ext_hd |= EXT_ROUTING;
hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
ulp = NULL;
break;
case IPPROTO_FRAGMENT: /* RFC 2460 */
PULLUP_TO(hlen, ulp, struct ip6_frag);
ext_hd |= EXT_FRAGMENT;
hlen += sizeof (struct ip6_frag);
proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
IP6F_OFF_MASK;
/* OR in IP6F_MORE_FRAG so that the offset of
* the first fragment is != 0. */
offset |= ((struct ip6_frag *)ulp)->ip6f_offlg &
IP6F_MORE_FRAG;
if (offset == 0) {
printf("IPFW2: IPV6 - Invalid Fragment "
"Header\n");
if (V_fw_deny_unknown_exthdrs)
return (IP_FW_DENY);
break;
}
args->f_id.frag_id6 =
ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
ulp = NULL;
break;
case IPPROTO_DSTOPTS: /* RFC 2460 */
PULLUP_TO(hlen, ulp, struct ip6_hbh);
ext_hd |= EXT_DSTOPTS;
hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
ulp = NULL;
break;
case IPPROTO_AH: /* RFC 2402 */
PULLUP_TO(hlen, ulp, struct ip6_ext);
ext_hd |= EXT_AH;
hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
ulp = NULL;
break;
case IPPROTO_ESP: /* RFC 2406 */
PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */
/* Anything past Seq# is variable length and
* data past this ext. header is encrypted. */
ext_hd |= EXT_ESP;
break;
case IPPROTO_NONE: /* RFC 2460 */
/*
* Packet ends here, and IPv6 header has
* already been pulled up. If ip6e_len!=0
* then octets must be ignored.
*/
ulp = ip; /* non-NULL to get out of loop. */
break;
case IPPROTO_OSPFIGP:
/* XXX OSPF header check? */
PULLUP_TO(hlen, ulp, struct ip6_ext);
break;
case IPPROTO_PIM:
/* XXX PIM header check? */
PULLUP_TO(hlen, ulp, struct pim);
break;
case IPPROTO_CARP:
PULLUP_TO(hlen, ulp, struct carp_header);
if (((struct carp_header *)ulp)->carp_version !=
CARP_VERSION)
return (IP_FW_DENY);
if (((struct carp_header *)ulp)->carp_type !=
CARP_ADVERTISEMENT)
return (IP_FW_DENY);
break;
case IPPROTO_IPV6: /* RFC 2893 */
PULLUP_TO(hlen, ulp, struct ip6_hdr);
break;
case IPPROTO_IPV4: /* RFC 2893 */
PULLUP_TO(hlen, ulp, struct ip);
break;
default:
printf("IPFW2: IPV6 - Unknown Extension "
"Header(%d), ext_hd=%x\n", proto, ext_hd);
if (V_fw_deny_unknown_exthdrs)
return (IP_FW_DENY);
PULLUP_TO(hlen, ulp, struct ip6_ext);
break;
} /*switch */
}
ip = mtod(m, struct ip *);
ip6 = (struct ip6_hdr *)ip;
args->f_id.src_ip6 = ip6->ip6_src;
args->f_id.dst_ip6 = ip6->ip6_dst;
args->f_id.src_ip = 0;
args->f_id.dst_ip = 0;
args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
} else if (pktlen >= sizeof(struct ip) &&
(args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) {
is_ipv4 = 1;
hlen = ip->ip_hl << 2;
args->f_id.addr_type = 4;
/*
* Collect parameters into local variables for faster matching.
*/
proto = ip->ip_p;
src_ip = ip->ip_src;
dst_ip = ip->ip_dst;
if (args->eh != NULL) { /* layer 2 packets are as on the wire */
offset = ntohs(ip->ip_off) & IP_OFFMASK;
ip_len = ntohs(ip->ip_len);
} else {
offset = ip->ip_off & IP_OFFMASK;
ip_len = ip->ip_len;
}
pktlen = ip_len < pktlen ? ip_len : pktlen;
if (offset == 0) {
switch (proto) {
case IPPROTO_TCP:
PULLUP_TO(hlen, ulp, struct tcphdr);
dst_port = TCP(ulp)->th_dport;
src_port = TCP(ulp)->th_sport;
args->f_id.flags = TCP(ulp)->th_flags;
break;
case IPPROTO_UDP:
PULLUP_TO(hlen, ulp, struct udphdr);
dst_port = UDP(ulp)->uh_dport;
src_port = UDP(ulp)->uh_sport;
break;
case IPPROTO_ICMP:
PULLUP_TO(hlen, ulp, struct icmphdr);
args->f_id.flags = ICMP(ulp)->icmp_type;
break;
default:
break;
}
}
ip = mtod(m, struct ip *);
args->f_id.src_ip = ntohl(src_ip.s_addr);
args->f_id.dst_ip = ntohl(dst_ip.s_addr);
}
#undef PULLUP_TO
if (proto) { /* we may have port numbers, store them */
args->f_id.proto = proto;
args->f_id.src_port = src_port = ntohs(src_port);
args->f_id.dst_port = dst_port = ntohs(dst_port);
}
IPFW_RLOCK(chain);
mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL);
if (args->rule) {
/*
* Packet has already been tagged. Look for the next rule
* to restart processing.
*
* If fw_one_pass != 0 then just accept it.
* XXX this should not happen here, as it is
* optimized out in the caller.
*/
if (V_fw_one_pass) {
IPFW_RUNLOCK(chain);
return (IP_FW_PASS);
}
f = args->rule->next_rule;
if (f == NULL)
f = lookup_next_rule(args->rule, 0);
} else {
/*
* Find the starting rule. It can be either the first
* one, or the one after divert_rule if so requested.
*/
int skipto = mtag ? divert_cookie(mtag) : 0;
f = chain->rules;
if (args->eh == NULL && skipto != 0) {
if (skipto >= IPFW_DEFAULT_RULE) {
IPFW_RUNLOCK(chain);
return (IP_FW_DENY); /* invalid */
}
while (f && f->rulenum <= skipto)
f = f->next;
if (f == NULL) { /* drop packet */
IPFW_RUNLOCK(chain);
return (IP_FW_DENY);
}
}
}
/* reset divert rule to avoid confusion later */
if (mtag) {
divinput_flags = divert_info(mtag) &
(IP_FW_DIVERT_OUTPUT_FLAG | IP_FW_DIVERT_LOOPBACK_FLAG);
m_tag_delete(m, mtag);
}
/*
* Now scan the rules, and parse microinstructions for each rule.
*/
for (; f; f = f->next) {
ipfw_insn *cmd;
uint32_t tablearg = 0;
int l, cmdlen, skip_or; /* skip rest of OR block */
again:
if (V_set_disable & (1 << f->set) )
continue;
skip_or = 0;
for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
l -= cmdlen, cmd += cmdlen) {
int match;
/*
* check_body is a jump target used when we find a
* CHECK_STATE, and need to jump to the body of
* the target rule.
*/
check_body:
cmdlen = F_LEN(cmd);
/*
* An OR block (insn_1 || .. || insn_n) has the
* F_OR bit set in all but the last instruction.
* The first match will set "skip_or", and cause
* the following instructions to be skipped until
* past the one with the F_OR bit clear.
*/
if (skip_or) { /* skip this instruction */
if ((cmd->len & F_OR) == 0)
skip_or = 0; /* next one is good */
continue;
}
match = 0; /* set to 1 if we succeed */
switch (cmd->opcode) {
/*
* The first set of opcodes compares the packet's
* fields with some pattern, setting 'match' if a
* match is found. At the end of the loop there is
* logic to deal with F_NOT and F_OR flags associated
* with the opcode.
*/
case O_NOP:
match = 1;
break;
case O_FORWARD_MAC:
printf("ipfw: opcode %d unimplemented\n",
cmd->opcode);
break;
case O_GID:
case O_UID:
case O_JAIL:
/*
* We only check offset == 0 && proto != 0,
* as this ensures that we have a
* packet with the port info.
*/
if (offset!=0)
break;
if (is_ipv6) /* XXX to be fixed later */
break;
if (proto == IPPROTO_TCP ||
proto == IPPROTO_UDP)
match = check_uidgid(
(ipfw_insn_u32 *)cmd,
proto, oif,
dst_ip, dst_port,
src_ip, src_port, &fw_ugid_cache,
&ugid_lookup, args->inp);
break;
case O_RECV:
match = iface_match(m->m_pkthdr.rcvif,
(ipfw_insn_if *)cmd);
break;
case O_XMIT:
match = iface_match(oif, (ipfw_insn_if *)cmd);
break;
case O_VIA:
match = iface_match(oif ? oif :
m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd);
break;
case O_MACADDR2:
if (args->eh != NULL) { /* have MAC header */
u_int32_t *want = (u_int32_t *)
((ipfw_insn_mac *)cmd)->addr;
u_int32_t *mask = (u_int32_t *)
((ipfw_insn_mac *)cmd)->mask;
u_int32_t *hdr = (u_int32_t *)args->eh;
match =
( want[0] == (hdr[0] & mask[0]) &&
want[1] == (hdr[1] & mask[1]) &&
want[2] == (hdr[2] & mask[2]) );
}
break;
case O_MAC_TYPE:
if (args->eh != NULL) {
u_int16_t *p =
((ipfw_insn_u16 *)cmd)->ports;
int i;
for (i = cmdlen - 1; !match && i>0;
i--, p += 2)
match = (etype >= p[0] &&
etype <= p[1]);
}
break;
case O_FRAG:
match = (offset != 0);
break;
case O_IN: /* "out" is "not in" */
match = (oif == NULL);
break;
case O_LAYER2:
match = (args->eh != NULL);
break;
case O_DIVERTED:
match = (cmd->arg1 & 1 && divinput_flags &
IP_FW_DIVERT_LOOPBACK_FLAG) ||
(cmd->arg1 & 2 && divinput_flags &
IP_FW_DIVERT_OUTPUT_FLAG);
break;
case O_PROTO:
/*
* We do not allow an arg of 0, so checking
* "proto" alone suffices.
*/
match = (proto == cmd->arg1);
break;
case O_IP_SRC:
match = is_ipv4 &&
(((ipfw_insn_ip *)cmd)->addr.s_addr ==
src_ip.s_addr);
break;
case O_IP_SRC_LOOKUP:
case O_IP_DST_LOOKUP:
if (is_ipv4) {
uint32_t a =
(cmd->opcode == O_IP_DST_LOOKUP) ?
dst_ip.s_addr : src_ip.s_addr;
uint32_t v;
match = lookup_table(chain, cmd->arg1, a,
&v);
if (!match)
break;
if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
match =
((ipfw_insn_u32 *)cmd)->d[0] == v;
else
tablearg = v;
}
break;
case O_IP_SRC_MASK:
case O_IP_DST_MASK:
if (is_ipv4) {
uint32_t a =
(cmd->opcode == O_IP_DST_MASK) ?
dst_ip.s_addr : src_ip.s_addr;
uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
int i = cmdlen-1;
for (; !match && i>0; i-= 2, p+= 2)
match = (p[0] == (a & p[1]));
}
break;
case O_IP_SRC_ME:
if (is_ipv4) {
struct ifnet *tif;
INADDR_TO_IFP(src_ip, tif);
match = (tif != NULL);
}
break;
case O_IP_DST_SET:
case O_IP_SRC_SET:
if (is_ipv4) {
u_int32_t *d = (u_int32_t *)(cmd+1);
u_int32_t addr =
cmd->opcode == O_IP_DST_SET ?
args->f_id.dst_ip :
args->f_id.src_ip;
if (addr < d[0])
break;
addr -= d[0]; /* subtract base */
match = (addr < cmd->arg1) &&
( d[ 1 + (addr>>5)] &
(1<<(addr & 0x1f)) );
}
break;
case O_IP_DST:
match = is_ipv4 &&
(((ipfw_insn_ip *)cmd)->addr.s_addr ==
dst_ip.s_addr);
break;
case O_IP_DST_ME:
if (is_ipv4) {
struct ifnet *tif;
INADDR_TO_IFP(dst_ip, tif);
match = (tif != NULL);
}
break;
case O_IP_SRCPORT:
case O_IP_DSTPORT:
/*
* offset == 0 && proto != 0 is enough
* to guarantee that we have a
* packet with port info.
*/
if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP)
&& offset == 0) {
u_int16_t x =
(cmd->opcode == O_IP_SRCPORT) ?
src_port : dst_port ;
u_int16_t *p =
((ipfw_insn_u16 *)cmd)->ports;
int i;
for (i = cmdlen - 1; !match && i>0;
i--, p += 2)
match = (x>=p[0] && x<=p[1]);
}
break;
case O_ICMPTYPE:
match = (offset == 0 && proto==IPPROTO_ICMP &&
icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
break;
#ifdef INET6
case O_ICMP6TYPE:
match = is_ipv6 && offset == 0 &&
proto==IPPROTO_ICMPV6 &&
icmp6type_match(
ICMP6(ulp)->icmp6_type,
(ipfw_insn_u32 *)cmd);
break;
#endif /* INET6 */
case O_IPOPT:
match = (is_ipv4 &&
ipopts_match(ip, cmd) );
break;
case O_IPVER:
match = (is_ipv4 &&
cmd->arg1 == ip->ip_v);
break;
case O_IPID:
case O_IPLEN:
case O_IPTTL:
if (is_ipv4) { /* only for IP packets */
uint16_t x;
uint16_t *p;
int i;
if (cmd->opcode == O_IPLEN)
x = ip_len;
else if (cmd->opcode == O_IPTTL)
x = ip->ip_ttl;
else /* must be IPID */
x = ntohs(ip->ip_id);
if (cmdlen == 1) {
match = (cmd->arg1 == x);
break;
}
/* otherwise we have ranges */
p = ((ipfw_insn_u16 *)cmd)->ports;
i = cmdlen - 1;
for (; !match && i>0; i--, p += 2)
match = (x >= p[0] && x <= p[1]);
}
break;
case O_IPPRECEDENCE:
match = (is_ipv4 &&
(cmd->arg1 == (ip->ip_tos & 0xe0)) );
break;
case O_IPTOS:
match = (is_ipv4 &&
flags_match(cmd, ip->ip_tos));
break;
case O_TCPDATALEN:
if (proto == IPPROTO_TCP && offset == 0) {
struct tcphdr *tcp;
uint16_t x;
uint16_t *p;
int i;
tcp = TCP(ulp);
x = ip_len -
((ip->ip_hl + tcp->th_off) << 2);
if (cmdlen == 1) {
match = (cmd->arg1 == x);
break;
}
/* otherwise we have ranges */
p = ((ipfw_insn_u16 *)cmd)->ports;
i = cmdlen - 1;
for (; !match && i>0; i--, p += 2)
match = (x >= p[0] && x <= p[1]);
}
break;
case O_TCPFLAGS:
match = (proto == IPPROTO_TCP && offset == 0 &&
flags_match(cmd, TCP(ulp)->th_flags));
break;
case O_TCPOPTS:
match = (proto == IPPROTO_TCP && offset == 0 &&
tcpopts_match(TCP(ulp), cmd));
break;
case O_TCPSEQ:
match = (proto == IPPROTO_TCP && offset == 0 &&
((ipfw_insn_u32 *)cmd)->d[0] ==
TCP(ulp)->th_seq);
break;
case O_TCPACK:
match = (proto == IPPROTO_TCP && offset == 0 &&
((ipfw_insn_u32 *)cmd)->d[0] ==
TCP(ulp)->th_ack);
break;
case O_TCPWIN:
match = (proto == IPPROTO_TCP && offset == 0 &&
cmd->arg1 == TCP(ulp)->th_win);
break;
case O_ESTAB:
/* reject packets which have SYN only */
/* XXX should I also check for TH_ACK? */
match = (proto == IPPROTO_TCP && offset == 0 &&
(TCP(ulp)->th_flags &
(TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
break;
case O_ALTQ: {
struct pf_mtag *at;
ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
match = 1;
at = pf_find_mtag(m);
if (at != NULL && at->qid != 0)
break;
at = pf_get_mtag(m);
if (at == NULL) {
/*
* Let the packet fall back to the
* default ALTQ.
*/
break;
}
at->qid = altq->qid;
if (is_ipv4)
at->af = AF_INET;
else
at->af = AF_LINK;
at->hdr = ip;
break;
}
case O_LOG:
if (V_fw_verbose)
ipfw_log(f, hlen, args, m,
oif, offset, tablearg, ip);
match = 1;
break;
case O_PROB:
match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
break;
case O_VERREVPATH:
/* Outgoing packets automatically pass/match */
match = ((oif != NULL) ||
(m->m_pkthdr.rcvif == NULL) ||
(
#ifdef INET6
is_ipv6 ?
verify_path6(&(args->f_id.src_ip6),
m->m_pkthdr.rcvif) :
#endif
verify_path(src_ip, m->m_pkthdr.rcvif,
args->f_id.fib)));
break;
case O_VERSRCREACH:
/* Outgoing packets automatically pass/match */
match = (hlen > 0 && ((oif != NULL) ||
#ifdef INET6
is_ipv6 ?
verify_path6(&(args->f_id.src_ip6),
NULL) :
#endif
verify_path(src_ip, NULL, args->f_id.fib)));
break;
case O_ANTISPOOF:
/* Outgoing packets automatically pass/match */
if (oif == NULL && hlen > 0 &&
( (is_ipv4 && in_localaddr(src_ip))
#ifdef INET6
|| (is_ipv6 &&
in6_localaddr(&(args->f_id.src_ip6)))
#endif
))
match =
#ifdef INET6
is_ipv6 ? verify_path6(
&(args->f_id.src_ip6),
m->m_pkthdr.rcvif) :
#endif
verify_path(src_ip,
m->m_pkthdr.rcvif,
args->f_id.fib);
else
match = 1;
break;
case O_IPSEC:
#ifdef IPSEC
match = (m_tag_find(m,
PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
#endif
/* otherwise no match */
break;
#ifdef INET6
case O_IP6_SRC:
match = is_ipv6 &&
IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
&((ipfw_insn_ip6 *)cmd)->addr6);
break;
case O_IP6_DST:
match = is_ipv6 &&
IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
&((ipfw_insn_ip6 *)cmd)->addr6);
break;
case O_IP6_SRC_MASK:
case O_IP6_DST_MASK:
if (is_ipv6) {
int i = cmdlen - 1;
struct in6_addr p;
struct in6_addr *d =
&((ipfw_insn_ip6 *)cmd)->addr6;
for (; !match && i > 0; d += 2,
i -= F_INSN_SIZE(struct in6_addr)
* 2) {
p = (cmd->opcode ==
O_IP6_SRC_MASK) ?
args->f_id.src_ip6:
args->f_id.dst_ip6;
APPLY_MASK(&p, &d[1]);
match =
IN6_ARE_ADDR_EQUAL(&d[0],
&p);
}
}
break;
case O_IP6_SRC_ME:
match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6);
break;
case O_IP6_DST_ME:
match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6);
break;
case O_FLOW6ID:
match = is_ipv6 &&
flow6id_match(args->f_id.flow_id6,
(ipfw_insn_u32 *) cmd);
break;
case O_EXT_HDR:
match = is_ipv6 &&
(ext_hd & ((ipfw_insn *) cmd)->arg1);
break;
case O_IP6:
match = is_ipv6;
break;
#endif
case O_IP4:
match = is_ipv4;
break;
case O_TAG: {
uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
tablearg : cmd->arg1;
/* Packet is already tagged with this tag? */
mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
/* We have an `untag' action when the F_NOT flag is
* present: we must remove this mtag from the
* mbuf and set `match' to zero (`match' will
* be inverted later).
* Otherwise we should allocate a new mtag and
* prepend it to the mbuf.
*/
if (cmd->len & F_NOT) { /* `untag' action */
if (mtag != NULL)
m_tag_delete(m, mtag);
} else if (mtag == NULL) {
if ((mtag = m_tag_alloc(MTAG_IPFW,
tag, 0, M_NOWAIT)) != NULL)
m_tag_prepend(m, mtag);
}
match = (cmd->len & F_NOT) ? 0: 1;
break;
}
case O_FIB: /* try to match the specified fib */
if (args->f_id.fib == cmd->arg1)
match = 1;
break;
case O_TAGGED: {
uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ?
tablearg : cmd->arg1;
if (cmdlen == 1) {
match = m_tag_locate(m, MTAG_IPFW,
tag, NULL) != NULL;
break;
}
/* we have ranges */
for (mtag = m_tag_first(m);
mtag != NULL && !match;
mtag = m_tag_next(m, mtag)) {
uint16_t *p;
int i;
if (mtag->m_tag_cookie != MTAG_IPFW)
continue;
p = ((ipfw_insn_u16 *)cmd)->ports;
i = cmdlen - 1;
for(; !match && i > 0; i--, p += 2)
match =
mtag->m_tag_id >= p[0] &&
mtag->m_tag_id <= p[1];
}
break;
}
/*
* The second set of opcodes represents 'actions',
* i.e. the terminal part of a rule once the packet
* matches all previous patterns.
* Typically there is only one action for each rule,
* and the opcode is stored at the end of the rule
* (but there are exceptions -- see below).
*
* In general, here we set retval and terminate the
* outer loop (would be a 'break 3' in some language,
* but we need to do a 'goto done').
*
* Exceptions:
* O_COUNT and O_SKIPTO actions:
* instead of terminating, we jump to the next rule
* ('goto next_rule', equivalent to a 'break 2'),
* or to the SKIPTO target ('goto again' after
* having set f, cmd and l), respectively.
*
* O_TAG, O_LOG and O_ALTQ action parameters:
* perform some action and set match = 1;
*
* O_LIMIT and O_KEEP_STATE: these opcodes are
* not real 'actions', and are stored right
* before the 'action' part of the rule.
* These opcodes try to install an entry in the
* state tables; if successful, we continue with
* the next opcode (match=1; break;), otherwise
* the packet must be dropped
* ('goto done' after setting retval);
*
* O_PROBE_STATE and O_CHECK_STATE: these opcodes
* cause a lookup of the state table, and a jump
* to the 'action' part of the parent rule
* ('goto check_body') if an entry is found, or
* (CHECK_STATE only) a jump to the next rule if
* the entry is not found ('goto next_rule').
* The result of the lookup is cached so that
* further instances of these opcodes are
* effectively NOPs.
*/
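/*
* Worked example (not part of the original source): given
*
* 100 skipto 1000 ip from 10.0.0.0/8 to any
* 200 deny ip from any to any
* 1000 allow ip from any to any
*
* a packet from 10.1.2.3 matches rule 100; O_SKIPTO sets f to
* the first rule numbered >= 1000 and does 'goto again', so the
* packet is accepted by rule 1000 without rule 200 ever seeing it.
*/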
case O_LIMIT:
case O_KEEP_STATE:
if (install_state(f,
(ipfw_insn_limit *)cmd, args, tablearg)) {
retval = IP_FW_DENY;
goto done; /* error/limit violation */
}
match = 1;
break;
case O_PROBE_STATE:
case O_CHECK_STATE:
/*
* dynamic rules are checked at the first
* keep-state or check-state occurrence,
* with the result being stored in dyn_dir.
* The compiler introduces a PROBE_STATE
* instruction for us when we have a
* KEEP_STATE (because PROBE_STATE needs
* to be run first).
*/
if (dyn_dir == MATCH_UNKNOWN &&
(q = lookup_dyn_rule(&args->f_id,
&dyn_dir, proto == IPPROTO_TCP ?
TCP(ulp) : NULL))
!= NULL) {
/*
* Found dynamic entry, update stats
* and jump to the 'action' part of
* the parent rule.
*/
q->pcnt++;
q->bcnt += pktlen;
f = q->rule;
cmd = ACTION_PTR(f);
l = f->cmd_len - f->act_ofs;
IPFW_DYN_UNLOCK();
goto check_body;
}
/*
* Dynamic entry not found. If CHECK_STATE,
* skip to the next rule; if PROBE_STATE, just
* ignore and continue with the next opcode.
*/
if (cmd->opcode == O_CHECK_STATE)
goto next_rule;
match = 1;
break;
case O_ACCEPT:
retval = 0; /* accept */
goto done;
case O_PIPE:
case O_QUEUE:
args->rule = f; /* report matching rule */
if (cmd->arg1 == IP_FW_TABLEARG)
args->cookie = tablearg;
else
args->cookie = cmd->arg1;
retval = IP_FW_DUMMYNET;
goto done;
case O_DIVERT:
case O_TEE: {
struct divert_tag *dt;
if (args->eh) /* not on layer 2 */
break;
mtag = m_tag_get(PACKET_TAG_DIVERT,
sizeof(struct divert_tag),
M_NOWAIT);
if (mtag == NULL) {
/* XXX statistic */
/* drop packet */
IPFW_RUNLOCK(chain);
return (IP_FW_DENY);
}
dt = (struct divert_tag *)(mtag+1);
dt->cookie = f->rulenum;
if (cmd->arg1 == IP_FW_TABLEARG)
dt->info = tablearg;
else
dt->info = cmd->arg1;
m_tag_prepend(m, mtag);
retval = (cmd->opcode == O_DIVERT) ?
IP_FW_DIVERT : IP_FW_TEE;
goto done;
}
case O_COUNT:
case O_SKIPTO:
f->pcnt++; /* update stats */
f->bcnt += pktlen;
f->timestamp = time_uptime;
if (cmd->opcode == O_COUNT)
goto next_rule;
/* handle skipto */
if (cmd->arg1 == IP_FW_TABLEARG) {
f = lookup_next_rule(f, tablearg);
} else {
if (f->next_rule == NULL)
lookup_next_rule(f, 0);
f = f->next_rule;
}
goto again;
case O_REJECT:
/*
* Drop the packet and send a reject notice
* if the packet is not ICMP (or is an ICMP
* query), and it is not multicast/broadcast.
*/
if (hlen > 0 && is_ipv4 && offset == 0 &&
(proto != IPPROTO_ICMP ||
is_icmp_query(ICMP(ulp))) &&
!(m->m_flags & (M_BCAST|M_MCAST)) &&
!IN_MULTICAST(ntohl(dst_ip.s_addr))) {
send_reject(args, cmd->arg1, ip_len, ip);
m = args->m;
}
/* FALLTHROUGH */
#ifdef INET6
case O_UNREACH6:
if (hlen > 0 && is_ipv6 &&
((offset & IP6F_OFF_MASK) == 0) &&
(proto != IPPROTO_ICMPV6 ||
(is_icmp6_query(args->f_id.flags) == 1)) &&
!(m->m_flags & (M_BCAST|M_MCAST)) &&
!IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) {
send_reject6(
args, cmd->arg1, hlen,
(struct ip6_hdr *)ip);
m = args->m;
}
/* FALLTHROUGH */
#endif
case O_DENY:
retval = IP_FW_DENY;
goto done;
case O_FORWARD_IP: {
struct sockaddr_in *sa;
sa = &(((ipfw_insn_sa *)cmd)->sa);
if (args->eh) /* not valid on layer2 pkts */
break;
if (!q || dyn_dir == MATCH_FORWARD) {
if (sa->sin_addr.s_addr == INADDR_ANY) {
bcopy(sa, &args->hopstore,
sizeof(*sa));
args->hopstore.sin_addr.s_addr =
htonl(tablearg);
args->next_hop =
&args->hopstore;
} else {
args->next_hop = sa;
}
}
retval = IP_FW_PASS;
}
goto done;
case O_NETGRAPH:
case O_NGTEE:
args->rule = f; /* report matching rule */
if (cmd->arg1 == IP_FW_TABLEARG)
args->cookie = tablearg;
else
args->cookie = cmd->arg1;
retval = (cmd->opcode == O_NETGRAPH) ?
IP_FW_NETGRAPH : IP_FW_NGTEE;
goto done;
case O_SETFIB:
f->pcnt++; /* update stats */
f->bcnt += pktlen;
f->timestamp = time_uptime;
M_SETFIB(m, cmd->arg1);
args->f_id.fib = cmd->arg1;
goto next_rule;
case O_NAT: {
struct cfg_nat *t;
int nat_id;
if (IPFW_NAT_LOADED) {
args->rule = f; /* Report matching rule. */
t = ((ipfw_insn_nat *)cmd)->nat;
if (t == NULL) {
nat_id = (cmd->arg1 == IP_FW_TABLEARG) ?
tablearg : cmd->arg1;
LOOKUP_NAT(V_layer3_chain, nat_id, t);
if (t == NULL) {
retval = IP_FW_DENY;
goto done;
}
if (cmd->arg1 != IP_FW_TABLEARG)
((ipfw_insn_nat *)cmd)->nat = t;
}
retval = ipfw_nat_ptr(args, t, m);
} else
retval = IP_FW_DENY;
goto done;
}
default:
panic("-- unknown opcode %d\n", cmd->opcode);
} /* end of switch() on opcodes */
if (cmd->len & F_NOT)
match = !match;
if (match) {
if (cmd->len & F_OR)
skip_or = 1;
} else {
if (!(cmd->len & F_OR)) /* not an OR block, */
break; /* try next rule */
}
} /* end of inner for, scan opcodes */
next_rule:; /* try next rule */
} /* end of outer for, scan rules */
printf("ipfw: ouch!, skip past end of rules, denying packet\n");
IPFW_RUNLOCK(chain);
return (IP_FW_DENY);
done:
/* Update statistics */
f->pcnt++;
f->bcnt += pktlen;
f->timestamp = time_uptime;
IPFW_RUNLOCK(chain);
return (retval);
pullup_failed:
if (V_fw_verbose)
printf("ipfw: pullup failed\n");
return (IP_FW_DENY);
}
/*
* When a rule is added/deleted, clear the next_rule pointers in all rules.
* These will be reconstructed on the fly as packets are matched.
*/
static void
flush_rule_ptrs(struct ip_fw_chain *chain)
{
struct ip_fw *rule;
IPFW_WLOCK_ASSERT(chain);
for (rule = chain->rules; rule; rule = rule->next)
rule->next_rule = NULL;
}
/*
* Add a new rule to the list. Copy the rule into a malloc'ed area, then
* possibly create a rule number and add the rule to the list.
* Update the rule_number in the input struct so the caller knows it as well.
*/
static int
add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
{
+ INIT_VNET_IPFW(curvnet);
struct ip_fw *rule, *f, *prev;
int l = RULESIZE(input_rule);
if (chain->rules == NULL && input_rule->rulenum != IPFW_DEFAULT_RULE)
return (EINVAL);
rule = malloc(l, M_IPFW, M_NOWAIT | M_ZERO);
if (rule == NULL)
return (ENOSPC);
bcopy(input_rule, rule, l);
rule->next = NULL;
rule->next_rule = NULL;
rule->pcnt = 0;
rule->bcnt = 0;
rule->timestamp = 0;
IPFW_WLOCK(chain);
if (chain->rules == NULL) { /* default rule */
chain->rules = rule;
goto done;
}
/*
* If rulenum is 0, find highest numbered rule before the
* default rule, and add autoinc_step
*/
if (V_autoinc_step < 1)
V_autoinc_step = 1;
else if (V_autoinc_step > 1000)
V_autoinc_step = 1000;
if (rule->rulenum == 0) {
/*
* locate the highest numbered rule before default
*/
for (f = chain->rules; f; f = f->next) {
if (f->rulenum == IPFW_DEFAULT_RULE)
break;
rule->rulenum = f->rulenum;
}
if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
rule->rulenum += V_autoinc_step;
input_rule->rulenum = rule->rulenum;
}
/*
* Now insert the new rule in the right place in the sorted list.
*/
for (prev = NULL, f = chain->rules; f; prev = f, f = f->next) {
if (f->rulenum > rule->rulenum) { /* found the location */
if (prev) {
rule->next = f;
prev->next = rule;
} else { /* head insert */
rule->next = chain->rules;
chain->rules = rule;
}
break;
}
}
flush_rule_ptrs(chain);
done:
V_static_count++;
V_static_len += l;
IPFW_WUNLOCK(chain);
DEB(printf("ipfw: installed rule %d, static count now %d\n",
rule->rulenum, V_static_count);)
return (0);
}
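/*
* Worked example (not part of the original source): assuming
* V_autoinc_step is 100 (the usual default), three rules submitted
* with rulenum == 0 are installed as 100, 200 and 300, leaving room
* for later insertions; the assigned number is copied back into
* input_rule so the caller learns it.
*/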
/**
* Remove a static rule (including derived dynamic rules)
* and place it on the ``reap list'' for later reclamation.
* The caller is in charge of clearing rule pointers to avoid
* dangling pointers.
* @return a pointer to the next entry.
* Arguments are not checked, so they had better be correct.
*/
static struct ip_fw *
remove_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
struct ip_fw *prev)
{
+ INIT_VNET_IPFW(curvnet);
struct ip_fw *n;
int l = RULESIZE(rule);
IPFW_WLOCK_ASSERT(chain);
n = rule->next;
IPFW_DYN_LOCK();
remove_dyn_rule(rule, NULL /* force removal */);
IPFW_DYN_UNLOCK();
if (prev == NULL)
chain->rules = n;
else
prev->next = n;
V_static_count--;
V_static_len -= l;
rule->next = chain->reap;
chain->reap = rule;
return n;
}
/**
* Reclaim storage associated with a list of rules. This is
* typically the list created using remove_rule.
*/
static void
reap_rules(struct ip_fw *head)
{
struct ip_fw *rule;
while ((rule = head) != NULL) {
head = head->next;
if (DUMMYNET_LOADED)
ip_dn_ruledel_ptr(rule);
free(rule, M_IPFW);
}
}
/*
* Remove all rules from a chain (except rules in set RESVD_SET
* unless kill_default = 1). The caller is responsible for
* reclaiming storage for the rules left in chain->reap.
*/
static void
free_chain(struct ip_fw_chain *chain, int kill_default)
{
struct ip_fw *prev, *rule;
IPFW_WLOCK_ASSERT(chain);
flush_rule_ptrs(chain); /* more efficient to do outside the loop */
for (prev = NULL, rule = chain->rules; rule ; )
if (kill_default || rule->set != RESVD_SET)
rule = remove_rule(chain, rule, prev);
else {
prev = rule;
rule = rule->next;
}
}
/**
* Remove all rules with given number, and also do set manipulation.
* Assumes chain != NULL && *chain != NULL.
*
* The argument is a u_int32_t. The low 16 bits are the rule or set number,
* the next 8 bits are the new set, the top 8 bits are the command:
*
* 0 delete rules with given number
* 1 delete rules with given set number
* 2 move rules with given number to new set
* 3 move rules with given set number to new set
* 4 swap sets with given numbers
* 5 delete rules with given number and with given set number
*/
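/*
* For illustration (not part of the original source), a userland
* caller on a raw socket would pack the argument to match the
* decoding below:
*
* u_int32_t arg = ((u_int32_t)cmd << 24) |
* ((u_int32_t)new_set << 16) | rulenum;
* setsockopt(s, IPPROTO_IP, IP_FW_DEL, &arg, sizeof(arg));
*
* e.g. cmd 4 with rulenum 3 and new_set 5 swaps rule sets 3 and 5.
*/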
static int
del_entry(struct ip_fw_chain *chain, u_int32_t arg)
{
struct ip_fw *prev = NULL, *rule;
u_int16_t rulenum; /* rule or old_set */
u_int8_t cmd, new_set;
rulenum = arg & 0xffff;
cmd = (arg >> 24) & 0xff;
new_set = (arg >> 16) & 0xff;
if (cmd > 5 || new_set > RESVD_SET)
return EINVAL;
if (cmd == 0 || cmd == 2 || cmd == 5) {
if (rulenum >= IPFW_DEFAULT_RULE)
return EINVAL;
} else {
if (rulenum > RESVD_SET) /* old_set */
return EINVAL;
}
IPFW_WLOCK(chain);
rule = chain->rules;
chain->reap = NULL;
switch (cmd) {
case 0: /* delete rules with given number */
/*
* locate first rule to delete
*/
for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
;
if (rule->rulenum != rulenum) {
IPFW_WUNLOCK(chain);
return EINVAL;
}
/*
* flush pointers outside the loop, then delete all matching
* rules. prev remains the same throughout the cycle.
*/
flush_rule_ptrs(chain);
while (rule->rulenum == rulenum)
rule = remove_rule(chain, rule, prev);
break;
case 1: /* delete all rules with given set number */
flush_rule_ptrs(chain);
rule = chain->rules;
while (rule->rulenum < IPFW_DEFAULT_RULE)
if (rule->set == rulenum)
rule = remove_rule(chain, rule, prev);
else {
prev = rule;
rule = rule->next;
}
break;
case 2: /* move rules with given number to new set */
rule = chain->rules;
for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
if (rule->rulenum == rulenum)
rule->set = new_set;
break;
case 3: /* move rules with given set number to new set */
for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
if (rule->set == rulenum)
rule->set = new_set;
break;
case 4: /* swap two sets */
for (; rule->rulenum < IPFW_DEFAULT_RULE; rule = rule->next)
if (rule->set == rulenum)
rule->set = new_set;
else if (rule->set == new_set)
rule->set = rulenum;
break;
case 5: /* delete rules with given number and with given set number.
* rulenum - given rule number;
* new_set - given set number.
*/
for (; rule->rulenum < rulenum; prev = rule, rule = rule->next)
;
if (rule->rulenum != rulenum) {
IPFW_WUNLOCK(chain);
return (EINVAL);
}
flush_rule_ptrs(chain);
while (rule->rulenum == rulenum) {
if (rule->set == new_set)
rule = remove_rule(chain, rule, prev);
else {
prev = rule;
rule = rule->next;
}
}
}
/*
* Look for rules to reclaim. We grab the list before
* releasing the lock then reclaim them w/o the lock to
* avoid a LOR with dummynet.
*/
rule = chain->reap;
chain->reap = NULL;
IPFW_WUNLOCK(chain);
if (rule)
reap_rules(rule);
return 0;
}
/*
* Clear counters for a specific rule.
* The enclosing "table" is assumed locked.
*/
static void
clear_counters(struct ip_fw *rule, int log_only)
{
ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
if (log_only == 0) {
rule->bcnt = rule->pcnt = 0;
rule->timestamp = 0;
}
if (l->o.opcode == O_LOG)
l->log_left = l->max_log;
}
/**
* Reset some or all counters on firewall rules.
* The argument `arg' is a u_int32_t. The low 16 bits are the rule number,
* the next 8 bits are the set number, the top 8 bits are the command:
* 0 work with rules from all sets;
* 1 work with rules only from the specified set.
* Specified rule number is zero if we want to clear all entries.
* log_only is 1 if we only want to reset logs, zero otherwise.
*/
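/*
* Illustrative encoding (not part of the original source),
* mirroring the decoding below:
*
* u_int32_t arg = (1 << 24) | (2 << 16) | 1500;
*
* clears (or, via IP_FW_RESETLOG, resets logging for) rule 1500
* only if it belongs to set 2; a top byte of 0 would act on rule
* 1500 regardless of its set.
*/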
static int
zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
{
+ INIT_VNET_IPFW(curvnet);
struct ip_fw *rule;
char *msg;
uint16_t rulenum = arg & 0xffff;
uint8_t set = (arg >> 16) & 0xff;
uint8_t cmd = (arg >> 24) & 0xff;
if (cmd > 1)
return (EINVAL);
if (cmd == 1 && set > RESVD_SET)
return (EINVAL);
IPFW_WLOCK(chain);
if (rulenum == 0) {
V_norule_counter = 0;
for (rule = chain->rules; rule; rule = rule->next) {
/* Skip rules from another set. */
if (cmd == 1 && rule->set != set)
continue;
clear_counters(rule, log_only);
}
msg = log_only ? "ipfw: All logging counts reset.\n" :
"ipfw: Accounting cleared.\n";
} else {
int cleared = 0;
/*
* We can have multiple rules with the same number, so we
* need to clear them all.
*/
for (rule = chain->rules; rule; rule = rule->next)
if (rule->rulenum == rulenum) {
while (rule && rule->rulenum == rulenum) {
if (cmd == 0 || rule->set == set)
clear_counters(rule, log_only);
rule = rule->next;
}
cleared = 1;
break;
}
if (!cleared) { /* we did not find any matching rules */
IPFW_WUNLOCK(chain);
return (EINVAL);
}
msg = log_only ? "ipfw: Entry %d logging count reset.\n" :
"ipfw: Entry %d cleared.\n";
}
IPFW_WUNLOCK(chain);
if (V_fw_verbose)
log(LOG_SECURITY | LOG_NOTICE, msg, rulenum);
return (0);
}
/*
* Check validity of the structure before insert.
* Fortunately rules are simple, so this mostly needs to check rule sizes.
*/
static int
check_ipfw_struct(struct ip_fw *rule, int size)
{
int l, cmdlen = 0;
int have_action = 0;
ipfw_insn *cmd;
if (size < sizeof(*rule)) {
printf("ipfw: rule too short\n");
return (EINVAL);
}
/* first, check for valid size */
l = RULESIZE(rule);
if (l != size) {
printf("ipfw: size mismatch (have %d want %d)\n", size, l);
return (EINVAL);
}
if (rule->act_ofs >= rule->cmd_len) {
printf("ipfw: bogus action offset (%u > %u)\n",
rule->act_ofs, rule->cmd_len - 1);
return (EINVAL);
}
/*
* Now go for the individual checks. Very simple ones, basically only
* instruction sizes.
*/
for (l = rule->cmd_len, cmd = rule->cmd ;
l > 0 ; l -= cmdlen, cmd += cmdlen) {
cmdlen = F_LEN(cmd);
if (cmdlen > l) {
printf("ipfw: opcode %d size truncated\n",
cmd->opcode);
return EINVAL;
}
DEB(printf("ipfw: opcode %d\n", cmd->opcode);)
switch (cmd->opcode) {
case O_PROBE_STATE:
case O_KEEP_STATE:
case O_PROTO:
case O_IP_SRC_ME:
case O_IP_DST_ME:
case O_LAYER2:
case O_IN:
case O_FRAG:
case O_DIVERTED:
case O_IPOPT:
case O_IPTOS:
case O_IPPRECEDENCE:
case O_IPVER:
case O_TCPWIN:
case O_TCPFLAGS:
case O_TCPOPTS:
case O_ESTAB:
case O_VERREVPATH:
case O_VERSRCREACH:
case O_ANTISPOOF:
case O_IPSEC:
#ifdef INET6
case O_IP6_SRC_ME:
case O_IP6_DST_ME:
case O_EXT_HDR:
case O_IP6:
#endif
case O_IP4:
case O_TAG:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
break;
case O_FIB:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
if (cmd->arg1 >= rt_numfibs) {
printf("ipfw: invalid fib number %d\n",
cmd->arg1);
return EINVAL;
}
break;
case O_SETFIB:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
if (cmd->arg1 >= rt_numfibs) {
printf("ipfw: invalid fib number %d\n",
cmd->arg1);
return EINVAL;
}
goto check_action;
case O_UID:
case O_GID:
case O_JAIL:
case O_IP_SRC:
case O_IP_DST:
case O_TCPSEQ:
case O_TCPACK:
case O_PROB:
case O_ICMPTYPE:
if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
goto bad_size;
break;
case O_LIMIT:
if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
goto bad_size;
break;
case O_LOG:
if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
goto bad_size;
((ipfw_insn_log *)cmd)->log_left =
((ipfw_insn_log *)cmd)->max_log;
break;
case O_IP_SRC_MASK:
case O_IP_DST_MASK:
/* only odd command lengths */
if (!(cmdlen & 1) || cmdlen > 31)
goto bad_size;
break;
case O_IP_SRC_SET:
case O_IP_DST_SET:
if (cmd->arg1 == 0 || cmd->arg1 > 256) {
printf("ipfw: invalid set size %d\n",
cmd->arg1);
return EINVAL;
}
if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
(cmd->arg1 + 31) / 32)
goto bad_size;
break;
case O_IP_SRC_LOOKUP:
case O_IP_DST_LOOKUP:
if (cmd->arg1 >= IPFW_TABLES_MAX) {
printf("ipfw: invalid table number %d\n",
cmd->arg1);
return (EINVAL);
}
if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
cmdlen != F_INSN_SIZE(ipfw_insn_u32))
goto bad_size;
break;
case O_MACADDR2:
if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
goto bad_size;
break;
case O_NOP:
case O_IPID:
case O_IPTTL:
case O_IPLEN:
case O_TCPDATALEN:
case O_TAGGED:
if (cmdlen < 1 || cmdlen > 31)
goto bad_size;
break;
case O_MAC_TYPE:
case O_IP_SRCPORT:
case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
if (cmdlen < 2 || cmdlen > 31)
goto bad_size;
break;
case O_RECV:
case O_XMIT:
case O_VIA:
if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
goto bad_size;
break;
case O_ALTQ:
if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
goto bad_size;
break;
case O_PIPE:
case O_QUEUE:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
goto check_action;
case O_FORWARD_IP:
#ifdef IPFIREWALL_FORWARD
if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
goto bad_size;
goto check_action;
#else
return EINVAL;
#endif
case O_DIVERT:
case O_TEE:
if (ip_divert_ptr == NULL)
return EINVAL;
else
goto check_size;
case O_NETGRAPH:
case O_NGTEE:
if (!NG_IPFW_LOADED)
return EINVAL;
else
goto check_size;
case O_NAT:
if (!IPFW_NAT_LOADED)
return EINVAL;
if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
goto bad_size;
goto check_action;
case O_FORWARD_MAC: /* XXX not implemented yet */
case O_CHECK_STATE:
case O_COUNT:
case O_ACCEPT:
case O_DENY:
case O_REJECT:
#ifdef INET6
case O_UNREACH6:
#endif
case O_SKIPTO:
check_size:
if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
check_action:
if (have_action) {
printf("ipfw: opcode %d, multiple actions"
" not allowed\n",
cmd->opcode);
return EINVAL;
}
have_action = 1;
if (l != cmdlen) {
printf("ipfw: opcode %d, action must be"
" last opcode\n",
cmd->opcode);
return EINVAL;
}
break;
#ifdef INET6
case O_IP6_SRC:
case O_IP6_DST:
if (cmdlen != F_INSN_SIZE(struct in6_addr) +
F_INSN_SIZE(ipfw_insn))
goto bad_size;
break;
case O_FLOW6ID:
if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
((ipfw_insn_u32 *)cmd)->o.arg1)
goto bad_size;
break;
case O_IP6_SRC_MASK:
case O_IP6_DST_MASK:
if (!(cmdlen & 1) || cmdlen > 127)
goto bad_size;
break;
case O_ICMP6TYPE:
if (cmdlen != F_INSN_SIZE(ipfw_insn_icmp6))
goto bad_size;
break;
#endif
default:
switch (cmd->opcode) {
#ifndef INET6
case O_IP6_SRC_ME:
case O_IP6_DST_ME:
case O_EXT_HDR:
case O_IP6:
case O_UNREACH6:
case O_IP6_SRC:
case O_IP6_DST:
case O_FLOW6ID:
case O_IP6_SRC_MASK:
case O_IP6_DST_MASK:
case O_ICMP6TYPE:
printf("ipfw: no IPv6 support in kernel\n");
return EPROTONOSUPPORT;
#endif
default:
printf("ipfw: opcode %d, unknown opcode\n",
cmd->opcode);
return EINVAL;
}
}
}
if (have_action == 0) {
printf("ipfw: missing action\n");
return EINVAL;
}
return 0;
bad_size:
printf("ipfw: opcode %d size %d wrong\n",
cmd->opcode, cmdlen);
return EINVAL;
}
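/*
* Minimal sketch (not part of the original source) of the smallest
* rule that passes check_ipfw_struct(): a single instruction slot
* holding the action, built the same way ipfw_init() builds the
* default rule further below. IPFW_EXAMPLE_ONLY is a hypothetical
* guard, so this is never compiled.
*/
#ifdef IPFW_EXAMPLE_ONLY
static void
example_minimal_rule(struct ip_fw *r)
{
bzero(r, sizeof(*r));
r->act_ofs = 0; /* the action is the first (and only) opcode */
r->cmd_len = 1; /* one 32-bit instruction slot */
r->cmd[0].len = 1;
r->cmd[0].opcode = O_ACCEPT; /* i.e. "allow ip from any to any" */
}
#endif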
/*
* Copy the static and dynamic rules to the supplied buffer
* and return the amount of space actually used.
*/
static size_t
ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
{
+ INIT_VNET_IPFW(curvnet);
char *bp = buf;
char *ep = bp + space;
struct ip_fw *rule;
int i;
time_t boot_seconds;
boot_seconds = boottime.tv_sec;
/* XXX this can take a long time and locking will block packet flow */
IPFW_RLOCK(chain);
for (rule = chain->rules; rule ; rule = rule->next) {
/*
* Verify the entry fits in the buffer in case the
* rules changed between calculating buffer space and
* now. This would be better done using a generation
* number but should suffice for now.
*/
i = RULESIZE(rule);
if (bp + i <= ep) {
bcopy(rule, bp, i);
/*
* XXX HACK. Store the disable mask in the "next"
* pointer in a wild attempt to keep the ABI the same.
* Why do we do this on EVERY rule?
*/
bcopy(&V_set_disable,
&(((struct ip_fw *)bp)->next_rule),
sizeof(V_set_disable));
if (((struct ip_fw *)bp)->timestamp)
((struct ip_fw *)bp)->timestamp += boot_seconds;
bp += i;
}
}
IPFW_RUNLOCK(chain);
if (V_ipfw_dyn_v) {
ipfw_dyn_rule *p, *last = NULL;
IPFW_DYN_LOCK();
for (i = 0 ; i < V_curr_dyn_buckets; i++)
for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
if (bp + sizeof *p <= ep) {
ipfw_dyn_rule *dst =
(ipfw_dyn_rule *)bp;
bcopy(p, dst, sizeof *p);
bcopy(&(p->rule->rulenum), &(dst->rule),
sizeof(p->rule->rulenum));
/*
* store set number into high word of
* dst->rule pointer.
*/
bcopy(&(p->rule->set),
(char *)&dst->rule +
sizeof(p->rule->rulenum),
sizeof(p->rule->set));
/*
* store a non-null value in "next".
* The userland code will interpret a
* NULL here as a marker
* for the last dynamic rule.
*/
bcopy(&dst, &dst->next, sizeof(dst));
last = dst;
dst->expire =
TIME_LEQ(dst->expire, time_uptime) ?
0 : dst->expire - time_uptime;
bp += sizeof(ipfw_dyn_rule);
}
}
IPFW_DYN_UNLOCK();
if (last != NULL) /* mark last dynamic rule */
bzero(&last->next, sizeof(last));
}
return (bp - (char *)buf);
}
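/*
* Sketch (not part of the original source) of how a userland client
* such as ipfw(8) might walk the buffer built above; IPFW_EXAMPLE_ONLY
* is a hypothetical guard, so this is never compiled.
*/
#ifdef IPFW_EXAMPLE_ONLY
static void
example_walk(char *buf, size_t len)
{
struct ip_fw *r = (struct ip_fw *)buf;
ipfw_dyn_rule *d;
for (;;) { /* static rules; the default rule is last */
/* ... inspect r; r->next_rule holds the set_disable mask ... */
if (r->rulenum == IPFW_DEFAULT_RULE)
break;
r = (struct ip_fw *)((char *)r + RULESIZE(r));
}
d = (ipfw_dyn_rule *)((char *)r + RULESIZE(r));
while ((char *)(d + 1) <= buf + len) { /* dynamic rules, if any */
/* ... inspect d; d->rule now encodes rulenum and set ... */
if (d->next == NULL)
break; /* a NULL 'next' marks the last dynamic rule */
d++;
}
}
#endif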
/**
* {set|get}sockopt parser.
*/
static int
ipfw_ctl(struct sockopt *sopt)
{
#define RULE_MAXSIZE (256*sizeof(u_int32_t))
+ INIT_VNET_IPFW(curvnet);
int error;
size_t size;
struct ip_fw *buf, *rule;
u_int32_t rulenum[2];
error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
if (error)
return (error);
/*
* Disallow modifications in really-really secure mode, but still allow
* the logging counters to be reset.
*/
if (sopt->sopt_name == IP_FW_ADD ||
(sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) {
error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
if (error)
return (error);
}
error = 0;
switch (sopt->sopt_name) {
case IP_FW_GET:
/*
* pass up a copy of the current rules. Static rules
* come first (the last of which has number IPFW_DEFAULT_RULE),
* followed by a possibly empty list of dynamic rules.
* The last dynamic rule has NULL in the "next" field.
*
* Note that the calculated size is used to bound the
* amount of data returned to the user. The rule set may
* change between calculating the size and returning the
* data in which case we'll just return what fits.
*/
size = V_static_len; /* size of static rules */
if (V_ipfw_dyn_v) /* add size of dyn.rules */
size += (V_dyn_count * sizeof(ipfw_dyn_rule));
/*
* XXX todo: if the user passes a short length just to know
* how much room is needed, do not bother filling up the
* buffer, just jump to the sooptcopyout.
*/
buf = malloc(size, M_TEMP, M_WAITOK);
error = sooptcopyout(sopt, buf,
ipfw_getrules(&V_layer3_chain, buf, size));
free(buf, M_TEMP);
break;
case IP_FW_FLUSH:
/*
* Normally we cannot release the lock on each iteration.
* We could do it here only because we start from the head
* every time, so there is no risk of missing some entries.
* On the other hand, the risk is that we end up with
* a very inconsistent ruleset, so better keep the lock
* around the whole cycle.
*
* XXX this code can be improved by resetting the head of
* the list to point to the default rule, and then freeing
* the old list without the need for a lock.
*/
IPFW_WLOCK(&V_layer3_chain);
V_layer3_chain.reap = NULL;
free_chain(&V_layer3_chain, 0 /* keep default rule */);
rule = V_layer3_chain.reap;
V_layer3_chain.reap = NULL;
IPFW_WUNLOCK(&V_layer3_chain);
if (rule != NULL)
reap_rules(rule);
break;
case IP_FW_ADD:
rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
sizeof(struct ip_fw) );
if (error == 0)
error = check_ipfw_struct(rule, sopt->sopt_valsize);
if (error == 0) {
error = add_rule(&V_layer3_chain, rule);
size = RULESIZE(rule);
if (!error && sopt->sopt_dir == SOPT_GET)
error = sooptcopyout(sopt, rule, size);
}
free(rule, M_TEMP);
break;
case IP_FW_DEL:
/*
* IP_FW_DEL is used for deleting single rules or sets,
* and (ab)used to atomically manipulate sets. Argument size
* is used to distinguish between the two:
* sizeof(u_int32_t)
* delete single rule or set of rules,
* or reassign rules (or sets) to a different set.
* 2*sizeof(u_int32_t)
* atomic disable/enable sets.
* first u_int32_t contains sets to be disabled,
* second u_int32_t contains sets to be enabled.
*/
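/*
* Illustration (not part of the original source): atomically
* disabling set 5 while enabling set 6 would be
*
* u_int32_t masks[2] = { 1 << 5, 1 << 6 };
* setsockopt(s, IPPROTO_IP, IP_FW_DEL, masks, sizeof(masks));
*
* RESVD_SET can never be disabled, as enforced below.
*/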
error = sooptcopyin(sopt, rulenum,
2*sizeof(u_int32_t), sizeof(u_int32_t));
if (error)
break;
size = sopt->sopt_valsize;
if (size == sizeof(u_int32_t)) /* delete or reassign */
error = del_entry(&V_layer3_chain, rulenum[0]);
else if (size == 2*sizeof(u_int32_t)) /* set enable/disable */
V_set_disable =
(V_set_disable | rulenum[0]) & ~rulenum[1] &
~(1<<RESVD_SET); /* set RESVD_SET always enabled */
else
error = EINVAL;
break;
case IP_FW_ZERO:
case IP_FW_RESETLOG: /* argument is a u_int32_t, the rule number */
rulenum[0] = 0;
if (sopt->sopt_val != 0) {
error = sooptcopyin(sopt, rulenum,
sizeof(u_int32_t), sizeof(u_int32_t));
if (error)
break;
}
error = zero_entry(&V_layer3_chain, rulenum[0],
sopt->sopt_name == IP_FW_RESETLOG);
break;
case IP_FW_TABLE_ADD:
{
ipfw_table_entry ent;
error = sooptcopyin(sopt, &ent,
sizeof(ent), sizeof(ent));
if (error)
break;
error = add_table_entry(&V_layer3_chain, ent.tbl,
ent.addr, ent.masklen, ent.value);
}
break;
case IP_FW_TABLE_DEL:
{
ipfw_table_entry ent;
error = sooptcopyin(sopt, &ent,
sizeof(ent), sizeof(ent));
if (error)
break;
error = del_table_entry(&V_layer3_chain, ent.tbl,
ent.addr, ent.masklen);
}
break;
case IP_FW_TABLE_FLUSH:
{
u_int16_t tbl;
error = sooptcopyin(sopt, &tbl,
sizeof(tbl), sizeof(tbl));
if (error)
break;
IPFW_WLOCK(&V_layer3_chain);
error = flush_table(&V_layer3_chain, tbl);
IPFW_WUNLOCK(&V_layer3_chain);
}
break;
case IP_FW_TABLE_GETSIZE:
{
u_int32_t tbl, cnt;
if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
sizeof(tbl))))
break;
IPFW_RLOCK(&V_layer3_chain);
error = count_table(&V_layer3_chain, tbl, &cnt);
IPFW_RUNLOCK(&V_layer3_chain);
if (error)
break;
error = sooptcopyout(sopt, &cnt, sizeof(cnt));
}
break;
case IP_FW_TABLE_LIST:
{
ipfw_table *tbl;
if (sopt->sopt_valsize < sizeof(*tbl)) {
error = EINVAL;
break;
}
size = sopt->sopt_valsize;
tbl = malloc(size, M_TEMP, M_WAITOK);
error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
if (error) {
free(tbl, M_TEMP);
break;
}
tbl->size = (size - sizeof(*tbl)) /
sizeof(ipfw_table_entry);
IPFW_RLOCK(&V_layer3_chain);
error = dump_table(&V_layer3_chain, tbl);
IPFW_RUNLOCK(&V_layer3_chain);
if (error) {
free(tbl, M_TEMP);
break;
}
error = sooptcopyout(sopt, tbl, size);
free(tbl, M_TEMP);
}
break;
case IP_FW_NAT_CFG:
{
if (IPFW_NAT_LOADED)
error = ipfw_nat_cfg_ptr(sopt);
else {
printf("IP_FW_NAT_CFG: ipfw_nat not present, please load it.\n");
error = EINVAL;
}
}
break;
case IP_FW_NAT_DEL:
{
if (IPFW_NAT_LOADED)
error = ipfw_nat_del_ptr(sopt);
else {
printf("IP_FW_NAT_DEL: ipfw_nat not present, please load it.\n");
printf("ipfw_nat not loaded: %d\n", sopt->sopt_name);
error = EINVAL;
}
}
break;
case IP_FW_NAT_GET_CONFIG:
{
if (IPFW_NAT_LOADED)
error = ipfw_nat_get_cfg_ptr(sopt);
else {
printf("IP_FW_NAT_GET_CFG: ipfw_nat not present, please load it.\n");
error = EINVAL;
}
}
break;
case IP_FW_NAT_GET_LOG:
{
if (IPFW_NAT_LOADED)
error = ipfw_nat_get_log_ptr(sopt);
else {
printf("IP_FW_NAT_GET_LOG: ipfw_nat not present, please load it.\n");
error = EINVAL;
}
}
break;
default:
printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
error = EINVAL;
}
return (error);
#undef RULE_MAXSIZE
}
/**
* dummynet needs a reference to the default rule, because rules can be
* deleted while packets hold a reference to them. When this happens,
* dummynet changes the reference to the default rule (it could well be a
* NULL pointer, but this way we do not need to check for the special
* case, plus here we have info on the default behaviour).
*/
struct ip_fw *ip_fw_default_rule;
/*
* This procedure is only used to handle keepalives. It is invoked
* every dyn_keepalive_period seconds.
*/
static void
ipfw_tick(void * __unused unused)
{
struct mbuf *m0, *m, *mnext, **mtailp;
int i;
ipfw_dyn_rule *q;
if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
goto done;
/*
* We make a chain of packets to go out here -- not deferring
* until after we drop the IPFW dynamic rule lock would result
* in a lock order reversal with the normal packet input -> ipfw
* call stack.
*/
m0 = NULL;
mtailp = &m0;
IPFW_DYN_LOCK();
for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
if (q->dyn_type == O_LIMIT_PARENT)
continue;
if (q->id.proto != IPPROTO_TCP)
continue;
if ( (q->state & BOTH_SYN) != BOTH_SYN)
continue;
if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
q->expire))
continue; /* too early */
if (TIME_LEQ(q->expire, time_uptime))
continue; /* too late, rule expired */
*mtailp = send_pkt(NULL, &(q->id), q->ack_rev - 1,
q->ack_fwd, TH_SYN);
if (*mtailp != NULL)
mtailp = &(*mtailp)->m_nextpkt;
*mtailp = send_pkt(NULL, &(q->id), q->ack_fwd - 1,
q->ack_rev, 0);
if (*mtailp != NULL)
mtailp = &(*mtailp)->m_nextpkt;
}
}
IPFW_DYN_UNLOCK();
for (m = mnext = m0; m != NULL; m = mnext) {
mnext = m->m_nextpkt;
m->m_nextpkt = NULL;
ip_output(m, NULL, NULL, 0, NULL, NULL);
}
done:
callout_reset(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
ipfw_tick, NULL);
}
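/*
* Worked example (not part of the original source): with
* V_dyn_keepalive_interval == 20 and time_uptime == 1000, a TCP
* entry expiring at 1015 gets a keepalive in each direction
* (1015 < 1000 + 20 and 1015 > 1000); one expiring at 1050 is
* skipped as "too early", and one that expired at 990 is skipped
* as already dead.
*/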
int
ipfw_init(void)
{
+ INIT_VNET_IPFW(curvnet);
struct ip_fw default_rule;
int error;
#ifdef INET6
/* Setup IPv6 fw sysctl tree. */
sysctl_ctx_init(&ip6_fw_sysctl_ctx);
ip6_fw_sysctl_tree = SYSCTL_ADD_NODE(&ip6_fw_sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_net_inet6_ip6), OID_AUTO, "fw",
CTLFLAG_RW | CTLFLAG_SECURE, 0, "Firewall");
SYSCTL_ADD_PROC(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree),
OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3,
&V_fw6_enable, 0, ipfw_chg_hook, "I", "Enable ipfw+6");
SYSCTL_ADD_INT(&ip6_fw_sysctl_ctx, SYSCTL_CHILDREN(ip6_fw_sysctl_tree),
OID_AUTO, "deny_unknown_exthdrs", CTLFLAG_RW | CTLFLAG_SECURE,
&V_fw_deny_unknown_exthdrs, 0,
"Deny packets with unknown IPv6 Extension Headers");
#endif
V_layer3_chain.rules = NULL;
IPFW_LOCK_INIT(&V_layer3_chain);
ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, 0);
IPFW_DYN_LOCK_INIT();
callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
bzero(&default_rule, sizeof default_rule);
default_rule.act_ofs = 0;
default_rule.rulenum = IPFW_DEFAULT_RULE;
default_rule.cmd_len = 1;
default_rule.set = RESVD_SET;
default_rule.cmd[0].len = 1;
default_rule.cmd[0].opcode =
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
1 ? O_ACCEPT :
#endif
O_DENY;
error = add_rule(&V_layer3_chain, &default_rule);
if (error != 0) {
printf("ipfw2: error %u initializing default rule "
"(support disabled)\n", error);
IPFW_DYN_LOCK_DESTROY();
IPFW_LOCK_DESTROY(&V_layer3_chain);
uma_zdestroy(ipfw_dyn_rule_zone);
return (error);
}
ip_fw_default_rule = V_layer3_chain.rules;
printf("ipfw2 "
#ifdef INET6
"(+ipv6) "
#endif
"initialized, divert %s, nat %s, "
"rule-based forwarding "
#ifdef IPFIREWALL_FORWARD
"enabled, "
#else
"disabled, "
#endif
"default to %s, logging ",
#ifdef IPDIVERT
"enabled",
#else
"loadable",
#endif
#ifdef IPFIREWALL_NAT
"enabled",
#else
"loadable",
#endif
default_rule.cmd[0].opcode == O_ACCEPT ? "accept" : "deny");
#ifdef IPFIREWALL_VERBOSE
V_fw_verbose = 1;
#endif
#ifdef IPFIREWALL_VERBOSE_LIMIT
V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
#endif
if (V_fw_verbose == 0)
printf("disabled\n");
else if (V_verbose_limit == 0)
printf("unlimited\n");
else
printf("limited to %d packets/entry by default\n",
V_verbose_limit);
error = init_tables(&V_layer3_chain);
if (error) {
IPFW_DYN_LOCK_DESTROY();
IPFW_LOCK_DESTROY(&V_layer3_chain);
uma_zdestroy(ipfw_dyn_rule_zone);
return (error);
}
ip_fw_ctl_ptr = ipfw_ctl;
ip_fw_chk_ptr = ipfw_chk;
callout_reset(&V_ipfw_timeout, hz, ipfw_tick, NULL);
LIST_INIT(&V_layer3_chain.nat);
return (0);
}
void
ipfw_destroy(void)
{
struct ip_fw *reap;
ip_fw_chk_ptr = NULL;
ip_fw_ctl_ptr = NULL;
callout_drain(&V_ipfw_timeout);
IPFW_WLOCK(&V_layer3_chain);
flush_tables(&V_layer3_chain);
V_layer3_chain.reap = NULL;
free_chain(&V_layer3_chain, 1 /* kill default rule */);
reap = V_layer3_chain.reap, V_layer3_chain.reap = NULL;
IPFW_WUNLOCK(&V_layer3_chain);
if (reap != NULL)
reap_rules(reap);
IPFW_DYN_LOCK_DESTROY();
uma_zdestroy(ipfw_dyn_rule_zone);
if (V_ipfw_dyn_v != NULL)
free(V_ipfw_dyn_v, M_IPFW);
IPFW_LOCK_DESTROY(&V_layer3_chain);
#ifdef INET6
/* Free IPv6 fw sysctl tree. */
sysctl_ctx_free(&ip6_fw_sysctl_ctx);
#endif
printf("IP firewall unloaded\n");
}
Index: head/sys/netinet/ip_fw_nat.c
===================================================================
--- head/sys/netinet/ip_fw_nat.c (revision 183549)
+++ head/sys/netinet/ip_fw_nat.c (revision 183550)
@@ -1,655 +1,663 @@
/*-
* Copyright (c) 2008 Paolo Pisati
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/jail.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <sys/vimage.h>
#include <netinet/libalias/alias.h>
#include <netinet/libalias/alias_local.h>
#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_fw.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h> /* XXX for in_cksum */
MALLOC_DECLARE(M_IPFW);
extern struct ip_fw_chain layer3_chain;
static eventhandler_tag ifaddr_event_tag;
extern ipfw_nat_t *ipfw_nat_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
static void
ifaddr_change(void *arg __unused, struct ifnet *ifp)
{
+ INIT_VNET_IPFW(curvnet);
struct cfg_nat *ptr;
struct ifaddr *ifa;
IPFW_WLOCK(&V_layer3_chain);
/* Check every nat entry... */
LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) {
/* ...using nic 'ifp->if_xname' as dynamic alias address. */
if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) == 0) {
mtx_lock(&ifp->if_addr_mtx);
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr == NULL)
continue;
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ptr->ip = ((struct sockaddr_in *)
(ifa->ifa_addr))->sin_addr;
LibAliasSetAddress(ptr->lib, ptr->ip);
}
mtx_unlock(&ifp->if_addr_mtx);
}
}
IPFW_WUNLOCK(&V_layer3_chain);
}
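/*
* Illustration (not part of the original source): after something
* like `ipfw nat 1 config if em0', this handler keeps the libalias
* alias address of instance 1 in sync whenever em0 acquires a new
* IPv4 address, e.g. on a DHCP lease change.
*/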
static void
flush_nat_ptrs(const int i)
{
+ INIT_VNET_IPFW(curvnet);
struct ip_fw *rule;
IPFW_WLOCK_ASSERT(&V_layer3_chain);
for (rule = V_layer3_chain.rules; rule; rule = rule->next) {
ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule);
if (cmd->o.opcode != O_NAT)
continue;
if (cmd->nat != NULL && cmd->nat->id == i)
cmd->nat = NULL;
}
}
#define HOOK_NAT(b, p) do { \
IPFW_WLOCK_ASSERT(&V_layer3_chain); \
LIST_INSERT_HEAD(b, p, _next); \
} while (0)
#define UNHOOK_NAT(p) do { \
IPFW_WLOCK_ASSERT(&V_layer3_chain); \
LIST_REMOVE(p, _next); \
} while (0)
#define HOOK_REDIR(b, p) do { \
LIST_INSERT_HEAD(b, p, _next); \
} while (0)
#define HOOK_SPOOL(b, p) do { \
LIST_INSERT_HEAD(b, p, _next); \
} while (0)
static void
del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
{
struct cfg_redir *r, *tmp_r;
struct cfg_spool *s, *tmp_s;
int i, num;
LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
num = 1; /* Number of alias_link to delete. */
switch (r->mode) {
case REDIR_PORT:
num = r->pport_cnt;
/* FALLTHROUGH */
case REDIR_ADDR:
case REDIR_PROTO:
/* Delete all libalias redirect entries. */
for (i = 0; i < num; i++)
LibAliasRedirectDelete(n->lib, r->alink[i]);
/* Del spool cfg if any. */
LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
LIST_REMOVE(s, _next);
free(s, M_IPFW);
}
free(r->alink, M_IPFW);
LIST_REMOVE(r, _next);
free(r, M_IPFW);
break;
default:
printf("unknown redirect mode: %u\n", r->mode);
/* XXX - panic?!?!? */
break;
}
}
}
static int
add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
{
struct cfg_redir *r, *ser_r;
struct cfg_spool *s, *ser_s;
int cnt, off, i;
char *panic_err;
for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
ser_r = (struct cfg_redir *)&buf[off];
r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO);
memcpy(r, ser_r, SOF_REDIR);
LIST_INIT(&r->spool_chain);
off += SOF_REDIR;
r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt,
M_IPFW, M_WAITOK | M_ZERO);
switch (r->mode) {
case REDIR_ADDR:
r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
r->paddr);
break;
case REDIR_PORT:
for (i = 0 ; i < r->pport_cnt; i++) {
/* If remotePort is all ports, set it to 0. */
u_short remotePortCopy = r->rport + i;
if (r->rport_cnt == 1 && r->rport == 0)
remotePortCopy = 0;
r->alink[i] = LibAliasRedirectPort(ptr->lib,
r->laddr, htons(r->lport + i), r->raddr,
htons(remotePortCopy), r->paddr,
htons(r->pport + i), r->proto);
if (r->alink[i] == NULL) {
r->alink[0] = NULL;
break;
}
}
break;
case REDIR_PROTO:
r->alink[0] = LibAliasRedirectProto(ptr->lib, r->laddr,
r->raddr, r->paddr, r->proto);
break;
default:
printf("unknown redirect mode: %u\n", r->mode);
break;
}
if (r->alink[0] == NULL) {
panic_err = "LibAliasRedirect* returned NULL";
goto bad;
} else /* LSNAT handling. */
for (i = 0; i < r->spool_cnt; i++) {
ser_s = (struct cfg_spool *)&buf[off];
s = malloc(SOF_REDIR, M_IPFW,
M_WAITOK | M_ZERO);
memcpy(s, ser_s, SOF_SPOOL);
LibAliasAddServer(ptr->lib, r->alink[0],
s->addr, htons(s->port));
off += SOF_SPOOL;
/* Hook spool entry. */
HOOK_SPOOL(&r->spool_chain, s);
}
/* And finally hook this redir entry. */
HOOK_REDIR(&ptr->redir_chain, r);
}
return (1);
bad:
/* something really bad happened: panic! */
panic("%s\n", panic_err);
}
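/*
* Worked example (not part of the original source): a REDIR_PORT
* entry with lport 80, pport 8000 and pport_cnt 10 creates ten
* alias links mapping public ports 8000-8009 onto local ports
* 80-89; with rport_cnt == 1 and rport == 0, remotePortCopy is
* forced to 0 so any remote port is accepted.
*/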
static int
ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
{
struct mbuf *mcl;
struct ip *ip;
/* XXX - libalias duct tape */
int ldt, retval;
char *c;
ldt = 0;
retval = 0;
if ((mcl = m_megapullup(m, m->m_pkthdr.len)) == NULL)
goto badnat;
ip = mtod(mcl, struct ip *);
if (args->eh == NULL) {
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
}
/*
* XXX - Libalias checksum offload 'duct tape':
*
* Locally generated packets have only the pseudo-header
* checksum calculated, and libalias will screw it up[1],
* so mark them for a later fix. Moreover, there are cases
* when libalias modifies the TCP packet data[2]; mark
* those for a later fix too.
*
* [1] libalias was never meant to run in the kernel, so
* it doesn't know anything about checksum offloading
* and expects a packet with a full internet checksum.
* Unfortunately, locally generated packets have just the
* pseudo-header checksum calculated, and when libalias
* tries to adjust the checksum it actually corrupts it.
*
* [2] when libalias modifies TCP data content, the full
* TCP checksum has to be recomputed: the problem is that
* libalias has no idea about checksum offloading. To
* work around this, we do not checksum in libalias, but
* only mark the packets in the th_x2 field. If we
* receive a marked packet, we calculate the correct
* checksum for it, aware of offloading. Why such a
* terrible hack instead of recalculating the checksum
* for each packet? Because the previous checksum was
* never checked! Recalculating checksums for EVERY
* packet would hide ALL transmission errors. Yes,
* marked packets still suffer from this problem. But,
* sigh, natd(8) has this problem, too.
*
* TODO: make libalias mbuf aware (so it can handle
* delayed checksums and TSO).
*/
if (mcl->m_pkthdr.rcvif == NULL &&
mcl->m_pkthdr.csum_flags &
CSUM_DELAY_DATA)
ldt = 1;
c = mtod(mcl, char *);
if (args->oif == NULL)
retval = LibAliasIn(t->lib, c,
mcl->m_len + M_TRAILINGSPACE(mcl));
else
retval = LibAliasOut(t->lib, c,
mcl->m_len + M_TRAILINGSPACE(mcl));
if (retval != PKT_ALIAS_OK &&
retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) {
/* XXX - should I add some logging? */
m_free(mcl);
badnat:
args->m = NULL;
return (IP_FW_DENY);
}
mcl->m_pkthdr.len = mcl->m_len =
ntohs(ip->ip_len);
/*
* XXX - libalias checksum offload
* 'duct tape' (see above)
*/
if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
ip->ip_p == IPPROTO_TCP) {
struct tcphdr *th;
th = (struct tcphdr *)(ip + 1);
if (th->th_x2)
ldt = 1;
}
if (ldt) {
struct tcphdr *th;
struct udphdr *uh;
u_short cksum;
ip->ip_len = ntohs(ip->ip_len);
cksum = in_pseudo(
ip->ip_src.s_addr,
ip->ip_dst.s_addr,
htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))
);
switch (ip->ip_p) {
case IPPROTO_TCP:
th = (struct tcphdr *)(ip + 1);
/*
* Maybe it was set in
* libalias...
*/
th->th_x2 = 0;
th->th_sum = cksum;
mcl->m_pkthdr.csum_data =
offsetof(struct tcphdr, th_sum);
break;
case IPPROTO_UDP:
uh = (struct udphdr *)(ip + 1);
uh->uh_sum = cksum;
mcl->m_pkthdr.csum_data =
offsetof(struct udphdr, uh_sum);
break;
}
/*
* No hw checksum offloading: do it
* ourselves.
*/
if ((mcl->m_pkthdr.csum_flags &
CSUM_DELAY_DATA) == 0) {
in_delayed_cksum(mcl);
mcl->m_pkthdr.csum_flags &=
~CSUM_DELAY_DATA;
}
ip->ip_len = htons(ip->ip_len);
}
if (args->eh == NULL) {
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
}
args->m = mcl;
return (IP_FW_NAT);
}
static int
ipfw_nat_cfg(struct sockopt *sopt)
{
+ INIT_VNET_IPFW(curvnet);
struct cfg_nat *ptr, *ser_n;
char *buf;
buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
sooptcopyin(sopt, buf, NAT_BUF_LEN,
sizeof(struct cfg_nat));
ser_n = (struct cfg_nat *)buf;
/*
* Find/create nat rule.
*/
IPFW_WLOCK(&V_layer3_chain);
LOOKUP_NAT(V_layer3_chain, ser_n->id, ptr);
if (ptr == NULL) {
/* New rule: allocate and init new instance. */
ptr = malloc(sizeof(struct cfg_nat),
M_IPFW, M_NOWAIT | M_ZERO);
if (ptr == NULL) {
IPFW_WUNLOCK(&V_layer3_chain);
free(buf, M_IPFW);
return (ENOSPC);
}
ptr->lib = LibAliasInit(NULL);
if (ptr->lib == NULL) {
IPFW_WUNLOCK(&V_layer3_chain);
free(ptr, M_IPFW);
free(buf, M_IPFW);
return (EINVAL);
}
LIST_INIT(&ptr->redir_chain);
} else {
/* Entry already present: temporarily unhook it. */
UNHOOK_NAT(ptr);
flush_nat_ptrs(ser_n->id);
}
IPFW_WUNLOCK(&V_layer3_chain);
/*
* Basic nat configuration.
*/
ptr->id = ser_n->id;
/*
* XXX - what if this rule doesn't nat any IP and just
* redirects?
* Do we set the alias address to 0.0.0.0?
*/
ptr->ip = ser_n->ip;
ptr->redir_cnt = ser_n->redir_cnt;
ptr->mode = ser_n->mode;
LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode);
LibAliasSetAddress(ptr->lib, ptr->ip);
memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE);
/*
* Redir and LSNAT configuration.
*/
/* Delete old cfgs. */
del_redir_spool_cfg(ptr, &ptr->redir_chain);
/* Add new entries. */
add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr);
free(buf, M_IPFW);
IPFW_WLOCK(&V_layer3_chain);
HOOK_NAT(&V_layer3_chain.nat, ptr);
IPFW_WUNLOCK(&V_layer3_chain);
return (0);
}
static int
ipfw_nat_del(struct sockopt *sopt)
{
+ INIT_VNET_IPFW(curvnet);
struct cfg_nat *ptr;
int i;
sooptcopyin(sopt, &i, sizeof i, sizeof i);
IPFW_WLOCK(&V_layer3_chain);
LOOKUP_NAT(V_layer3_chain, i, ptr);
if (ptr == NULL) {
IPFW_WUNLOCK(&V_layer3_chain);
return (EINVAL);
}
UNHOOK_NAT(ptr);
flush_nat_ptrs(i);
IPFW_WUNLOCK(&V_layer3_chain);
del_redir_spool_cfg(ptr, &ptr->redir_chain);
LibAliasUninit(ptr->lib);
free(ptr, M_IPFW);
return (0);
}
static int
ipfw_nat_get_cfg(struct sockopt *sopt)
{
+ INIT_VNET_IPFW(curvnet);
uint8_t *data;
struct cfg_nat *n;
struct cfg_redir *r;
struct cfg_spool *s;
int nat_cnt, off;
nat_cnt = 0;
off = sizeof(nat_cnt);
data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO);
IPFW_RLOCK(&V_layer3_chain);
/* Serialize all the data. */
LIST_FOREACH(n, &V_layer3_chain.nat, _next) {
nat_cnt++;
if (off + SOF_NAT < NAT_BUF_LEN) {
bcopy(n, &data[off], SOF_NAT);
off += SOF_NAT;
LIST_FOREACH(r, &n->redir_chain, _next) {
if (off + SOF_REDIR < NAT_BUF_LEN) {
bcopy(r, &data[off],
SOF_REDIR);
off += SOF_REDIR;
LIST_FOREACH(s, &r->spool_chain,
_next) {
if (off + SOF_SPOOL <
NAT_BUF_LEN) {
bcopy(s, &data[off],
SOF_SPOOL);
off += SOF_SPOOL;
} else
goto nospace;
}
} else
goto nospace;
}
} else
goto nospace;
}
bcopy(&nat_cnt, data, sizeof(nat_cnt));
IPFW_RUNLOCK(&V_layer3_chain);
sooptcopyout(sopt, data, NAT_BUF_LEN);
free(data, M_IPFW);
return (0);
nospace:
IPFW_RUNLOCK(&V_layer3_chain);
printf("serialized data buffer not big enough:"
"please increase NAT_BUF_LEN\n");
free(data, M_IPFW);
return (ENOSPC);
}
static int
ipfw_nat_get_log(struct sockopt *sopt)
{
+ INIT_VNET_IPFW(curvnet);
uint8_t *data;
struct cfg_nat *ptr;
int i, size, cnt, sof;
data = NULL;
sof = LIBALIAS_BUF_SIZE;
cnt = 0;
IPFW_RLOCK(&V_layer3_chain);
size = i = 0;
LIST_FOREACH(ptr, &V_layer3_chain.nat, _next) {
if (ptr->lib->logDesc == NULL)
continue;
cnt++;
size = cnt * (sof + sizeof(int));
data = realloc(data, size, M_IPFW, M_NOWAIT | M_ZERO);
if (data == NULL) {
IPFW_RUNLOCK(&V_layer3_chain);
return (ENOSPC);
}
bcopy(&ptr->id, &data[i], sizeof(int));
i += sizeof(int);
bcopy(ptr->lib->logDesc, &data[i], sof);
i += sof;
}
IPFW_RUNLOCK(&V_layer3_chain);
sooptcopyout(sopt, data, size);
free(data, M_IPFW);
return (0);
}
static void
ipfw_nat_init(void)
{
+ INIT_VNET_IPFW(curvnet);
IPFW_WLOCK(&V_layer3_chain);
/* init ipfw hooks */
ipfw_nat_ptr = ipfw_nat;
ipfw_nat_cfg_ptr = ipfw_nat_cfg;
ipfw_nat_del_ptr = ipfw_nat_del;
ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
ipfw_nat_get_log_ptr = ipfw_nat_get_log;
IPFW_WUNLOCK(&V_layer3_chain);
V_ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change,
NULL, EVENTHANDLER_PRI_ANY);
}
static void
ipfw_nat_destroy(void)
{
+ INIT_VNET_IPFW(curvnet);
struct ip_fw *rule;
struct cfg_nat *ptr, *ptr_temp;
IPFW_WLOCK(&V_layer3_chain);
LIST_FOREACH_SAFE(ptr, &V_layer3_chain.nat, _next, ptr_temp) {
LIST_REMOVE(ptr, _next);
del_redir_spool_cfg(ptr, &ptr->redir_chain);
LibAliasUninit(ptr->lib);
free(ptr, M_IPFW);
}
EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag);
/* flush all nat ptrs */
for (rule = V_layer3_chain.rules; rule; rule = rule->next) {
ipfw_insn_nat *cmd = (ipfw_insn_nat *)ACTION_PTR(rule);
if (cmd->o.opcode == O_NAT)
cmd->nat = NULL;
}
/* deregister ipfw_nat */
ipfw_nat_ptr = NULL;
IPFW_WUNLOCK(&V_layer3_chain);
}
static int
ipfw_nat_modevent(module_t mod, int type, void *unused)
{
int err = 0;
switch (type) {
case MOD_LOAD:
ipfw_nat_init();
break;
case MOD_UNLOAD:
ipfw_nat_destroy();
break;
default:
return EOPNOTSUPP;
}
return err;
}
static moduledata_t ipfw_nat_mod = {
"ipfw_nat",
ipfw_nat_modevent,
0
};
DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2);
MODULE_VERSION(ipfw_nat, 1);
Index: head/sys/netinet/ip_icmp.c
===================================================================
--- head/sys/netinet/ip_icmp.c (revision 183549)
+++ head/sys/netinet/ip_icmp.c (revision 183550)
@@ -1,926 +1,934 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
/*
* ICMP routines: error generation, receive packet processing,
* routines to turn packets around back to the originator, and
* host table maintenance routines.
*/
struct icmpstat icmpstat;
-SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW,
- &icmpstat, icmpstat, "");
+SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_STATS, stats,
+ CTLFLAG_RW, icmpstat, icmpstat, "");
static int icmpmaskrepl = 0;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW,
- &icmpmaskrepl, 0, "Reply to ICMP Address Mask Request packets.");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_MASKREPL, maskrepl,
+ CTLFLAG_RW, icmpmaskrepl, 0,
+ "Reply to ICMP Address Mask Request packets.");
static u_int icmpmaskfake = 0;
-SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW,
- &icmpmaskfake, 0, "Fake reply to ICMP Address Mask Request packets.");
+SYSCTL_V_UINT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW,
+ icmpmaskfake, 0, "Fake reply to ICMP Address Mask Request packets.");
static int drop_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW,
- &drop_redirect, 0, "Ignore ICMP redirects");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, drop_redirect,
+ CTLFLAG_RW, drop_redirect, 0, "Ignore ICMP redirects");
static int log_redirect = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW,
- &log_redirect, 0, "Log ICMP redirects to the console");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, log_redirect,
+ CTLFLAG_RW, log_redirect, 0, "Log ICMP redirects to the console");
static int icmplim = 200;
-SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW,
- &icmplim, 0, "Maximum number of ICMP responses per second");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, ICMPCTL_ICMPLIM, icmplim,
+ CTLFLAG_RW, icmplim, 0, "Maximum number of ICMP responses per second");
static int icmplim_output = 1;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW,
- &icmplim_output, 0, "Enable rate limiting of ICMP responses");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, icmplim_output,
+ CTLFLAG_RW, icmplim_output, 0,
+ "Enable rate limiting of ICMP responses");
static char reply_src[IFNAMSIZ];
-SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW,
- &reply_src, IFNAMSIZ, "icmp reply source for non-local packets.");
+SYSCTL_V_STRING(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, reply_src,
+ CTLFLAG_RW, reply_src, IFNAMSIZ,
+ "icmp reply source for non-local packets.");
static int icmp_rfi = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW,
- &icmp_rfi, 0, "ICMP reply from incoming interface for "
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, reply_from_interface,
+ CTLFLAG_RW, icmp_rfi, 0, "ICMP reply from incoming interface for "
"non-local packets");
static int icmp_quotelen = 8;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW,
- &icmp_quotelen, 0, "Number of bytes from original packet to "
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW,
+ icmp_quotelen, 0, "Number of bytes from original packet to "
"quote in ICMP reply");
/*
* ICMP broadcast echo sysctl
*/
static int icmpbmcastecho = 0;
-SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW,
- &icmpbmcastecho, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_icmp, OID_AUTO, bmcastecho,
+ CTLFLAG_RW, icmpbmcastecho, 0, "");
#ifdef ICMPPRINTFS
int icmpprintfs = 0;
#endif
static void icmp_reflect(struct mbuf *);
static void icmp_send(struct mbuf *, struct mbuf *);
extern struct protosw inetsw[];
/*
* Generate an error packet of type error
* in response to bad packet ip.
*/
void
icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu)
{
+ INIT_VNET_INET(curvnet);
register struct ip *oip = mtod(n, struct ip *), *nip;
register unsigned oiphlen = oip->ip_hl << 2;
register struct icmp *icp;
register struct mbuf *m;
unsigned icmplen, icmpelen, nlen;
KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__));
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_error(%p, %x, %d)\n", oip, type, code);
#endif
if (type != ICMP_REDIRECT)
V_icmpstat.icps_error++;
/*
* Don't send error:
* if the original packet was encrypted.
* if not the first fragment of message.
* in response to a multicast or broadcast packet.
* if the old packet protocol was an ICMP error message.
*/
if (n->m_flags & M_DECRYPTED)
goto freeit;
if (oip->ip_off & ~(IP_MF|IP_DF))
goto freeit;
if (n->m_flags & (M_BCAST|M_MCAST))
goto freeit;
if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
n->m_len >= oiphlen + ICMP_MINLEN &&
!ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) {
V_icmpstat.icps_oldicmp++;
goto freeit;
}
/* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */
if (oiphlen + 8 > n->m_len)
goto freeit;
/*
* Calculate length to quote from original packet and
* prevent the ICMP mbuf from overflowing.
* Unfortunately this is non-trivial since ip_forward()
* sends us truncated packets.
*/
nlen = m_length(n, NULL);
if (oip->ip_p == IPPROTO_TCP) {
struct tcphdr *th;
int tcphlen;
if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
n->m_next == NULL)
goto stdreply;
if (n->m_len < oiphlen + sizeof(struct tcphdr) &&
((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL))
goto freeit;
th = (struct tcphdr *)((caddr_t)oip + oiphlen);
tcphlen = th->th_off << 2;
if (tcphlen < sizeof(struct tcphdr))
goto freeit;
if (oip->ip_len < oiphlen + tcphlen)
goto freeit;
if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
goto stdreply;
if (n->m_len < oiphlen + tcphlen &&
((n = m_pullup(n, oiphlen + tcphlen)) == NULL))
goto freeit;
- icmpelen = max(tcphlen, min(icmp_quotelen, oip->ip_len - oiphlen));
+ icmpelen = max(tcphlen, min(V_icmp_quotelen, oip->ip_len - oiphlen));
} else
-stdreply: icmpelen = max(8, min(icmp_quotelen, oip->ip_len - oiphlen));
+stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen));
icmplen = min(oiphlen + icmpelen, nlen);
if (icmplen < sizeof(struct ip))
goto freeit;
if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
m = m_gethdr(M_DONTWAIT, MT_DATA);
else
m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
if (m == NULL)
goto freeit;
#ifdef MAC
mac_netinet_icmp_reply(n, m);
#endif
icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN);
m_align(m, ICMP_MINLEN + icmplen);
m->m_len = ICMP_MINLEN + icmplen;
/* XXX MRT make the outgoing packet use the same FIB
* that was associated with the incoming packet
*/
M_SETFIB(m, M_GETFIB(n));
icp = mtod(m, struct icmp *);
V_icmpstat.icps_outhist[type]++;
icp->icmp_type = type;
if (type == ICMP_REDIRECT)
icp->icmp_gwaddr.s_addr = dest;
else {
icp->icmp_void = 0;
/*
* The following assignments assume an overlay with the
* just zeroed icmp_void field.
*/
if (type == ICMP_PARAMPROB) {
icp->icmp_pptr = code;
code = 0;
} else if (type == ICMP_UNREACH &&
code == ICMP_UNREACH_NEEDFRAG && mtu) {
icp->icmp_nextmtu = htons(mtu);
}
}
icp->icmp_code = code;
/*
* Copy the quotation into ICMP message and
* convert quoted IP header back to network representation.
*/
m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
nip = &icp->icmp_ip;
nip->ip_len = htons(nip->ip_len);
nip->ip_off = htons(nip->ip_off);
/*
* Set up ICMP message mbuf and copy old IP header (without options)
* in front of ICMP message.
* If the original mbuf was meant to bypass the firewall, the error
* reply should bypass as well.
*/
m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
m->m_data -= sizeof(struct ip);
m->m_len += sizeof(struct ip);
m->m_pkthdr.len = m->m_len;
m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
nip = mtod(m, struct ip *);
bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
nip->ip_len = m->m_len;
nip->ip_v = IPVERSION;
nip->ip_hl = 5;
nip->ip_p = IPPROTO_ICMP;
nip->ip_tos = 0;
icmp_reflect(m);
freeit:
m_freem(n);
}
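/*
 * For illustration: the INIT_VNET_INET(curvnet) line added at the top
 * of icmp_error() brings the current vnet's inet state into scope so
 * the V_-prefixed accessors can resolve. Conceptually (a simplified
 * sketch, not the literal vimage macro bodies):
 *
 *	INIT_VNET_INET(vnet)	~ declares a local pointer to that
 *				  vnet's struct vnet_inet instance
 *	V_icmpstat		~ vnet_inet->_icmpstat
 *
 * On kernels built without VIMAGE both collapse to a no-op and to the
 * plain global, so the same source serves both configurations.
 */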
/*
* Process a received ICMP message.
*/
void
icmp_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET(curvnet);
struct icmp *icp;
struct in_ifaddr *ia;
struct ip *ip = mtod(m, struct ip *);
struct sockaddr_in icmpsrc, icmpdst, icmpgw;
int hlen = off;
int icmplen = ip->ip_len;
int i, code;
void (*ctlfunc)(int, struct sockaddr *, void *);
int fibnum;
/*
* Locate icmp structure in mbuf, and check
* that it is not corrupted and of at least minimum length.
*/
#ifdef ICMPPRINTFS
if (icmpprintfs) {
char buf[4 * sizeof "123"];
strcpy(buf, inet_ntoa(ip->ip_src));
printf("icmp_input from %s to %s, len %d\n",
buf, inet_ntoa(ip->ip_dst), icmplen);
}
#endif
if (icmplen < ICMP_MINLEN) {
V_icmpstat.icps_tooshort++;
goto freeit;
}
i = hlen + min(icmplen, ICMP_ADVLENMIN);
if (m->m_len < i && (m = m_pullup(m, i)) == 0) {
V_icmpstat.icps_tooshort++;
return;
}
ip = mtod(m, struct ip *);
m->m_len -= hlen;
m->m_data += hlen;
icp = mtod(m, struct icmp *);
if (in_cksum(m, icmplen)) {
V_icmpstat.icps_checksum++;
goto freeit;
}
m->m_len += hlen;
m->m_data -= hlen;
if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
/*
* Deliver very specific ICMP type only.
*/
switch (icp->icmp_type) {
case ICMP_UNREACH:
case ICMP_TIMXCEED:
break;
default:
goto freeit;
}
}
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_input, type %d code %d\n", icp->icmp_type,
icp->icmp_code);
#endif
/*
* Message type specific processing.
*/
if (icp->icmp_type > ICMP_MAXTYPE)
goto raw;
/* Initialize */
bzero(&icmpsrc, sizeof(icmpsrc));
icmpsrc.sin_len = sizeof(struct sockaddr_in);
icmpsrc.sin_family = AF_INET;
bzero(&icmpdst, sizeof(icmpdst));
icmpdst.sin_len = sizeof(struct sockaddr_in);
icmpdst.sin_family = AF_INET;
bzero(&icmpgw, sizeof(icmpgw));
icmpgw.sin_len = sizeof(struct sockaddr_in);
icmpgw.sin_family = AF_INET;
V_icmpstat.icps_inhist[icp->icmp_type]++;
code = icp->icmp_code;
switch (icp->icmp_type) {
case ICMP_UNREACH:
switch (code) {
case ICMP_UNREACH_NET:
case ICMP_UNREACH_HOST:
case ICMP_UNREACH_SRCFAIL:
case ICMP_UNREACH_NET_UNKNOWN:
case ICMP_UNREACH_HOST_UNKNOWN:
case ICMP_UNREACH_ISOLATED:
case ICMP_UNREACH_TOSNET:
case ICMP_UNREACH_TOSHOST:
case ICMP_UNREACH_HOST_PRECEDENCE:
case ICMP_UNREACH_PRECEDENCE_CUTOFF:
code = PRC_UNREACH_NET;
break;
case ICMP_UNREACH_NEEDFRAG:
code = PRC_MSGSIZE;
break;
/*
* RFC 1122, Sections 3.2.2.1 and 4.2.3.9.
* Treat subcodes 2,3 as immediate RST
*/
case ICMP_UNREACH_PROTOCOL:
case ICMP_UNREACH_PORT:
code = PRC_UNREACH_PORT;
break;
case ICMP_UNREACH_NET_PROHIB:
case ICMP_UNREACH_HOST_PROHIB:
case ICMP_UNREACH_FILTER_PROHIB:
code = PRC_UNREACH_ADMIN_PROHIB;
break;
default:
goto badcode;
}
goto deliver;
case ICMP_TIMXCEED:
if (code > 1)
goto badcode;
code += PRC_TIMXCEED_INTRANS;
goto deliver;
case ICMP_PARAMPROB:
if (code > 1)
goto badcode;
code = PRC_PARAMPROB;
goto deliver;
case ICMP_SOURCEQUENCH:
if (code)
goto badcode;
code = PRC_QUENCH;
deliver:
/*
* Problem with datagram; advise higher level routines.
*/
if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
V_icmpstat.icps_badlen++;
goto freeit;
}
icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len);
/* Discard ICMP's in response to multicast packets */
if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
goto badcode;
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
#endif
icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
/*
* XXX if the packet contains [IPv4 AH TCP], we can't make a
* notification to TCP layer.
*/
ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput;
if (ctlfunc)
(*ctlfunc)(code, (struct sockaddr *)&icmpsrc,
(void *)&icp->icmp_ip);
break;
badcode:
V_icmpstat.icps_badcode++;
break;
case ICMP_ECHO:
- if (!icmpbmcastecho
+ if (!V_icmpbmcastecho
&& (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
V_icmpstat.icps_bmcastecho++;
break;
}
icp->icmp_type = ICMP_ECHOREPLY;
if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
goto freeit;
else
goto reflect;
case ICMP_TSTAMP:
- if (!icmpbmcastecho
+ if (!V_icmpbmcastecho
&& (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
V_icmpstat.icps_bmcasttstamp++;
break;
}
if (icmplen < ICMP_TSLEN) {
V_icmpstat.icps_badlen++;
break;
}
icp->icmp_type = ICMP_TSTAMPREPLY;
icp->icmp_rtime = iptime();
icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */
if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
goto freeit;
else
goto reflect;
case ICMP_MASKREQ:
- if (icmpmaskrepl == 0)
+ if (V_icmpmaskrepl == 0)
break;
/*
* We are not able to respond with all ones broadcast
* unless we receive it over a point-to-point interface.
*/
if (icmplen < ICMP_MASKLEN)
break;
switch (ip->ip_dst.s_addr) {
case INADDR_BROADCAST:
case INADDR_ANY:
icmpdst.sin_addr = ip->ip_src;
break;
default:
icmpdst.sin_addr = ip->ip_dst;
}
ia = (struct in_ifaddr *)ifaof_ifpforaddr(
(struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
if (ia == 0)
break;
if (ia->ia_ifp == 0)
break;
icp->icmp_type = ICMP_MASKREPLY;
- if (icmpmaskfake == 0)
+ if (V_icmpmaskfake == 0)
icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
else
- icp->icmp_mask = icmpmaskfake;
+ icp->icmp_mask = V_icmpmaskfake;
if (ip->ip_src.s_addr == 0) {
if (ia->ia_ifp->if_flags & IFF_BROADCAST)
ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
}
reflect:
ip->ip_len += hlen; /* since ip_input deducts this */
V_icmpstat.icps_reflect++;
V_icmpstat.icps_outhist[icp->icmp_type]++;
icmp_reflect(m);
return;
case ICMP_REDIRECT:
- if (log_redirect) {
+ if (V_log_redirect) {
u_long src, dst, gw;
src = ntohl(ip->ip_src.s_addr);
dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
gw = ntohl(icp->icmp_gwaddr.s_addr);
printf("icmp redirect from %d.%d.%d.%d: "
"%d.%d.%d.%d => %d.%d.%d.%d\n",
(int)(src >> 24), (int)((src >> 16) & 0xff),
(int)((src >> 8) & 0xff), (int)(src & 0xff),
(int)(dst >> 24), (int)((dst >> 16) & 0xff),
(int)((dst >> 8) & 0xff), (int)(dst & 0xff),
(int)(gw >> 24), (int)((gw >> 16) & 0xff),
(int)((gw >> 8) & 0xff), (int)(gw & 0xff));
}
/*
* RFC1812 says we must ignore ICMP redirects if we
* are acting as router.
*/
- if (drop_redirect || V_ipforwarding)
+ if (V_drop_redirect || V_ipforwarding)
break;
if (code > 3)
goto badcode;
if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
V_icmpstat.icps_badlen++;
break;
}
/*
* Short circuit routing redirects to force
* immediate change in the kernel's routing
* tables. The message is also handed to anyone
* listening on a raw socket (e.g. the routing
* daemon for use in updating its tables).
*/
icmpgw.sin_addr = ip->ip_src;
icmpdst.sin_addr = icp->icmp_gwaddr;
#ifdef ICMPPRINTFS
if (icmpprintfs) {
char buf[4 * sizeof "123"];
strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst));
printf("redirect dst %s to %s\n",
buf, inet_ntoa(icp->icmp_gwaddr));
}
#endif
icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
in_rtredirect((struct sockaddr *)&icmpsrc,
(struct sockaddr *)&icmpdst,
(struct sockaddr *)0, RTF_GATEWAY | RTF_HOST,
(struct sockaddr *)&icmpgw, fibnum);
}
pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
#ifdef IPSEC
key_sa_routechange((struct sockaddr *)&icmpsrc);
#endif
break;
/*
* No kernel processing for the following;
* just fall through to send to raw listener.
*/
case ICMP_ECHOREPLY:
case ICMP_ROUTERADVERT:
case ICMP_ROUTERSOLICIT:
case ICMP_TSTAMPREPLY:
case ICMP_IREQREPLY:
case ICMP_MASKREPLY:
default:
break;
}
raw:
rip_input(m, off);
return;
freeit:
m_freem(m);
}
/*
* Reflect the ip packet back to the source
*/
static void
icmp_reflect(struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = mtod(m, struct ip *);
struct ifaddr *ifa;
struct ifnet *ifn;
struct in_ifaddr *ia;
struct in_addr t;
struct mbuf *opts = 0;
int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) ||
IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) {
m_freem(m); /* Bad return address */
V_icmpstat.icps_badaddr++;
goto done; /* Ip_output() will check for broadcast */
}
t = ip->ip_dst;
ip->ip_dst = ip->ip_src;
/*
* Source selection for ICMP replies:
*
* If the incoming packet was addressed directly to one of our
* own addresses, use dst as the src for the reply.
*/
LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash)
if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr)
goto match;
/*
* If the incoming packet was addressed to one of our broadcast
* addresses, use the first non-broadcast address which corresponds
* to the incoming interface.
*/
if (m->m_pkthdr.rcvif != NULL &&
m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
t.s_addr)
goto match;
}
}
/*
* If the packet was transiting through us, use the address of
* the interface the packet came through in. If that interface
* doesn't have a suitable IP address, the normal selection
* criteria apply.
*/
- if (icmp_rfi && m->m_pkthdr.rcvif != NULL) {
+ if (V_icmp_rfi && m->m_pkthdr.rcvif != NULL) {
TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
goto match;
}
}
/*
* If the incoming packet was not addressed directly to us, use
* designated interface for icmp replies specified by sysctl
* net.inet.icmp.reply_src (default not set). Otherwise continue
* with normal source selection.
*/
- if (reply_src[0] != '\0' && (ifn = ifunit(reply_src))) {
+ if (V_reply_src[0] != '\0' && (ifn = ifunit(V_reply_src))) {
TAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
goto match;
}
}
/*
* If the packet was transiting through us, use the address of
* the interface that is the closest to the packet source.
* When we don't have a route back to the packet source, stop here
* and drop the packet.
*/
ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
if (ia == NULL) {
m_freem(m);
V_icmpstat.icps_noroute++;
goto done;
}
match:
#ifdef MAC
mac_netinet_icmp_replyinplace(m);
#endif
t = IA_SIN(ia)->sin_addr;
ip->ip_src = t;
ip->ip_ttl = V_ip_defttl;
if (optlen > 0) {
register u_char *cp;
int opt, cnt;
u_int len;
/*
* Retrieve any source routing from the incoming packet;
* add on any record-route or timestamp options.
*/
cp = (u_char *) (ip + 1);
if ((opts = ip_srcroute(m)) == 0 &&
(opts = m_gethdr(M_DONTWAIT, MT_DATA))) {
opts->m_len = sizeof(struct in_addr);
mtod(opts, struct in_addr *)->s_addr = 0;
}
if (opts) {
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_reflect optlen %d rt %d => ",
optlen, opts->m_len);
#endif
for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
len = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp))
break;
len = cp[IPOPT_OLEN];
if (len < IPOPT_OLEN + sizeof(*cp) ||
len > cnt)
break;
}
/*
* Should check for overflow, but it "can't happen"
*/
if (opt == IPOPT_RR || opt == IPOPT_TS ||
opt == IPOPT_SECURITY) {
bcopy((caddr_t)cp,
mtod(opts, caddr_t) + opts->m_len, len);
opts->m_len += len;
}
}
/* Terminate & pad, if necessary */
cnt = opts->m_len % 4;
if (cnt) {
for (; cnt < 4; cnt++) {
*(mtod(opts, caddr_t) + opts->m_len) =
IPOPT_EOL;
opts->m_len++;
}
}
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("%d\n", opts->m_len);
#endif
}
/*
* Now strip out original options by copying rest of first
* mbuf's data back, and adjust the IP length.
*/
ip->ip_len -= optlen;
ip->ip_v = IPVERSION;
ip->ip_hl = 5;
m->m_len -= optlen;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len -= optlen;
optlen += sizeof(struct ip);
bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1),
(unsigned)(m->m_len - sizeof(struct ip)));
}
m_tag_delete_nonpersistent(m);
m->m_flags &= ~(M_BCAST|M_MCAST);
icmp_send(m, opts);
done:
if (opts)
(void)m_free(opts);
}
/*
* Send an icmp packet back to the ip level,
* after supplying a checksum.
*/
static void
icmp_send(struct mbuf *m, struct mbuf *opts)
{
register struct ip *ip = mtod(m, struct ip *);
register int hlen;
register struct icmp *icp;
hlen = ip->ip_hl << 2;
m->m_data += hlen;
m->m_len -= hlen;
icp = mtod(m, struct icmp *);
icp->icmp_cksum = 0;
icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen);
m->m_data -= hlen;
m->m_len += hlen;
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef ICMPPRINTFS
if (icmpprintfs) {
char buf[4 * sizeof "123"];
strcpy(buf, inet_ntoa(ip->ip_dst));
printf("icmp_send dst %s src %s\n",
buf, inet_ntoa(ip->ip_src));
}
#endif
(void) ip_output(m, opts, NULL, 0, NULL, NULL);
}
n_time
iptime(void)
{
struct timeval atv;
u_long t;
getmicrotime(&atv);
t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
return (htonl(t));
}
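/*
 * Worked example: at 01:02:03.456 UT, iptime() returns
 * htonl((1*3600 + 2*60 + 3) * 1000 + 456) == htonl(3723456), i.e.
 * milliseconds since midnight UT in network byte order -- the format
 * the ICMP timestamp fields (icmp_rtime, icmp_ttime) carry.
 */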
/*
* Return the next larger or smaller MTU plateau (table from RFC 1191)
* given current value MTU. If DIR is less than zero, a larger plateau
* is returned; otherwise, a smaller value is returned.
*/
int
ip_next_mtu(int mtu, int dir)
{
static int mtutab[] = {
65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508,
296, 68, 0
};
int i, size;
size = (sizeof mtutab) / (sizeof mtutab[0]);
if (dir >= 0) {
for (i = 0; i < size; i++)
if (mtu > mtutab[i])
return mtutab[i];
} else {
for (i = size - 1; i >= 0; i--)
if (mtu < mtutab[i])
return mtutab[i];
if (mtu == mtutab[0])
return mtutab[0];
}
return 0;
}
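/*
 * Worked examples against the plateau table above:
 *
 *	ip_next_mtu(1500, 0)  == 1492	(next smaller plateau)
 *	ip_next_mtu(1492, 0)  == 1280	(result is strictly smaller)
 *	ip_next_mtu(1500, -1) == 2002	(next larger plateau)
 *
 * ip_forward() uses the "smaller" direction as a last resort when
 * neither the route nor the outgoing interface supplies an MTU for an
 * ICMP_UNREACH_NEEDFRAG report.
 */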
/*
* badport_bandlim() - check for ICMP bandwidth limit
*
* Return 0 if it is ok to send an ICMP error response, -1 if we have
* hit our bandwidth limit and it is not ok.
*
* If icmplim is <= 0, the feature is disabled and 0 is returned.
*
* For now we separate the TCP and UDP subsystems w/ different 'which'
* values. We may eventually remove this separation (and simplify the
* code further).
*
* Note that the printing of the error message is delayed so we can
* properly print the icmp error rate that the system was trying to do
* (i.e. 22000/100 pps, etc...). This can cause long delays in printing
* the 'final' error, but it doesn't make sense to solve the printing
* delay with more complex code.
*/
int
badport_bandlim(int which)
{
+ INIT_VNET_INET(curvnet);
+
#define N(a) (sizeof (a) / sizeof (a[0]))
static struct rate {
const char *type;
struct timeval lasttime;
int curpps;
} rates[BANDLIM_MAX+1] = {
{ "icmp unreach response" },
{ "icmp ping response" },
{ "icmp tstamp response" },
{ "closed port RST response" },
{ "open port RST response" },
{ "icmp6 unreach response" }
};
/*
* Return ok status if feature disabled or argument out of range.
*/
- if (icmplim > 0 && (u_int) which < N(rates)) {
+ if (V_icmplim > 0 && (u_int) which < N(rates)) {
struct rate *r = &rates[which];
int opps = r->curpps;
- if (!ppsratecheck(&r->lasttime, &r->curpps, icmplim))
+ if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim))
return -1; /* discard packet */
/*
* If we've dropped below the threshold after having
* rate-limited traffic print the message. This preserves
* the previous behaviour at the expense of added complexity.
*/
- if (icmplim_output && opps > icmplim)
+ if (V_icmplim_output && opps > V_icmplim)
printf("Limiting %s from %d to %d packets/sec\n",
- r->type, opps, icmplim);
+ r->type, opps, V_icmplim);
}
return 0; /* okay to send packet */
#undef N
}
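/*
 * A minimal usage sketch; the ICMP_ECHO/ICMP_TSTAMP cases above and
 * TCP's RST generation follow the same idiom:
 *
 *	if (badport_bandlim(BANDLIM_RST_CLOSEDPORT) < 0)
 *		goto drop;	(over V_icmplim pps: stay silent)
 *	(otherwise build and send the response)
 *
 * ppsratecheck() does the bookkeeping, counting events in r->curpps
 * and failing once the count within the current second exceeds
 * V_icmplim.
 */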
Index: head/sys/netinet/ip_input.c
===================================================================
--- head/sys/netinet/ip_input.c (revision 183549)
+++ head/sys/netinet/ip_input.c (revision 183550)
@@ -1,1626 +1,1664 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bootp.h"
#include "opt_ipfw.h"
#include "opt_ipstealth.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_carp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/pfil.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_options.h>
#include <machine/in_cksum.h>
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#endif /* IPSEC */
#include <sys/socketvar.h>
/* XXX: Temporary until ipfw_ether and ipfw_bridge are converted. */
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <security/mac/mac_framework.h>
#ifdef CTASSERT
CTASSERT(sizeof(struct ip) == 20);
#endif
int rsvp_on = 0;
int ipforwarding = 0;
-SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW,
- &ipforwarding, 0, "Enable IP forwarding between interfaces");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_FORWARDING,
+ forwarding, CTLFLAG_RW, ipforwarding, 0,
+ "Enable IP forwarding between interfaces");
static int ipsendredirects = 1; /* XXX */
-SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW,
- &ipsendredirects, 0, "Enable sending IP redirects");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_SENDREDIRECTS,
+ redirect, CTLFLAG_RW, ipsendredirects, 0,
+ "Enable sending IP redirects");
int ip_defttl = IPDEFTTL;
-SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW,
- &ip_defttl, 0, "Maximum TTL on IP packets");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_DEFTTL,
+ ttl, CTLFLAG_RW, ip_defttl, 0, "Maximum TTL on IP packets");
static int ip_keepfaith = 0;
-SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW,
- &ip_keepfaith, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, IPCTL_KEEPFAITH,
+ keepfaith, CTLFLAG_RW, ip_keepfaith, 0,
"Enable packet capture for FAITH IPv4->IPv6 translater daemon");
static int ip_sendsourcequench = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW,
- &ip_sendsourcequench, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO,
+ sendsourcequench, CTLFLAG_RW, ip_sendsourcequench, 0,
"Enable the transmission of source quench packets");
int ip_do_randomid = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW,
- &ip_do_randomid, 0,
- "Assign random ip_id values");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, random_id,
+ CTLFLAG_RW, ip_do_randomid, 0, "Assign random ip_id values");
/*
* XXX - Setting ip_checkinterface mostly implements the receive side of
* the Strong ES model described in RFC 1122, but since the routing table
* and transmit implementation do not implement the Strong ES model,
* setting this to 1 results in an odd hybrid.
*
* XXX - ip_checkinterface currently must be disabled if you use ipnat
* to translate the destination address to another local interface.
*
* XXX - ip_checkinterface must be disabled if you add IP aliases
* to the loopback interface instead of the interface where the
* packets for those addresses are received.
*/
static int ip_checkinterface = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW,
- &ip_checkinterface, 0, "Verify packet arrives on correct interface");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO,
+ check_interface, CTLFLAG_RW, ip_checkinterface, 0,
+ "Verify packet arrives on correct interface");
struct pfil_head inet_pfil_hook; /* Packet filter hooks */
static struct ifqueue ipintrq;
static int ipqmaxlen = IFQ_MAXLEN;
extern struct domain inetdomain;
extern struct protosw inetsw[];
u_char ip_protox[IPPROTO_MAX];
struct in_ifaddrhead in_ifaddrhead; /* first inet address */
struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */
u_long in_ifaddrhmask; /* mask for hash table */
SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW,
&ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue");
SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD,
&ipintrq.ifq_drops, 0,
"Number of packets dropped from the IP input queue");
struct ipstat ipstat;
-SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
- &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");
+SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW,
+ ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)");
/*
* IP datagram reassembly.
*/
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
#define IPREASS_HASH(x,y) \
(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
static uma_zone_t ipq_zone;
static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH];
static struct mtx ipqlock;
#define IPQ_LOCK() mtx_lock(&ipqlock)
#define IPQ_UNLOCK() mtx_unlock(&ipqlock)
#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED)
static void maxnipq_update(void);
static void ipq_zone_change(void *);
static int maxnipq; /* Administrative limit on # reass queues. */
static int nipq = 0; /* Total # of reass queues */
-SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD,
- &nipq, 0, "Current number of IPv4 fragment reassembly queue entries");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, fragpackets,
+ CTLFLAG_RD, nipq, 0,
+ "Current number of IPv4 fragment reassembly queue entries");
static int maxfragsperpacket;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW,
- &maxfragsperpacket, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, maxfragsperpacket,
+ CTLFLAG_RW, maxfragsperpacket, 0,
"Maximum number of IPv4 fragments allowed per packet");
struct callout ipport_tick_callout;
#ifdef IPCTL_DEFMTU
SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
&ip_mtu, 0, "Default MTU");
#endif
#ifdef IPSTEALTH
int ipstealth = 0;
-SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
- &ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW,
+ ipstealth, 0, "IP stealth mode, no TTL decrementation on forwarding");
#endif
/*
* ipfw_ether and ipfw_bridge hooks.
* XXX: Temporary until those are converted to pfil_hooks as well.
*/
ip_fw_chk_t *ip_fw_chk_ptr = NULL;
ip_dn_io_t *ip_dn_io_ptr = NULL;
int fw_one_pass = 1;
static void ip_freef(struct ipqhead *, struct ipq *);
/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
*/
void
ip_init(void)
{
+ INIT_VNET_INET(curvnet);
struct protosw *pr;
int i;
TAILQ_INIT(&V_in_ifaddrhead);
V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == NULL)
panic("ip_init: PF_INET not found");
/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
for (i = 0; i < IPPROTO_MAX; i++)
ip_protox[i] = pr - inetsw;
/*
* Cycle through IP protocols and put them into the appropriate place
* in ip_protox[].
*/
for (pr = inetdomain.dom_protosw;
pr < inetdomain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
/* Be careful to only index valid IP protocols. */
if (pr->pr_protocol < IPPROTO_MAX)
ip_protox[pr->pr_protocol] = pr - inetsw;
}
/* Initialize packet filter hooks. */
inet_pfil_hook.ph_type = PFIL_TYPE_AF;
inet_pfil_hook.ph_af = AF_INET;
if ((i = pfil_head_register(&inet_pfil_hook)) != 0)
printf("%s: WARNING: unable to register pfil hook, "
"error %d\n", __func__, i);
/* Initialize IP reassembly queue. */
IPQ_LOCK_INIT();
for (i = 0; i < IPREASS_NHASH; i++)
TAILQ_INIT(&V_ipq[i]);
V_maxnipq = nmbclusters / 32;
V_maxfragsperpacket = 16;
V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
NULL, UMA_ALIGN_PTR, 0);
maxnipq_update();
/* Start ipport_tick. */
callout_init(&ipport_tick_callout, CALLOUT_MPSAFE);
ipport_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change,
NULL, EVENTHANDLER_PRI_ANY);
/* Initialize various other remaining things. */
V_ip_id = time_second & 0xffff;
ipintrq.ifq_maxlen = ipqmaxlen;
mtx_init(&ipintrq.ifq_mtx, "ip_inq", NULL, MTX_DEF);
netisr_register(NETISR_IP, ip_input, &ipintrq, 0);
}
void
ip_fini(void *xtp)
{
callout_stop(&ipport_tick_callout);
}
/*
* IP input routine. Checksum and byte swap header. If fragmented,
* try to reassemble. Process options. Pass to next level.
*/
void
ip_input(struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = NULL;
struct in_ifaddr *ia = NULL;
struct ifaddr *ifa;
int checkif, hlen = 0;
u_short sum;
int dchg = 0; /* dest changed after fw */
struct in_addr odst; /* original dst address */
M_ASSERTPKTHDR(m);
if (m->m_flags & M_FASTFWD_OURS) {
/*
* Firewall or NAT changed destination to local.
* We expect ip_len and ip_off to be in host byte order.
*/
m->m_flags &= ~M_FASTFWD_OURS;
/* Set up some basics that will be used later. */
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
goto ours;
}
V_ipstat.ips_total++;
if (m->m_pkthdr.len < sizeof(struct ip))
goto tooshort;
if (m->m_len < sizeof (struct ip) &&
(m = m_pullup(m, sizeof (struct ip))) == NULL) {
V_ipstat.ips_toosmall++;
return;
}
ip = mtod(m, struct ip *);
if (ip->ip_v != IPVERSION) {
V_ipstat.ips_badvers++;
goto bad;
}
hlen = ip->ip_hl << 2;
if (hlen < sizeof(struct ip)) { /* minimum header length */
V_ipstat.ips_badhlen++;
goto bad;
}
if (hlen > m->m_len) {
if ((m = m_pullup(m, hlen)) == NULL) {
V_ipstat.ips_badhlen++;
return;
}
ip = mtod(m, struct ip *);
}
/* 127/8 must not appear on wire - RFC1122 */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) {
V_ipstat.ips_badaddr++;
goto bad;
}
}
if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
} else {
if (hlen == sizeof(struct ip)) {
sum = in_cksum_hdr(ip);
} else {
sum = in_cksum(m, hlen);
}
}
if (sum) {
V_ipstat.ips_badsum++;
goto bad;
}
#ifdef ALTQ
if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
/* packet is dropped by traffic conditioner */
return;
#endif
/*
* Convert fields to host representation.
*/
ip->ip_len = ntohs(ip->ip_len);
if (ip->ip_len < hlen) {
V_ipstat.ips_badlen++;
goto bad;
}
ip->ip_off = ntohs(ip->ip_off);
/*
* Check that the amount of data in the buffers
* is at least as much as the IP header would have us expect.
* Trim mbufs if longer than we expect.
* Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len < ip->ip_len) {
tooshort:
V_ipstat.ips_tooshort++;
goto bad;
}
if (m->m_pkthdr.len > ip->ip_len) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = ip->ip_len;
m->m_pkthdr.len = ip->ip_len;
} else
m_adj(m, ip->ip_len - m->m_pkthdr.len);
}
#ifdef IPSEC
/*
* Bypass packet filtering for packets from a tunnel (gif).
*/
if (ip_ipsec_filtertunnel(m))
goto passin;
#endif /* IPSEC */
/*
* Run through list of hooks for input packets.
*
* NB: Beware of the destination address changing (e.g.
* by NAT rewriting). When this happens, tell
* ip_forward to do the right thing.
*/
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&inet_pfil_hook))
goto passin;
odst = ip->ip_dst;
if (pfil_run_hooks(&inet_pfil_hook, &m, m->m_pkthdr.rcvif,
PFIL_IN, NULL) != 0)
return;
if (m == NULL) /* consumed by filter */
return;
ip = mtod(m, struct ip *);
dchg = (odst.s_addr != ip->ip_dst.s_addr);
#ifdef IPFIREWALL_FORWARD
if (m->m_flags & M_FASTFWD_OURS) {
m->m_flags &= ~M_FASTFWD_OURS;
goto ours;
}
if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) {
/*
* Directly ship on the packet. This allows to forward packets
* that were destined for us to some other directly connected
* host.
*/
ip_forward(m, dchg);
return;
}
#endif /* IPFIREWALL_FORWARD */
passin:
/*
* Process options and, if not destined for us,
* ship it on. ip_dooptions returns 1 when an
* error was detected (causing an icmp message
* to be sent and the original packet to be freed).
*/
if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
return;
/* Greedy RSVP: snatch any PATH packet of the RSVP protocol, no matter
 * whether it is destined for another node or is a multicast packet;
 * RSVP wants it and prevents it from being forwarded anywhere else.
 * Also check that the RSVP daemon is running before grabbing the
 * packet.
*/
if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP)
goto ours;
/*
* Check our list of addresses, to see if the packet is for us.
* If we don't have any addresses, assume any unicast packet
* we receive might be for us (and let the upper layers deal
* with it).
*/
if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
(m->m_flags & (M_MCAST|M_BCAST)) == 0)
goto ours;
/*
* Enable a consistency check between the destination address
* and the arrival interface for a unicast packet (the RFC 1122
* strong ES model) if IP forwarding is disabled and the packet
* is not locally generated and the packet is not subject to
* 'ipfw fwd'.
*
* XXX - Checking also should be disabled if the destination
* address is ipnat'ed to a different interface.
*
* XXX - Checking is incompatible with IP aliases added
* to the loopback interface instead of the interface where
* the packets are received.
*
* XXX - This is the case for carp vhost IPs as well so we
* insert a workaround. If the packet got here, we already
* checked with carp_iamatch() and carp_forus().
*/
checkif = V_ip_checkinterface && (V_ipforwarding == 0) &&
m->m_pkthdr.rcvif != NULL &&
((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) &&
#ifdef DEV_CARP
!m->m_pkthdr.rcvif->if_carp &&
#endif
(dchg == 0);
/*
* Check for exact addresses in the hash bucket.
*/
LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
/*
* If the address matches, verify that the packet
* arrived via the correct interface if checking is
* enabled.
*/
if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr &&
(!checkif || ia->ia_ifp == m->m_pkthdr.rcvif))
goto ours;
}
/*
* Check for broadcast addresses.
*
* Only accept broadcast packets that arrive via the matching
* interface. Reception of forwarded directed broadcasts would
* be handled via ip_forward() and ether_output() with the loopback
* into the stack for SIMPLEX interfaces handled by ether_output().
*/
if (m->m_pkthdr.rcvif != NULL &&
m->m_pkthdr.rcvif->if_flags & IFF_BROADCAST) {
TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
ip->ip_dst.s_addr)
goto ours;
if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr)
goto ours;
#ifdef BOOTP_COMPAT
if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY)
goto ours;
#endif
}
}
/* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */
if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
V_ipstat.ips_cantforward++;
m_freem(m);
return;
}
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
struct in_multi *inm;
if (V_ip_mrouter) {
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
if (ip_mforward &&
ip_mforward(ip, m->m_pkthdr.rcvif, m, 0) != 0) {
V_ipstat.ips_cantforward++;
m_freem(m);
return;
}
/*
* The process-level routing daemon needs to receive
* all multicast IGMP packets, whether or not this
* host belongs to their destination groups.
*/
if (ip->ip_p == IPPROTO_IGMP)
goto ours;
V_ipstat.ips_forward++;
}
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
IN_MULTI_LOCK();
IN_LOOKUP_MULTI(ip->ip_dst, m->m_pkthdr.rcvif, inm);
IN_MULTI_UNLOCK();
if (inm == NULL) {
V_ipstat.ips_notmember++;
m_freem(m);
return;
}
goto ours;
}
if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
goto ours;
if (ip->ip_dst.s_addr == INADDR_ANY)
goto ours;
/*
* FAITH (Firewall Aided Internet Translator)
*/
if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
if (V_ip_keepfaith) {
if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP)
goto ours;
}
m_freem(m);
return;
}
/*
* Not for us; forward if possible and desirable.
*/
if (V_ipforwarding == 0) {
V_ipstat.ips_cantforward++;
m_freem(m);
} else {
#ifdef IPSEC
if (ip_ipsec_fwd(m))
goto bad;
#endif /* IPSEC */
ip_forward(m, dchg);
}
return;
ours:
#ifdef IPSTEALTH
/*
* IPSTEALTH: Process non-routing options only
* if the packet is destined for us.
*/
if (V_ipstealth && hlen > sizeof (struct ip) &&
ip_dooptions(m, 1))
return;
#endif /* IPSTEALTH */
/* Count the packet in the ip address stats */
if (ia != NULL) {
ia->ia_ifa.if_ipackets++;
ia->ia_ifa.if_ibytes += m->m_pkthdr.len;
}
/*
* Attempt reassembly; if it succeeds, proceed.
* ip_reass() will return a different mbuf.
*/
if (ip->ip_off & (IP_MF | IP_OFFMASK)) {
m = ip_reass(m);
if (m == NULL)
return;
ip = mtod(m, struct ip *);
/* Get the header length of the reassembled packet */
hlen = ip->ip_hl << 2;
}
/*
* Further protocols expect the packet length to be w/o the
* IP header.
*/
ip->ip_len -= hlen;
#ifdef IPSEC
/*
* enforce IPsec policy checking if we are seeing last header.
* note that we do not visit this with protocols with pcb layer
* code - like udp/tcp/raw ip.
*/
if (ip_ipsec_input(m))
goto bad;
#endif /* IPSEC */
/*
* Switch out to protocol's input routine.
*/
V_ipstat.ips_delivered++;
(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
return;
bad:
m_freem(m);
}
/*
* After maxnipq has been updated, propagate the change to UMA. The UMA zone
* max has slightly different semantics than the sysctl, for historical
* reasons.
*/
static void
maxnipq_update(void)
{
+ INIT_VNET_INET(curvnet);
/*
* -1 for unlimited allocation.
*/
if (V_maxnipq < 0)
uma_zone_set_max(V_ipq_zone, 0);
/*
* Positive number for specific bound.
*/
if (V_maxnipq > 0)
uma_zone_set_max(V_ipq_zone, V_maxnipq);
/*
* Zero specifies no further fragment queue allocation -- set the
* bound very low, but rely on implementation elsewhere to actually
* prevent allocation and reclaim current queues.
*/
if (V_maxnipq == 0)
uma_zone_set_max(V_ipq_zone, 1);
}
static void
ipq_zone_change(void *tag)
{
+ INIT_VNET_INET(curvnet);
if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) {
V_maxnipq = nmbclusters / 32;
maxnipq_update();
}
}
static int
sysctl_maxnipq(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
int error, i;
i = V_maxnipq;
error = sysctl_handle_int(oidp, &i, 0, req);
if (error || !req->newptr)
return (error);
/*
* XXXRW: Might be a good idea to sanity check the argument and place
* an extreme upper bound.
*/
if (i < -1)
return (EINVAL);
V_maxnipq = i;
maxnipq_update();
return (0);
}
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW,
NULL, 0, sysctl_maxnipq, "I",
"Maximum number of IPv4 fragment reassembly queue entries");
/*
* Take incoming datagram fragment and try to reassemble it into
* whole datagram. If the argument is the first fragment or one
* in between, the function will return NULL and store the mbuf
* in the fragment chain. If the argument is the last fragment
* the packet will be reassembled and the pointer to the new
* mbuf returned for further processing. Only m_tags attached
* to the first packet/fragment are preserved.
* The IP header is *NOT* adjusted out of iplen.
*/
struct mbuf *
ip_reass(struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip;
struct mbuf *p, *q, *nq, *t;
struct ipq *fp = NULL;
struct ipqhead *head;
int i, hlen, next;
u_int8_t ecn, ecn0;
u_short hash;
/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
V_ipstat.ips_fragments++;
V_ipstat.ips_fragdropped++;
m_freem(m);
return (NULL);
}
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
head = &V_ipq[hash];
IPQ_LOCK();
/*
* Look for queue of fragments
* of this datagram.
*/
TAILQ_FOREACH(fp, head, ipq_list)
if (ip->ip_id == fp->ipq_id &&
ip->ip_src.s_addr == fp->ipq_src.s_addr &&
ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
mac_ipq_match(m, fp) &&
#endif
ip->ip_p == fp->ipq_p)
goto found;
fp = NULL;
/*
* Attempt to trim the number of allocated fragment queues if it
* exceeds the administrative limit.
*/
if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) {
/*
* drop something from the tail of the current queue
* before proceeding further
*/
struct ipq *q = TAILQ_LAST(head, ipqhead);
if (q == NULL) { /* gak */
for (i = 0; i < IPREASS_NHASH; i++) {
struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
if (r) {
V_ipstat.ips_fragtimeout +=
r->ipq_nfrags;
ip_freef(&V_ipq[i], r);
break;
}
}
} else {
V_ipstat.ips_fragtimeout += q->ipq_nfrags;
ip_freef(head, q);
}
}
found:
/*
* Adjust ip_len to not reflect header,
* convert offset of this to bytes.
*/
ip->ip_len -= hlen;
if (ip->ip_off & IP_MF) {
/*
* Make sure that fragments have a data length
* that's a non-zero multiple of 8 bytes.
*/
if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) {
V_ipstat.ips_toosmall++; /* XXX */
goto dropfrag;
}
m->m_flags |= M_FRAG;
} else
m->m_flags &= ~M_FRAG;
ip->ip_off <<= 3;
/*
* Attempt reassembly; if it succeeds, proceed.
* ip_reass() will return a different mbuf.
*/
V_ipstat.ips_fragments++;
m->m_pkthdr.header = ip;
/* Previous ip_reass() started here. */
/*
* Presence of header sizes in mbufs
* would confuse code below.
*/
m->m_data += hlen;
m->m_len -= hlen;
/*
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
if (fp == NULL)
goto dropfrag;
#ifdef MAC
if (mac_ipq_init(fp, M_NOWAIT) != 0) {
uma_zfree(V_ipq_zone, fp);
fp = NULL;
goto dropfrag;
}
mac_ipq_create(m, fp);
#endif
TAILQ_INSERT_HEAD(head, fp, ipq_list);
V_nipq++;
fp->ipq_nfrags = 1;
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ip->ip_p;
fp->ipq_id = ip->ip_id;
fp->ipq_src = ip->ip_src;
fp->ipq_dst = ip->ip_dst;
fp->ipq_frags = m;
m->m_nextpkt = NULL;
goto done;
} else {
fp->ipq_nfrags++;
#ifdef MAC
mac_ipq_update(m, fp);
#endif
}
#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header))
/*
* Handle ECN by comparing this segment with the first one;
* if CE is set, do not lose CE.
* drop if CE and not-ECT are mixed for the same packet.
*/
ecn = ip->ip_tos & IPTOS_ECN_MASK;
ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
if (ecn == IPTOS_ECN_CE) {
if (ecn0 == IPTOS_ECN_NOTECT)
goto dropfrag;
if (ecn0 != IPTOS_ECN_CE)
GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
}
if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
goto dropfrag;
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
if (GETIP(q)->ip_off > ip->ip_off)
break;
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us, otherwise
* stick new segment in the proper place.
*
* If some of the data is dropped from the preceding
* segment, then its checksum is invalidated.
*/
if (p) {
i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
if (i > 0) {
if (i >= ip->ip_len)
goto dropfrag;
m_adj(m, i);
m->m_pkthdr.csum_flags = 0;
ip->ip_off += i;
ip->ip_len -= i;
}
m->m_nextpkt = p->m_nextpkt;
p->m_nextpkt = m;
} else {
m->m_nextpkt = fp->ipq_frags;
fp->ipq_frags = m;
}
/*
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
q = nq) {
i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
if (i < GETIP(q)->ip_len) {
GETIP(q)->ip_len -= i;
GETIP(q)->ip_off += i;
m_adj(q, i);
q->m_pkthdr.csum_flags = 0;
break;
}
nq = q->m_nextpkt;
m->m_nextpkt = nq;
V_ipstat.ips_fragdropped++;
fp->ipq_nfrags--;
m_freem(q);
}
/*
* Check for complete reassembly and perform frag per packet
* limiting.
*
* Frag limiting is performed here so that the nth frag has
* a chance to complete the packet before we drop the packet.
* As a result, n+1 frags are actually allowed per packet, but
* only n will ever be stored. (n = maxfragsperpacket.)
*
*/
next = 0;
for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
if (GETIP(q)->ip_off != next) {
if (fp->ipq_nfrags > V_maxfragsperpacket) {
V_ipstat.ips_fragdropped += fp->ipq_nfrags;
ip_freef(head, fp);
}
goto done;
}
next += GETIP(q)->ip_len;
}
/* Make sure the last packet didn't have the IP_MF flag */
if (p->m_flags & M_FRAG) {
if (fp->ipq_nfrags > V_maxfragsperpacket) {
V_ipstat.ips_fragdropped += fp->ipq_nfrags;
ip_freef(head, fp);
}
goto done;
}
/*
* Reassembly is complete. Make sure the packet is a sane size.
*/
q = fp->ipq_frags;
ip = GETIP(q);
if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
V_ipstat.ips_toolong++;
V_ipstat.ips_fragdropped += fp->ipq_nfrags;
ip_freef(head, fp);
goto done;
}
/*
* Concatenate fragments.
*/
m = q;
t = m->m_next;
m->m_next = NULL;
m_cat(m, t);
nq = q->m_nextpkt;
q->m_nextpkt = NULL;
for (q = nq; q != NULL; q = nq) {
nq = q->m_nextpkt;
q->m_nextpkt = NULL;
m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
m_cat(m, q);
}
/*
* In order to do checksumming faster we do 'end-around carry' here
* (and not in for{} loop), though it implies we are not going to
* reassemble more than 64k fragments.
*/
m->m_pkthdr.csum_data =
(m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16);
#ifdef MAC
mac_ipq_reassemble(fp, m);
mac_ipq_destroy(fp);
#endif
/*
* Create header for new ip packet by modifying header of first
* packet; dequeue and discard fragment reassembly header.
* Make header visible.
*/
ip->ip_len = (ip->ip_hl << 2) + next;
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
TAILQ_REMOVE(head, fp, ipq_list);
V_nipq--;
uma_zfree(V_ipq_zone, fp);
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
/* some debugging cruft by sklower, below, will go away soon */
if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */
m_fixhdr(m);
V_ipstat.ips_reassembled++;
IPQ_UNLOCK();
return (m);
dropfrag:
V_ipstat.ips_fragdropped++;
if (fp != NULL)
fp->ipq_nfrags--;
m_freem(m);
done:
IPQ_UNLOCK();
return (NULL);
#undef GETIP
}
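/*
 * On the offset arithmetic in ip_reass(): the wire format counts
 * fragment offsets in 8-byte units, which is why ip_off is shifted
 * left by 3 once the IP_MF/IP_OFFMASK bits have been examined. For
 * example, a fragment carrying payload bytes 1480..2959 arrives with
 * an offset field of 185 (1480 / 8); after "ip->ip_off <<= 3" the
 * queue-splicing logic can work in plain byte offsets.
 */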
/*
* Free a fragment reassembly header and all
* associated datagrams.
*/
static void
ip_freef(struct ipqhead *fhp, struct ipq *fp)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *q;
IPQ_LOCK_ASSERT();
while (fp->ipq_frags) {
q = fp->ipq_frags;
fp->ipq_frags = q->m_nextpkt;
m_freem(q);
}
TAILQ_REMOVE(fhp, fp, ipq_list);
uma_zfree(V_ipq_zone, fp);
V_nipq--;
}
/*
* IP timer processing;
* if a timer expires on a reassembly
* queue, discard it.
*/
void
ip_slowtimo(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct ipq *fp;
int i;
IPQ_LOCK();
- for (i = 0; i < IPREASS_NHASH; i++) {
- for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
- struct ipq *fpp;
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET(vnet_iter);
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
+ struct ipq *fpp;
- fpp = fp;
- fp = TAILQ_NEXT(fp, ipq_list);
- if(--fpp->ipq_ttl == 0) {
- V_ipstat.ips_fragtimeout += fpp->ipq_nfrags;
- ip_freef(&V_ipq[i], fpp);
+ fpp = fp;
+ fp = TAILQ_NEXT(fp, ipq_list);
+ if(--fpp->ipq_ttl == 0) {
+ V_ipstat.ips_fragtimeout +=
+ fpp->ipq_nfrags;
+ ip_freef(&V_ipq[i], fpp);
+ }
}
}
- }
- /*
- * If we are over the maximum number of fragments
- * (due to the limit being lowered), drain off
- * enough to get down to the new limit.
- */
- if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
- for (i = 0; i < IPREASS_NHASH; i++) {
- while (V_nipq > V_maxnipq && !TAILQ_EMPTY(&V_ipq[i])) {
- V_ipstat.ips_fragdropped +=
- TAILQ_FIRST(&V_ipq[i])->ipq_nfrags;
- ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
+ /*
+ * If we are over the maximum number of fragments
+ * (due to the limit being lowered), drain off
+ * enough to get down to the new limit.
+ */
+ if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ while (V_nipq > V_maxnipq &&
+ !TAILQ_EMPTY(&V_ipq[i])) {
+ V_ipstat.ips_fragdropped +=
+ TAILQ_FIRST(&V_ipq[i])->ipq_nfrags;
+ ip_freef(&V_ipq[i],
+ TAILQ_FIRST(&V_ipq[i]));
+ }
}
}
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IPQ_UNLOCK();
}
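/*
 * The loop structure introduced above is the standard shape for
 * timers and drain hooks that must now visit every virtual network
 * stack instead of a single global one:
 *
 *	VNET_LIST_RLOCK();
 *	VNET_FOREACH(vnet_iter) {
 *		CURVNET_SET(vnet_iter);
 *		INIT_VNET_INET(vnet_iter);
 *		(per-vnet work on the V_* state)
 *		CURVNET_RESTORE();
 *	}
 *	VNET_LIST_RUNLOCK();
 *
 * The read lock keeps the vnet list stable during iteration, while
 * CURVNET_SET/CURVNET_RESTORE scope which instance the V_ accessors
 * resolve against. ip_drain() below uses the same skeleton.
 */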
/*
* Drain off all datagram fragments.
*/
void
ip_drain(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
int i;
IPQ_LOCK();
- for (i = 0; i < IPREASS_NHASH; i++) {
- while(!TAILQ_EMPTY(&V_ipq[i])) {
- V_ipstat.ips_fragdropped +=
- TAILQ_FIRST(&V_ipq[i])->ipq_nfrags;
- ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET(vnet_iter);
+ for (i = 0; i < IPREASS_NHASH; i++) {
+ while(!TAILQ_EMPTY(&V_ipq[i])) {
+ V_ipstat.ips_fragdropped +=
+ TAILQ_FIRST(&V_ipq[i])->ipq_nfrags;
+ ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
+ }
}
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IPQ_UNLOCK();
in_rtqdrain();
}
/*
* The protocol to be inserted into ip_protox[] must be already registered
* in inetsw[], either statically or through pf_proto_register().
*/
int
ipproto_register(u_char ipproto)
{
struct protosw *pr;
/* Sanity checks. */
if (ipproto == 0)
return (EPROTONOSUPPORT);
/*
* The protocol slot must not be occupied by another protocol
* already. An index pointing to IPPROTO_RAW is unused.
*/
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == NULL)
return (EPFNOSUPPORT);
if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */
return (EEXIST);
/* Find the protocol position in inetsw[] and set the index. */
for (pr = inetdomain.dom_protosw;
pr < inetdomain.dom_protoswNPROTOSW; pr++) {
if (pr->pr_domain->dom_family == PF_INET &&
pr->pr_protocol && pr->pr_protocol == ipproto) {
/* Be careful to only index valid IP protocols. */
if (pr->pr_protocol < IPPROTO_MAX) {
ip_protox[pr->pr_protocol] = pr - inetsw;
return (0);
} else
return (EINVAL);
}
}
return (EPROTONOSUPPORT);
}
int
ipproto_unregister(u_char ipproto)
{
struct protosw *pr;
/* Sanity checks. */
if (ipproto == 0)
return (EPROTONOSUPPORT);
/* Check if the protocol was indeed registered. */
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == NULL)
return (EPFNOSUPPORT);
if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */
return (ENOENT);
/* Reset the protocol slot to IPPROTO_RAW. */
ip_protox[ipproto] = pr - inetsw;
return (0);
}
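/*
 * A minimal usage sketch for a loadable protocol (the protocol name
 * here is illustrative): after registering its protosw with
 * pf_proto_register(), the module claims its IP protocol number so
 * that ip_input() dispatches to it:
 *
 *	error = ipproto_register(IPPROTO_FOO);
 *	if (error != 0)
 *		return (error);
 *	...
 *	(void)ipproto_unregister(IPPROTO_FOO);	(on unload)
 *
 * EEXIST from ipproto_register() means another protocol already owns
 * the slot; ENOENT from ipproto_unregister() means the slot already
 * points at raw IP.
 */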
/*
* Given address of next destination (final or next hop),
* return internet address info of interface to be used to get there.
*/
struct in_ifaddr *
ip_rtaddr(struct in_addr dst, u_int fibnum)
{
struct route sro;
struct sockaddr_in *sin;
struct in_ifaddr *ifa;
bzero(&sro, sizeof(sro));
sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = dst;
in_rtalloc_ign(&sro, RTF_CLONING, fibnum);
if (sro.ro_rt == NULL)
return (NULL);
ifa = ifatoia(sro.ro_rt->rt_ifa);
RTFREE(sro.ro_rt);
return (ifa);
}
u_char inetctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, EHOSTUNREACH, 0,
ENOPROTOOPT, ECONNREFUSED
};
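/*
 * inetctlerrmap[] is indexed by the PRC_* codes that icmp_input()
 * produces above; protocols use it to turn a control input back into
 * an errno for the application. For instance, assuming the PRC_*
 * numbering from <sys/protosw.h>, an ICMP port unreachable arrives as
 * PRC_UNREACH_PORT and maps to ECONNREFUSED, while PRC_MSGSIZE (from
 * a needs-fragmentation error) maps to EMSGSIZE.
 */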
/*
* Forward a packet. If some error occurs return the sender
* an icmp packet. Note we can't always generate a meaningful
* icmp message because icmp doesn't have a large enough repertoire
* of codes and types.
*
* If not forwarding, just drop the packet. This could be confusing
* if ipforwarding was zero but some routing protocol was advancing
* us as a gateway to somewhere. However, we must let the routing
* protocol deal with that.
*
* The srcrt parameter indicates whether the packet is being forwarded
* via a source route.
*/
void
ip_forward(struct mbuf *m, int srcrt)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = mtod(m, struct ip *);
struct in_ifaddr *ia = NULL;
struct mbuf *mcopy;
struct in_addr dest;
struct route ro;
int error, type = 0, code = 0, mtu = 0;
if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
V_ipstat.ips_cantforward++;
m_freem(m);
return;
}
#ifdef IPSTEALTH
if (!V_ipstealth) {
#endif
if (ip->ip_ttl <= IPTTLDEC) {
icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
0, 0);
return;
}
#ifdef IPSTEALTH
}
#endif
ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
if (!srcrt && ia == NULL) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
return;
}
/*
* Save the IP header and at most 8 bytes of the payload,
* in case we need to generate an ICMP message to the src.
*
* XXX this can be optimized a lot by saving the data in a local
* buffer on the stack (72 bytes at most), and only allocating the
* mbuf if really necessary. The vast majority of the packets
* are forwarded without having to send an ICMP back (either
* because unnecessary, or because rate limited), so we are
* really wasting a lot of work here.
*
* We don't use m_copy() because it might return a reference
* to a shared cluster. Both this function and ip_output()
* assume exclusive access to the IP header in `m', so any
* data in a cluster may change before we reach icmp_error().
*/
MGETHDR(mcopy, M_DONTWAIT, m->m_type);
if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) {
/*
* It's probably ok if the pkthdr dup fails (because
* the deep copy of the tag chain failed), but for now
* be conservative and just discard the copy since
* code below may some day want the tags.
*/
m_free(mcopy);
mcopy = NULL;
}
if (mcopy != NULL) {
mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy));
mcopy->m_pkthdr.len = mcopy->m_len;
m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
}
#ifdef IPSTEALTH
if (!V_ipstealth) {
#endif
ip->ip_ttl -= IPTTLDEC;
#ifdef IPSTEALTH
}
#endif
/*
* If forwarding packet using same interface that it came in on,
* perhaps should send a redirect to sender to shortcut a hop.
* Only send redirect if source is sending directly to us,
* and if packet was not source routed (or has any options).
* Also, don't send redirect if forwarding using a default route
* or a route modified by a redirect.
*/
dest.s_addr = 0;
if (!srcrt && V_ipsendredirects && ia->ia_ifp == m->m_pkthdr.rcvif) {
struct sockaddr_in *sin;
struct rtentry *rt;
bzero(&ro, sizeof(ro));
sin = (struct sockaddr_in *)&ro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = ip->ip_dst;
in_rtalloc_ign(&ro, RTF_CLONING, M_GETFIB(m));
rt = ro.ro_rt;
if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 &&
satosin(rt_key(rt))->sin_addr.s_addr != 0) {
#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa))
u_long src = ntohl(ip->ip_src.s_addr);
if (RTA(rt) &&
(src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) {
if (rt->rt_flags & RTF_GATEWAY)
dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr;
else
dest.s_addr = ip->ip_dst.s_addr;
/* Router Requirements (RFC 1812) says to send only host redirects */
type = ICMP_REDIRECT;
code = ICMP_REDIRECT_HOST;
}
}
if (rt)
RTFREE(rt);
}
/*
* Try to cache the route MTU from ip_output so we can consider it for
* the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
*/
bzero(&ro, sizeof(ro));
error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
if (error == EMSGSIZE && ro.ro_rt)
mtu = ro.ro_rt->rt_rmx.rmx_mtu;
if (ro.ro_rt)
RTFREE(ro.ro_rt);
if (error)
V_ipstat.ips_cantforward++;
else {
V_ipstat.ips_forward++;
if (type)
V_ipstat.ips_redirectsent++;
else {
if (mcopy)
m_freem(mcopy);
return;
}
}
if (mcopy == NULL)
return;
switch (error) {
case 0: /* forwarded, but need redirect */
/* type, code set above */
break;
case ENETUNREACH: /* shouldn't happen, checked above */
case EHOSTUNREACH:
case ENETDOWN:
case EHOSTDOWN:
default:
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
break;
case EMSGSIZE:
type = ICMP_UNREACH;
code = ICMP_UNREACH_NEEDFRAG;
#ifdef IPSEC
/*
* If IPsec is configured for this path,
* override any MTU value possibly set by ip_output.
*/
mtu = ip_ipsec_mtu(m, mtu);
#endif /* IPSEC */
/*
* If the MTU was set before, make sure we are below the
* interface MTU.
* If the MTU wasn't set before, use the interface MTU or
* fall back to the next smaller MTU step compared to the
* current packet size.
*/
if (mtu != 0) {
if (ia != NULL)
mtu = min(mtu, ia->ia_ifp->if_mtu);
} else {
if (ia != NULL)
mtu = ia->ia_ifp->if_mtu;
else
mtu = ip_next_mtu(ip->ip_len, 0);
}
V_ipstat.ips_cantfrag++;
break;
case ENOBUFS:
/*
* A router should not generate ICMP_SOURCEQUENCH as
* required in RFC1812 Requirements for IP Version 4 Routers.
* Source quench could be a big problem under DoS attacks,
* or if the underlying interface is rate-limited.
* Those who need source quench packets may re-enable them
* via the net.inet.ip.sendsourcequench sysctl.
*/
if (V_ip_sendsourcequench == 0) {
m_freem(mcopy);
return;
} else {
type = ICMP_SOURCEQUENCH;
code = 0;
}
break;
case EACCES: /* ipfw denied packet */
m_freem(mcopy);
return;
}
icmp_error(mcopy, type, code, dest.s_addr, mtu);
}
void
ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
struct mbuf *m)
{
+ INIT_VNET_NET(inp->inp_vnet);
+
if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) {
struct bintime bt;
bintime(&bt);
if (inp->inp_socket->so_options & SO_BINTIME) {
*mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt),
SCM_BINTIME, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_socket->so_options & SO_TIMESTAMP) {
struct timeval tv;
bintime2timeval(&bt, &tv);
*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
}
if (inp->inp_flags & INP_RECVDSTADDR) {
*mp = sbcreatecontrol((caddr_t) &ip->ip_dst,
sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & INP_RECVTTL) {
*mp = sbcreatecontrol((caddr_t) &ip->ip_ttl,
sizeof(u_char), IP_RECVTTL, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
#ifdef notyet
/* XXX
* Moving these out of udp_input() made them even more broken
* than they already were.
*/
/* options were tossed already */
if (inp->inp_flags & INP_RECVOPTS) {
*mp = sbcreatecontrol((caddr_t) opts_deleted_above,
sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
/* ip_srcroute doesn't do what we want here, need to fix */
if (inp->inp_flags & INP_RECVRETOPTS) {
*mp = sbcreatecontrol((caddr_t) ip_srcroute(m),
sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
#endif
if (inp->inp_flags & INP_RECVIF) {
struct ifnet *ifp;
struct sdlbuf {
struct sockaddr_dl sdl;
u_char pad[32];
} sdlbuf;
struct sockaddr_dl *sdp;
struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
if (((ifp = m->m_pkthdr.rcvif))
&& ( ifp->if_index && (ifp->if_index <= V_if_index))) {
sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
/*
* Change our mind and don't try to copy.
*/
if ((sdp->sdl_family != AF_LINK)
|| (sdp->sdl_len > sizeof(sdlbuf))) {
goto makedummy;
}
bcopy(sdp, sdl2, sdp->sdl_len);
} else {
makedummy:
sdl2->sdl_len
= offsetof(struct sockaddr_dl, sdl_data[0]);
sdl2->sdl_family = AF_LINK;
sdl2->sdl_index = 0;
sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
}
*mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len,
IP_RECVIF, IPPROTO_IP);
if (*mp)
mp = &(*mp)->m_next;
}
}
/*
* XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
* ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
* locking. This code remains in ip_input.c as ip_mroute.c is optionally
* compiled.
*/
static int ip_rsvp_on;
struct socket *ip_rsvpd;
int
ip_rsvp_init(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
+
if (so->so_type != SOCK_RAW ||
so->so_proto->pr_protocol != IPPROTO_RSVP)
return EOPNOTSUPP;
if (V_ip_rsvpd != NULL)
return EADDRINUSE;
V_ip_rsvpd = so;
/*
* This may seem silly, but we need to be sure we don't over-increment
* the RSVP counter, in case something slips up.
*/
if (!V_ip_rsvp_on) {
V_ip_rsvp_on = 1;
V_rsvp_on++;
}
return 0;
}
int
ip_rsvp_done(void)
{
+ INIT_VNET_INET(curvnet);
+
V_ip_rsvpd = NULL;
/*
* This may seem silly, but we need to be sure we don't over-decrement
* the RSVP counter, in case something slips up.
*/
if (V_ip_rsvp_on) {
V_ip_rsvp_on = 0;
V_rsvp_on--;
}
return 0;
}
void
rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */
{
+ INIT_VNET_INET(curvnet);
+
if (rsvp_input_p) { /* call the real one if loaded */
rsvp_input_p(m, off);
return;
}
/* Can still get packets with rsvp_on = 0 if there is a local member
* of the group to which the RSVP packet is addressed. But in this
* case we want to throw the packet away.
*/
if (!V_rsvp_on) {
m_freem(m);
return;
}
if (V_ip_rsvpd != NULL) {
rip_input(m, off);
return;
}
/* Drop the packet */
m_freem(m);
}
Index: head/sys/netinet/ip_ipsec.c
===================================================================
--- head/sys/netinet/ip_ipsec.c (revision 183549)
+++ head/sys/netinet/ip_ipsec.c (revision 183550)
@@ -1,399 +1,402 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip_ipsec.h>
#include <machine/in_cksum.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#include <netipsec/key.h>
#endif /*IPSEC*/
extern struct protosw inetsw[];
/*
* Check if we have to jump over firewall processing for this packet.
* Called from ip_input().
* 1 = jump over firewall, 0 = packet goes through firewall.
*/
int
ip_ipsec_filtertunnel(struct mbuf *m)
{
#if defined(IPSEC) && !defined(IPSEC_FILTERTUNNEL)
/*
* Bypass packet filtering for packets from a tunnel.
*/
if (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL)
return 1;
#endif
return 0;
}
/*
* Check if this packet has an active SA and needs to be dropped instead
* of forwarded.
* Called from ip_input().
* 1 = drop packet, 0 = forward packet.
*/
int
ip_ipsec_fwd(struct mbuf *m)
{
#ifdef IPSEC
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_IPSEC(curvnet);
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct secpolicy *sp;
int s, error;
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
s = splnet();
if (mtag != NULL) {
tdbi = (struct tdb_ident *)(mtag + 1);
sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
} else {
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
IP_FORWARDING, &error);
}
if (sp == NULL) { /* NB: can happen if error */
splx(s);
/*XXX error stat???*/
DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/
return 1;
}
/*
* Check security policy against packet attributes.
*/
error = ipsec_in_reject(sp, m);
KEY_FREESP(&sp);
splx(s);
if (error) {
V_ipstat.ips_cantforward++;
return 1;
}
#endif /* IPSEC */
return 0;
}
/*
* Check if the protocol type doesn't have a further header and do IPsec
* decryption or rejection right now. Protocols with further headers get
* their IPsec treatment within the protocol-specific processing.
* Called from ip_input().
* 1 = drop packet, 0 = continue processing packet.
*/
int
ip_ipsec_input(struct mbuf *m)
{
struct ip *ip = mtod(m, struct ip *);
#ifdef IPSEC
+ INIT_VNET_IPSEC(curvnet);
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct secpolicy *sp;
int s, error;
/*
* Enforce IPsec policy checking if we are seeing the last header.
* Note that we do not visit this with protocols that have pcb-layer
* code, like UDP/TCP/raw IP.
*/
if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) {
/*
* Check if the packet has already had IPsec processing
* done. If so, then just pass it along. This tag gets
* set during AH, ESP, etc. input handling, before the
* packet is returned to the ip input queue for delivery.
*/
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
s = splnet();
if (mtag != NULL) {
tdbi = (struct tdb_ident *)(mtag + 1);
sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
} else {
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
IP_FORWARDING, &error);
}
if (sp != NULL) {
/*
* Check security policy against packet attributes.
*/
error = ipsec_in_reject(sp, m);
KEY_FREESP(&sp);
} else {
/* XXX error stat??? */
error = EINVAL;
DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
return 1;
}
splx(s);
if (error)
return 1;
}
#endif /* IPSEC */
return 0;
}
/*
* Compute the MTU for a forwarded packet that gets IPSEC encapsulated.
* Called from ip_forward().
* Returns MTU suggestion for ICMP needfrag reply.
*/
int
ip_ipsec_mtu(struct mbuf *m, int mtu)
{
/*
* If the packet is routed over IPsec tunnel, tell the
* originator the tunnel MTU.
* tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
* XXX quickhack!!!
*/
struct secpolicy *sp = NULL;
int ipsecerror;
int ipsechdr;
struct route *ro;
sp = ipsec_getpolicybyaddr(m,
IPSEC_DIR_OUTBOUND,
IP_FORWARDING,
&ipsecerror);
if (sp != NULL) {
/* count IPsec header size */
ipsechdr = ipsec4_hdrsiz(m,
IPSEC_DIR_OUTBOUND,
NULL);
/*
* find the correct route for outer IPv4
* header, compute tunnel MTU.
*/
if (sp->req != NULL &&
sp->req->sav != NULL &&
sp->req->sav->sah != NULL) {
ro = &sp->req->sav->sah->sa_route;
if (ro->ro_rt && ro->ro_rt->rt_ifp) {
mtu =
ro->ro_rt->rt_rmx.rmx_mtu ?
ro->ro_rt->rt_rmx.rmx_mtu :
ro->ro_rt->rt_ifp->if_mtu;
mtu -= ipsechdr;
}
}
KEY_FREESP(&sp);
}
return mtu;
}
/*
 * IPsec processing for outbound packets.
* Called from ip_output().
* 1 = drop packet, 0 = continue processing packet,
* -1 = packet was reinjected and stop processing packet
*/
int
ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
struct route **ro, struct route *iproute, struct sockaddr_in **dst,
struct in_ifaddr **ia, struct ifnet **ifp)
{
#ifdef IPSEC
struct secpolicy *sp = NULL;
struct ip *ip = mtod(*m, struct ip *);
struct tdb_ident *tdbi;
struct m_tag *mtag;
int s;
/*
* Check the security policy (SP) for the packet and, if
* required, do IPsec-related processing. There are two
* cases here; the first time a packet is sent through
* it will be untagged and handled by ipsec4_checkpolicy.
* If the packet is resubmitted to ip_output (e.g. after
* AH, ESP, etc. processing), there will be a tag to bypass
* the lookup and related policy checking.
*/
mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
s = splnet();
if (mtag != NULL) {
tdbi = (struct tdb_ident *)(mtag + 1);
sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
if (sp == NULL)
*error = -EINVAL; /* force silent drop */
m_tag_delete(*m, mtag);
} else {
sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags,
error, inp);
}
/*
* There are four return cases:
* sp != NULL apply IPsec policy
* sp == NULL, error == 0 no IPsec handling needed
* sp == NULL, error == -EINVAL discard packet w/o error
* sp == NULL, error != 0 discard packet, report error
*/
if (sp != NULL) {
/* Loop detection, check if ipsec processing already done */
KASSERT(sp->req != NULL, ("ip_output: no ipsec request"));
for (mtag = m_tag_first(*m); mtag != NULL;
mtag = m_tag_next(*m, mtag)) {
if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
continue;
if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
continue;
/*
* Check if policy has an SA associated with it.
* This can happen when an SP has yet to acquire
* an SA; e.g. on first reference. If it occurs,
* then we let ipsec4_process_packet do its thing.
*/
if (sp->req->sav == NULL)
break;
tdbi = (struct tdb_ident *)(mtag + 1);
if (tdbi->spi == sp->req->sav->spi &&
tdbi->proto == sp->req->sav->sah->saidx.proto &&
bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst,
sizeof (union sockaddr_union)) == 0) {
/*
* No IPsec processing is needed, free
* reference to SP.
*
* NB: null pointer to avoid free at
* done: below.
*/
KEY_FREESP(&sp), sp = NULL;
splx(s);
goto done;
}
}
/*
* Do delayed checksums now because we send before
* this is done in the normal processing path.
*/
if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(*m);
(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
/* NB: callee frees mbuf */
*error = ipsec4_process_packet(*m, sp->req, *flags, 0);
if (*error == EJUSTRETURN) {
/*
* We had a SP with a level of 'use' and no SA. We
* will just continue to process the packet without
* IPsec processing and return without error.
*/
*error = 0;
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
goto done;
}
/*
* Preserve KAME behaviour: ENOENT can be returned
* when an SA acquire is in progress. Don't propagate
* this to user-level; it confuses applications.
*
* XXX this will go away when the SADB is redone.
*/
if (*error == ENOENT)
*error = 0;
splx(s);
goto reinjected;
} else { /* sp == NULL */
splx(s);
if (*error != 0) {
/*
* Hack: -EINVAL is used to signal that a packet
* should be silently discarded. This is typically
* because we asked key management for an SA and
* it was delayed (e.g. kicked up to IKE).
*/
if (*error == -EINVAL)
*error = 0;
goto bad;
} else {
/* No IPsec processing for this packet. */
}
#ifdef notyet
/*
* If deferred crypto processing is needed, check that
* the interface supports it.
*/
mtag = m_tag_find(*m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL);
if (mtag != NULL && ((*ifp)->if_capenable & IFCAP_IPSEC) == 0) {
/* notify IPsec to do its own crypto */
ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1));
*error = EHOSTUNREACH;
goto bad;
}
#endif
}
done:
if (sp != NULL)
KEY_FREESP(&sp);
return 0;
reinjected:
if (sp != NULL)
KEY_FREESP(&sp);
return -1;
bad:
if (sp != NULL)
KEY_FREESP(&sp);
return 1;
#endif /* IPSEC */
return 0;
}
Index: head/sys/netinet/ip_mroute.c
===================================================================
--- head/sys/netinet/ip_mroute.c (revision 183549)
+++ head/sys/netinet/ip_mroute.c (revision 183550)
@@ -1,3152 +1,3167 @@
/*-
* Copyright (c) 1989 Stephen Deering
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* IP multicast forwarding procedures
*
* Written by David Waitzman, BBN Labs, August 1988.
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Mark J. Steiglitz, Stanford, May, 1991
* Modified by Van Jacobson, LBL, January 1993
* Modified by Ajit Thyagarajan, PARC, August 1993
* Modified by Bill Fenner, PARC, April 1995
* Modified by Ahmed Helmy, SGI, June 1996
* Modified by George Edmond Eddy (Rusty), ISI, February 1998
* Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
* Modified by Hitoshi Asaeda, WIDE, August 2000
* Modified by Pavlin Radoslavov, ICSI, October 2002
*
* MROUTING Revision: 3.5
* and PIM-SMv2 and PIM-DM support, advanced API support,
* bandwidth metering and signaling
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include "opt_mrouting.h"
#define _PIM_VT 1
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/igmp.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_encap.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/pim.h>
#include <netinet/pim_var.h>
#include <netinet/udp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/ip6_var.h>
#endif
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
/*
* Control debugging code for rsvp and multicast routing code.
* Can only set them with the debugger.
*/
static u_int rsvpdebug; /* non-zero enables debugging */
static u_int mrtdebug; /* any set of the flags below */
#define DEBUG_MFC 0x02
#define DEBUG_FORWARD 0x04
#define DEBUG_EXPIRE 0x08
#define DEBUG_XMIT 0x10
#define DEBUG_PIM 0x20
#define VIFI_INVALID ((vifi_t) -1)
#define M_HASCL(m) ((m)->m_flags & M_EXT)
static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables");
/*
* Locking. We use two locks: one for the virtual interface table and
* one for the forwarding table. These locks may be nested in which case
* the VIF lock must always be taken first. Note that each lock is used
* to cover not only the specific data structure but also related data
* structures. It may be better to add more fine-grained locking later;
* it's not clear how performance-critical this code is.
*
* XXX: This module could particularly benefit from being cleaned
* up to use the <sys/queue.h> macros.
*
*/
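/*
 * A minimal sketch of the nesting convention described above, as used
 * by add_mfc() and X_ip_mforward() below:
 *
 *	VIF_LOCK();
 *	MFC_LOCK();
 *	... touch both viftable[] and the forwarding cache ...
 *	MFC_UNLOCK();
 *	VIF_UNLOCK();
 */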
static struct mrtstat mrtstat;
SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
&mrtstat, mrtstat,
"Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)");
static struct mfc *mfctable[MFCTBLSIZ];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
&mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]",
"Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)");
static struct mtx mrouter_mtx;
#define MROUTER_LOCK() mtx_lock(&mrouter_mtx)
#define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx)
#define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED)
#define MROUTER_LOCK_INIT() \
mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
#define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx)
static struct mtx mfc_mtx;
#define MFC_LOCK() mtx_lock(&mfc_mtx)
#define MFC_UNLOCK() mtx_unlock(&mfc_mtx)
#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED)
#define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF)
#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx)
static struct vif viftable[MAXVIFS];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
&viftable, sizeof(viftable), "S,vif[MAXVIFS]",
"Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
static struct mtx vif_mtx;
#define VIF_LOCK() mtx_lock(&vif_mtx)
#define VIF_UNLOCK() mtx_unlock(&vif_mtx)
#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED)
#define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF)
#define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx)
static u_char nexpire[MFCTBLSIZ];
static eventhandler_tag if_detach_event_tag = NULL;
static struct callout expire_upcalls_ch;
#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
#define UPCALL_EXPIRE 6 /* number of timeouts */
#define ENCAP_TTL 64
/*
* Bandwidth meter variables and constants
*/
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
/*
* Pending timeouts are stored in a hash table, the key being the
* expiration time. Periodically, the entries are analysed and processed.
*/
#define BW_METER_BUCKETS 1024
static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
static struct callout bw_meter_ch;
#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */
/*
* Pending upcalls are stored in a vector which is flushed when
* full, or periodically.
*/
static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
static u_int bw_upcalls_n; /* # of pending upcalls */
static struct callout bw_upcalls_ch;
#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */
static struct pimstat pimstat;
SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
&pimstat, pimstat,
"PIM Statistics (struct pimstat, netinet/pim_var.h)");
static u_long pim_squelch_wholepkt = 0;
SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
&pim_squelch_wholepkt, 0,
"Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
extern struct domain inetdomain;
struct protosw in_pim_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_PIM,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = pim_input,
.pr_output = (pr_output_t*)rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_usrreqs = &rip_usrreqs
};
static const struct encaptab *pim_encap_cookie;
#ifdef INET6
/* ip6_mroute.c glue */
extern struct in6_protosw in6_pim_protosw;
static const struct encaptab *pim6_encap_cookie;
extern int X_ip6_mrouter_set(struct socket *, struct sockopt *);
extern int X_ip6_mrouter_get(struct socket *, struct sockopt *);
extern int X_ip6_mrouter_done(void);
extern int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
extern int X_mrt6_ioctl(int, caddr_t);
#endif
static int pim_encapcheck(const struct mbuf *, int, int, void *);
/*
* Note: the PIM Register encapsulation adds the following in front of a
* data packet:
*
* struct pim_encap_hdr {
* struct ip ip;
* struct pim_encap_pimhdr pim;
* }
*
*/
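/*
 * For scale (a sketch, assuming the usual 20-byte struct ip and the
 * 8-byte struct pim_encap_pimhdr defined below): the Register
 * encapsulation costs 28 bytes of extra header per data packet.
 */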
struct pim_encap_pimhdr {
struct pim pim;
uint32_t flags;
};
static struct ip pim_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
sizeof(struct ip) >> 2,
IPVERSION,
#else
IPVERSION,
sizeof(struct ip) >> 2,
#endif
0, /* tos */
sizeof(struct ip), /* total length */
0, /* id */
0, /* frag offset */
ENCAP_TTL,
IPPROTO_PIM,
0, /* checksum */
};
static struct pim_encap_pimhdr pim_encap_pimhdr = {
{
PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
0, /* reserved */
0, /* checksum */
},
0 /* flags */
};
static struct ifnet multicast_register_if;
static vifi_t reg_vif_num = VIFI_INVALID;
/*
* Private variables.
*/
static vifi_t numvifs;
static u_long X_ip_mcast_src(int vifi);
static int X_ip_mforward(struct ip *ip, struct ifnet *ifp,
struct mbuf *m, struct ip_moptions *imo);
static int X_ip_mrouter_done(void);
static int X_ip_mrouter_get(struct socket *so, struct sockopt *m);
static int X_ip_mrouter_set(struct socket *so, struct sockopt *m);
static int X_legal_vif_num(int vif);
static int X_mrt_ioctl(int cmd, caddr_t data, int fibnum);
static int get_sg_cnt(struct sioc_sg_req *);
static int get_vif_cnt(struct sioc_vif_req *);
static void if_detached_event(void *arg __unused, struct ifnet *);
static int ip_mrouter_init(struct socket *, int);
static int add_vif(struct vifctl *);
static int del_vif_locked(vifi_t);
static int del_vif(vifi_t);
static int add_mfc(struct mfcctl2 *);
static int del_mfc(struct mfcctl2 *);
static int set_api_config(uint32_t *); /* choose API capabilities */
static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
static int set_assert(int);
static void expire_upcalls(void *);
static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
static void phyint_send(struct ip *, struct vif *, struct mbuf *);
static void send_packet(struct vif *, struct mbuf *);
/*
* Bandwidth monitoring
*/
static void free_bw_list(struct bw_meter *list);
static int add_bw_upcall(struct bw_upcall *);
static int del_bw_upcall(struct bw_upcall *);
static void bw_meter_receive_packet(struct bw_meter *x, int plen,
struct timeval *nowp);
static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp);
static void bw_upcalls_send(void);
static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp);
static void unschedule_bw_meter(struct bw_meter *x);
static void bw_meter_process(void);
static void expire_bw_upcalls_send(void *);
static void expire_bw_meter_process(void *);
static int pim_register_send(struct ip *, struct vif *,
struct mbuf *, struct mfc *);
static int pim_register_send_rp(struct ip *, struct vif *,
struct mbuf *, struct mfc *);
static int pim_register_send_upcall(struct ip *, struct vif *,
struct mbuf *, struct mfc *);
static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
/*
* whether or not special PIM assert processing is enabled.
*/
static int pim_assert;
/*
* Rate limit for assert notification messages, in usec
*/
#define ASSERT_MSG_TIME 3000000
/*
* Kernel multicast routing API capabilities and setup.
* If more API capabilities are added to the kernel, they should be
* recorded in `mrt_api_support'.
*/
static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
MRT_MFC_FLAGS_BORDER_VIF |
MRT_MFC_RP |
MRT_MFC_BW_UPCALL);
static uint32_t mrt_api_config = 0;
/*
* Hash function for a source, group entry
*/
#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
((g) >> 20) ^ ((g) >> 10) ^ (g))
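/*
 * Illustrative note (not from the original source): MFCHASH() folds
 * each address with copies of itself shifted right by 10 and 20 bits,
 * XORs the source fold with the group fold, and reduces the result via
 * MFCHASHMOD, so bits from both the origin and the group select the
 * mfctable[] bucket.
 */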
/*
* Find a route for a given origin IP address and Multicast group address
* Statistics are updated by the caller if needed
* (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
*/
static struct mfc *
mfc_find(in_addr_t o, in_addr_t g)
{
struct mfc *rt;
MFC_LOCK_ASSERT();
for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next)
if ((rt->mfc_origin.s_addr == o) &&
(rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL))
break;
return rt;
}
/*
* Macros to compute elapsed time efficiently
* Borrowed from Van Jacobson's scheduling code
*/
#define TV_DELTA(a, b, delta) { \
int xxs; \
delta = (a).tv_usec - (b).tv_usec; \
if ((xxs = (a).tv_sec - (b).tv_sec)) { \
switch (xxs) { \
case 2: \
delta += 1000000; \
/* FALLTHROUGH */ \
case 1: \
delta += 1000000; \
break; \
default: \
delta += (1000000 * xxs); \
} \
} \
}
#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
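/*
 * Usage sketch, mirroring ip_mdq() below: TV_DELTA() leaves the elapsed
 * time in microseconds in "delta", which is then rate-limited against
 * ASSERT_MSG_TIME (3 seconds) before another assert upcall is sent:
 *
 *	GET_TIME(now);
 *	TV_DELTA(now, rt->mfc_last_assert, delta);
 *	if (delta > ASSERT_MSG_TIME)
 *		... send the IGMPMSG_WRONGVIF upcall ...
 */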
/*
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
static int
X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
{
+ INIT_VNET_INET(curvnet);
int error, optval;
vifi_t vifi;
struct vifctl vifc;
struct mfcctl2 mfc;
struct bw_upcall bw_upcall;
uint32_t i;
if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
return EPERM;
error = 0;
switch (sopt->sopt_name) {
case MRT_INIT:
error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
if (error)
break;
error = ip_mrouter_init(so, optval);
break;
case MRT_DONE:
error = ip_mrouter_done();
break;
case MRT_ADD_VIF:
error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
if (error)
break;
error = add_vif(&vifc);
break;
case MRT_DEL_VIF:
error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
if (error)
break;
error = del_vif(vifi);
break;
case MRT_ADD_MFC:
case MRT_DEL_MFC:
/*
* select data size depending on API version.
*/
if (sopt->sopt_name == MRT_ADD_MFC &&
mrt_api_config & MRT_API_FLAGS_ALL) {
error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
sizeof(struct mfcctl2));
} else {
error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
sizeof(struct mfcctl));
bzero((caddr_t)&mfc + sizeof(struct mfcctl),
sizeof(mfc) - sizeof(struct mfcctl));
}
if (error)
break;
if (sopt->sopt_name == MRT_ADD_MFC)
error = add_mfc(&mfc);
else
error = del_mfc(&mfc);
break;
case MRT_ASSERT:
error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
if (error)
break;
set_assert(optval);
break;
case MRT_API_CONFIG:
error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
if (!error)
error = set_api_config(&i);
if (!error)
error = sooptcopyout(sopt, &i, sizeof i);
break;
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
sizeof bw_upcall);
if (error)
break;
if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
error = add_bw_upcall(&bw_upcall);
else
error = del_bw_upcall(&bw_upcall);
break;
default:
error = EOPNOTSUPP;
break;
}
return error;
}
/*
* Handle MRT getsockopt commands
*/
static int
X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
{
int error;
static int version = 0x0305; /* !!! why is this here? XXX */
switch (sopt->sopt_name) {
case MRT_VERSION:
error = sooptcopyout(sopt, &version, sizeof version);
break;
case MRT_ASSERT:
error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert);
break;
case MRT_API_SUPPORT:
error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support);
break;
case MRT_API_CONFIG:
error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config);
break;
default:
error = EOPNOTSUPP;
break;
}
return error;
}
/*
* Handle ioctl commands to obtain information from the cache
*/
static int
X_mrt_ioctl(int cmd, caddr_t data, int fibnum)
{
int error = 0;
/*
* Currently the only function calling this ioctl routine is rtioctl().
* Typically, only root can create the raw socket in order to execute
* this ioctl method, however the request might be coming from a prison
*/
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error)
return (error);
switch (cmd) {
case (SIOCGETVIFCNT):
error = get_vif_cnt((struct sioc_vif_req *)data);
break;
case (SIOCGETSGCNT):
error = get_sg_cnt((struct sioc_sg_req *)data);
break;
default:
error = EINVAL;
break;
}
return error;
}
/*
* returns the packet, byte, rpf-failure count for the source group provided
*/
static int
get_sg_cnt(struct sioc_sg_req *req)
{
struct mfc *rt;
MFC_LOCK();
rt = mfc_find(req->src.s_addr, req->grp.s_addr);
if (rt == NULL) {
MFC_UNLOCK();
req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
return EADDRNOTAVAIL;
}
req->pktcnt = rt->mfc_pkt_cnt;
req->bytecnt = rt->mfc_byte_cnt;
req->wrong_if = rt->mfc_wrong_if;
MFC_UNLOCK();
return 0;
}
/*
* returns the input and output packet and byte counts on the vif provided
*/
static int
get_vif_cnt(struct sioc_vif_req *req)
{
vifi_t vifi = req->vifi;
VIF_LOCK();
if (vifi >= numvifs) {
VIF_UNLOCK();
return EINVAL;
}
req->icount = viftable[vifi].v_pkt_in;
req->ocount = viftable[vifi].v_pkt_out;
req->ibytes = viftable[vifi].v_bytes_in;
req->obytes = viftable[vifi].v_bytes_out;
VIF_UNLOCK();
return 0;
}
static void
ip_mrouter_reset(void)
{
bzero((caddr_t)mfctable, sizeof(mfctable));
bzero((caddr_t)nexpire, sizeof(nexpire));
pim_assert = 0;
mrt_api_config = 0;
callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE);
bw_upcalls_n = 0;
bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers));
callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE);
callout_init(&bw_meter_ch, CALLOUT_MPSAFE);
}
static void
if_detached_event(void *arg __unused, struct ifnet *ifp)
{
+ INIT_VNET_INET(curvnet);
vifi_t vifi;
int i;
struct mfc *mfc;
struct mfc *nmfc;
struct mfc **ppmfc; /* Pointer to previous node's next-pointer */
struct rtdetq *pq;
struct rtdetq *npq;
MROUTER_LOCK();
if (V_ip_mrouter == NULL) {
MROUTER_UNLOCK();
return;
}
/*
* Tear down multicast forwarder state associated with this ifnet.
* 1. Walk the vif list, matching vifs against this ifnet.
* 2. Walk the multicast forwarding cache (mfc) looking for
* inner matches with this vif's index.
* 3. Free any pending mbufs for this mfc.
* 4. Free the associated mfc entry and state associated with this vif.
* Be very careful about unlinking from a singly-linked list whose
* "head node" is a pointer in a simple array.
* 5. Free vif state. This should disable ALLMULTI on the interface.
*/
VIF_LOCK();
MFC_LOCK();
for (vifi = 0; vifi < numvifs; vifi++) {
if (viftable[vifi].v_ifp != ifp)
continue;
for (i = 0; i < MFCTBLSIZ; i++) {
ppmfc = &mfctable[i];
for (mfc = mfctable[i]; mfc != NULL; ) {
nmfc = mfc->mfc_next;
if (mfc->mfc_parent == vifi) {
for (pq = mfc->mfc_stall; pq != NULL; ) {
npq = pq->next;
m_freem(pq->m);
free(pq, M_MRTABLE);
pq = npq;
}
free_bw_list(mfc->mfc_bw_meter);
free(mfc, M_MRTABLE);
*ppmfc = nmfc;
} else {
ppmfc = &mfc->mfc_next;
}
mfc = nmfc;
}
}
del_vif_locked(vifi);
}
MFC_UNLOCK();
VIF_UNLOCK();
MROUTER_UNLOCK();
}
/*
* Enable multicast routing
*/
static int
ip_mrouter_init(struct socket *so, int version)
{
+ INIT_VNET_INET(curvnet);
+
if (mrtdebug)
log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
so->so_type, so->so_proto->pr_protocol);
if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
return EOPNOTSUPP;
if (version != 1)
return ENOPROTOOPT;
MROUTER_LOCK();
if (V_ip_mrouter != NULL) {
MROUTER_UNLOCK();
return EADDRINUSE;
}
if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
if (if_detach_event_tag == NULL) {
MROUTER_UNLOCK();
return (ENOMEM);
}
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
expire_bw_upcalls_send, NULL);
callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL);
V_ip_mrouter = so;
MROUTER_UNLOCK();
if (mrtdebug)
log(LOG_DEBUG, "ip_mrouter_init\n");
return 0;
}
/*
* Disable multicast routing
*/
static int
X_ip_mrouter_done(void)
{
+ INIT_VNET_INET(curvnet);
vifi_t vifi;
int i;
struct ifnet *ifp;
struct ifreq ifr;
struct mfc *rt;
struct rtdetq *rte;
MROUTER_LOCK();
if (V_ip_mrouter == NULL) {
MROUTER_UNLOCK();
return EINVAL;
}
/*
* Detach/disable hooks to the rest of the system.
*/
V_ip_mrouter = NULL;
mrt_api_config = 0;
VIF_LOCK();
/*
* For each phyint in use, disable promiscuous reception of all IP
* multicasts.
*/
for (vifi = 0; vifi < numvifs; vifi++) {
if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
!(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr);
so->sin_len = sizeof(struct sockaddr_in);
so->sin_family = AF_INET;
so->sin_addr.s_addr = INADDR_ANY;
ifp = viftable[vifi].v_ifp;
if_allmulti(ifp, 0);
}
}
bzero((caddr_t)viftable, sizeof(viftable));
numvifs = 0;
pim_assert = 0;
VIF_UNLOCK();
EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
/*
* Free all multicast forwarding cache entries.
*/
callout_stop(&expire_upcalls_ch);
callout_stop(&bw_upcalls_ch);
callout_stop(&bw_meter_ch);
MFC_LOCK();
for (i = 0; i < MFCTBLSIZ; i++) {
for (rt = mfctable[i]; rt != NULL; ) {
struct mfc *nr = rt->mfc_next;
for (rte = rt->mfc_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
}
free_bw_list(rt->mfc_bw_meter);
free(rt, M_MRTABLE);
rt = nr;
}
}
bzero((caddr_t)mfctable, sizeof(mfctable));
bzero((caddr_t)nexpire, sizeof(nexpire));
bw_upcalls_n = 0;
bzero(bw_meter_timers, sizeof(bw_meter_timers));
MFC_UNLOCK();
reg_vif_num = VIFI_INVALID;
MROUTER_UNLOCK();
if (mrtdebug)
log(LOG_DEBUG, "ip_mrouter_done\n");
return 0;
}
/*
* Set PIM assert processing global
*/
static int
set_assert(int i)
{
if ((i != 1) && (i != 0))
return EINVAL;
pim_assert = i;
return 0;
}
/*
* Configure API capabilities
*/
int
set_api_config(uint32_t *apival)
{
int i;
/*
* We can set the API capabilities only if it is the first operation
* after MRT_INIT. I.e.:
* - there are no vifs installed
* - pim_assert is not enabled
* - the MFC table is empty
*/
if (numvifs > 0) {
*apival = 0;
return EPERM;
}
if (pim_assert) {
*apival = 0;
return EPERM;
}
for (i = 0; i < MFCTBLSIZ; i++) {
if (mfctable[i] != NULL) {
*apival = 0;
return EPERM;
}
}
mrt_api_config = *apival & mrt_api_support;
*apival = mrt_api_config;
return 0;
}
/*
* Add a vif to the vif table
*/
static int
add_vif(struct vifctl *vifcp)
{
struct vif *vifp = viftable + vifcp->vifc_vifi;
struct sockaddr_in sin = {sizeof sin, AF_INET};
struct ifaddr *ifa;
struct ifnet *ifp;
int error;
VIF_LOCK();
if (vifcp->vifc_vifi >= MAXVIFS) {
VIF_UNLOCK();
return EINVAL;
}
/* rate limiting is no longer supported by this code */
if (vifcp->vifc_rate_limit != 0) {
log(LOG_ERR, "rate limiting is no longer supported\n");
VIF_UNLOCK();
return EINVAL;
}
if (vifp->v_lcl_addr.s_addr != INADDR_ANY) {
VIF_UNLOCK();
return EADDRINUSE;
}
if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) {
VIF_UNLOCK();
return EADDRNOTAVAIL;
}
/* Find the interface with an address in AF_INET family */
if (vifcp->vifc_flags & VIFF_REGISTER) {
/*
* XXX: Because VIFF_REGISTER does not really need a valid
* local interface (e.g. it could be 127.0.0.2), we don't
* check its address.
*/
ifp = NULL;
} else {
sin.sin_addr = vifcp->vifc_lcl_addr;
ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
if (ifa == NULL) {
VIF_UNLOCK();
return EADDRNOTAVAIL;
}
ifp = ifa->ifa_ifp;
}
if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
log(LOG_ERR, "tunnels are no longer supported\n");
VIF_UNLOCK();
return EOPNOTSUPP;
} else if (vifcp->vifc_flags & VIFF_REGISTER) {
ifp = &multicast_register_if;
if (mrtdebug)
log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
(void *)&multicast_register_if);
if (reg_vif_num == VIFI_INVALID) {
if_initname(&multicast_register_if, "register_vif", 0);
multicast_register_if.if_flags = IFF_LOOPBACK;
reg_vif_num = vifcp->vifc_vifi;
}
} else { /* Make sure the interface supports multicast */
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
VIF_UNLOCK();
return EOPNOTSUPP;
}
/* Enable promiscuous reception of all IP multicasts from the if */
error = if_allmulti(ifp, 1);
if (error) {
VIF_UNLOCK();
return error;
}
}
vifp->v_flags = vifcp->vifc_flags;
vifp->v_threshold = vifcp->vifc_threshold;
vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
vifp->v_ifp = ifp;
vifp->v_rsvp_on = 0;
vifp->v_rsvpd = NULL;
/* initialize per vif pkt counters */
vifp->v_pkt_in = 0;
vifp->v_pkt_out = 0;
vifp->v_bytes_in = 0;
vifp->v_bytes_out = 0;
bzero(&vifp->v_route, sizeof(vifp->v_route));
/* Adjust numvifs up if the vifi is higher than numvifs */
if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;
VIF_UNLOCK();
if (mrtdebug)
log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x\n",
vifcp->vifc_vifi,
(u_long)ntohl(vifcp->vifc_lcl_addr.s_addr),
(vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
(u_long)ntohl(vifcp->vifc_rmt_addr.s_addr),
vifcp->vifc_threshold);
return 0;
}
/*
* Delete a vif from the vif table
*/
static int
del_vif_locked(vifi_t vifi)
{
struct vif *vifp;
VIF_LOCK_ASSERT();
if (vifi >= numvifs) {
return EINVAL;
}
vifp = &viftable[vifi];
if (vifp->v_lcl_addr.s_addr == INADDR_ANY) {
return EADDRNOTAVAIL;
}
if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
if_allmulti(vifp->v_ifp, 0);
if (vifp->v_flags & VIFF_REGISTER)
reg_vif_num = VIFI_INVALID;
bzero((caddr_t)vifp, sizeof (*vifp));
if (mrtdebug)
log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs);
/* Adjust numvifs down */
for (vifi = numvifs; vifi > 0; vifi--)
if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY)
break;
numvifs = vifi;
return 0;
}
static int
del_vif(vifi_t vifi)
{
int cc;
VIF_LOCK();
cc = del_vif_locked(vifi);
VIF_UNLOCK();
return cc;
}
/*
* update an mfc entry without resetting counters and S,G addresses.
*/
static void
update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
int i;
rt->mfc_parent = mfccp->mfcc_parent;
for (i = 0; i < numvifs; i++) {
rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
MRT_MFC_FLAGS_ALL;
}
/* set the RP address */
if (mrt_api_config & MRT_MFC_RP)
rt->mfc_rp = mfccp->mfcc_rp;
else
rt->mfc_rp.s_addr = INADDR_ANY;
}
/*
* fully initialize an mfc entry from the parameter.
*/
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
rt->mfc_origin = mfccp->mfcc_origin;
rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
update_mfc_params(rt, mfccp);
/* initialize pkt counters per src-grp */
rt->mfc_pkt_cnt = 0;
rt->mfc_byte_cnt = 0;
rt->mfc_wrong_if = 0;
rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
}
/*
* Add an mfc entry
*/
static int
add_mfc(struct mfcctl2 *mfccp)
{
struct mfc *rt;
u_long hash;
struct rtdetq *rte;
u_short nstl;
VIF_LOCK();
MFC_LOCK();
rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
/* If an entry already exists, just update the fields */
if (rt) {
if (mrtdebug & DEBUG_MFC)
log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
(u_long)ntohl(mfccp->mfcc_origin.s_addr),
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
mfccp->mfcc_parent);
update_mfc_params(rt, mfccp);
MFC_UNLOCK();
VIF_UNLOCK();
return 0;
}
/*
* Find the entry for which the upcall was made and update
*/
hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {
if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
(rt->mfc_stall != NULL)) {
if (nstl++)
log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
"multiple kernel entries",
(u_long)ntohl(mfccp->mfcc_origin.s_addr),
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
mfccp->mfcc_parent, (void *)rt->mfc_stall);
if (mrtdebug & DEBUG_MFC)
log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
(u_long)ntohl(mfccp->mfcc_origin.s_addr),
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
mfccp->mfcc_parent, (void *)rt->mfc_stall);
init_mfc_params(rt, mfccp);
rt->mfc_expire = 0; /* Don't clean this guy up */
nexpire[hash]--;
/* free packets Qed at the end of this entry */
for (rte = rt->mfc_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
ip_mdq(rte->m, rte->ifp, rt, -1);
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
}
rt->mfc_stall = NULL;
}
}
/*
* It is possible that an entry is being inserted without an upcall.
*/
if (nstl == 0) {
if (mrtdebug & DEBUG_MFC)
log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
mfccp->mfcc_parent);
for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
init_mfc_params(rt, mfccp);
if (rt->mfc_expire)
nexpire[hash]--;
rt->mfc_expire = 0;
break; /* XXX */
}
}
if (rt == NULL) { /* no upcall, so make a new entry */
rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
MFC_UNLOCK();
VIF_UNLOCK();
return ENOBUFS;
}
init_mfc_params(rt, mfccp);
rt->mfc_expire = 0;
rt->mfc_stall = NULL;
rt->mfc_bw_meter = NULL;
/* insert new entry at head of hash chain */
rt->mfc_next = mfctable[hash];
mfctable[hash] = rt;
}
}
MFC_UNLOCK();
VIF_UNLOCK();
return 0;
}
/*
* Delete an mfc entry
*/
static int
del_mfc(struct mfcctl2 *mfccp)
{
struct in_addr origin;
struct in_addr mcastgrp;
struct mfc *rt;
struct mfc **nptr;
u_long hash;
struct bw_meter *list;
origin = mfccp->mfcc_origin;
mcastgrp = mfccp->mfcc_mcastgrp;
if (mrtdebug & DEBUG_MFC)
log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
(u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
MFC_LOCK();
hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
if (origin.s_addr == rt->mfc_origin.s_addr &&
mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
rt->mfc_stall == NULL)
break;
if (rt == NULL) {
MFC_UNLOCK();
return EADDRNOTAVAIL;
}
*nptr = rt->mfc_next;
/*
* free the bw_meter entries
*/
list = rt->mfc_bw_meter;
rt->mfc_bw_meter = NULL;
free(rt, M_MRTABLE);
free_bw_list(list);
MFC_UNLOCK();
return 0;
}
/*
* Send a message to the routing daemon on the multicast routing socket
*/
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
{
if (s) {
SOCKBUF_LOCK(&s->so_rcv);
if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
NULL) != 0) {
sorwakeup_locked(s);
return 0;
}
SOCKBUF_UNLOCK(&s->so_rcv);
}
m_freem(mm);
return -1;
}
/*
* IP multicast forwarding function. This function assumes that the packet
* pointed to by "ip" has arrived on (or is about to be sent to) the interface
* pointed to by "ifp", and the packet is to be relayed to other networks
* that have members of the packet's destination IP multicast group.
*
* The packet is returned unscathed to the caller, unless it is
* erroneous, in which case a non-zero return value tells the caller to
* discard it.
*/
#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */
static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
struct ip_moptions *imo)
{
+ INIT_VNET_INET(curvnet);
struct mfc *rt;
int error;
vifi_t vifi;
if (mrtdebug & DEBUG_FORWARD)
log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
(u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
(void *)ifp);
if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
/*
* Packet arrived via a physical interface or
* an encapsulated tunnel or a register_vif.
*/
} else {
/*
* Packet arrived through a source-route tunnel.
* Source-route tunnels are no longer supported.
*/
static int last_log;
if (last_log != time_uptime) {
last_log = time_uptime;
log(LOG_ERR,
"ip_mforward: received source-routed packet from %lx\n",
(u_long)ntohl(ip->ip_src.s_addr));
}
return 1;
}
VIF_LOCK();
MFC_LOCK();
if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
if (ip->ip_ttl < MAXTTL)
ip->ip_ttl++; /* compensate for -1 in *_send routines */
if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
struct vif *vifp = viftable + vifi;
printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
vifi,
(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
vifp->v_ifp->if_xname);
}
error = ip_mdq(m, ifp, NULL, vifi);
MFC_UNLOCK();
VIF_UNLOCK();
return error;
}
if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
if (!imo)
printf("In fact, no options were specified at all\n");
}
/*
* Don't forward a packet with time-to-live of zero or one,
* or a packet destined to a local-only group.
*/
if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
MFC_UNLOCK();
VIF_UNLOCK();
return 0;
}
/*
* Determine forwarding vifs from the forwarding cache table
*/
++mrtstat.mrts_mfc_lookups;
rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);
/* Entry exists, so forward if necessary */
if (rt != NULL) {
error = ip_mdq(m, ifp, rt, -1);
MFC_UNLOCK();
VIF_UNLOCK();
return error;
} else {
/*
* If we don't have a route for packet's origin,
* Make a copy of the packet & send message to routing daemon
*/
struct mbuf *mb0;
struct rtdetq *rte;
u_long hash;
int hlen = ip->ip_hl << 2;
++mrtstat.mrts_mfc_misses;
mrtstat.mrts_no_route++;
if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
(u_long)ntohl(ip->ip_src.s_addr),
(u_long)ntohl(ip->ip_dst.s_addr));
/*
* Allocate mbufs early so that we don't do extra work if we are
* just going to fail anyway. Make sure to pullup the header so
* that other people can't step on it.
*/
rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
if (rte == NULL) {
MFC_UNLOCK();
VIF_UNLOCK();
return ENOBUFS;
}
mb0 = m_copypacket(m, M_DONTWAIT);
if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
mb0 = m_pullup(mb0, hlen);
if (mb0 == NULL) {
free(rte, M_MRTABLE);
MFC_UNLOCK();
VIF_UNLOCK();
return ENOBUFS;
}
/* Is there an upcall waiting for this flow? */
hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
(ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
(rt->mfc_stall != NULL))
break;
}
if (rt == NULL) {
int i;
struct igmpmsg *im;
struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
struct mbuf *mm;
/*
* Locate the vifi for the incoming interface for this packet.
* If none found, drop packet.
*/
for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
;
if (vifi >= numvifs) /* vif not found, drop packet */
goto non_fatal;
/* no upcall, so make a new entry */
rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL)
goto fail;
/* Make a copy of the header to send to the user level process */
mm = m_copy(mb0, 0, hlen);
if (mm == NULL)
goto fail1;
/*
* Send message to routing daemon to install
* a route into the kernel table
*/
im = mtod(mm, struct igmpmsg *);
im->im_msgtype = IGMPMSG_NOCACHE;
im->im_mbz = 0;
im->im_vif = vifi;
mrtstat.mrts_upcalls++;
k_igmpsrc.sin_addr = ip->ip_src;
if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
++mrtstat.mrts_upq_sockfull;
fail1:
free(rt, M_MRTABLE);
fail:
free(rte, M_MRTABLE);
m_freem(mb0);
MFC_UNLOCK();
VIF_UNLOCK();
return ENOBUFS;
}
/* insert new entry at head of hash chain */
rt->mfc_origin.s_addr = ip->ip_src.s_addr;
rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
rt->mfc_expire = UPCALL_EXPIRE;
nexpire[hash]++;
for (i = 0; i < numvifs; i++) {
rt->mfc_ttls[i] = 0;
rt->mfc_flags[i] = 0;
}
rt->mfc_parent = -1;
rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */
rt->mfc_bw_meter = NULL;
/* link into table */
rt->mfc_next = mfctable[hash];
mfctable[hash] = rt;
rt->mfc_stall = rte;
} else {
/* determine if q has overflowed */
int npkts = 0;
struct rtdetq **p;
/*
* XXX ouch! we need to append to the list, but we
* only have a pointer to the front, so we have to
* scan the entire list every time.
*/
for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
npkts++;
if (npkts > MAX_UPQ) {
mrtstat.mrts_upq_ovflw++;
non_fatal:
free(rte, M_MRTABLE);
m_freem(mb0);
MFC_UNLOCK();
VIF_UNLOCK();
return 0;
}
/* Add this entry to the end of the queue */
*p = rte;
}
rte->m = mb0;
rte->ifp = ifp;
rte->next = NULL;
MFC_UNLOCK();
VIF_UNLOCK();
return 0;
}
}
/*
* Clean up the cache entry if upcall is not serviced
*/
static void
expire_upcalls(void *unused)
{
struct rtdetq *rte;
struct mfc *mfc, **nptr;
int i;
MFC_LOCK();
for (i = 0; i < MFCTBLSIZ; i++) {
if (nexpire[i] == 0)
continue;
nptr = &mfctable[i];
for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
/*
* Skip real cache entries.
* Make sure the entry wasn't marked not to expire (shouldn't
* happen), and check whether it expires now.
*/
if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
--mfc->mfc_expire == 0) {
if (mrtdebug & DEBUG_EXPIRE)
log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
(u_long)ntohl(mfc->mfc_origin.s_addr),
(u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
/*
* Drop all the packets: free each queue entry along with its
* packet, interface, and timing info.
*/
for (rte = mfc->mfc_stall; rte; ) {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
}
++mrtstat.mrts_cache_cleanups;
nexpire[i]--;
/*
* free the bw_meter entries
*/
while (mfc->mfc_bw_meter != NULL) {
struct bw_meter *x = mfc->mfc_bw_meter;
mfc->mfc_bw_meter = x->bm_mfc_next;
free(x, M_BWMETER);
}
*nptr = mfc->mfc_next;
free(mfc, M_MRTABLE);
} else {
nptr = &mfc->mfc_next;
}
}
}
MFC_UNLOCK();
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
}
/*
* Packet forwarding routine once entry in the cache is made
*/
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = mtod(m, struct ip *);
vifi_t vifi;
int plen = ip->ip_len;
VIF_LOCK_ASSERT();
/*
* If xmt_vif is not -1, send on only the requested vif.
*
* (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
*/
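/*
 * Concretely: VIFI_INVALID is ((vifi_t) -1), i.e. 0xffff, so the single
 * "xmt_vif < numvifs" comparison below covers both the "no vif
 * requested" sentinel and any out-of-range value.
 */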
if (xmt_vif < numvifs) {
if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
pim_register_send(ip, viftable + xmt_vif, m, rt);
else
phyint_send(ip, viftable + xmt_vif, m);
return 1;
}
/*
* Don't forward if it didn't arrive from the parent vif for its origin.
*/
vifi = rt->mfc_parent;
if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
/* came in the wrong interface */
if (mrtdebug & DEBUG_FORWARD)
log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
(void *)ifp, vifi, (void *)viftable[vifi].v_ifp);
++mrtstat.mrts_wrong_if;
++rt->mfc_wrong_if;
/*
* If we are doing PIM assert processing, send a message
* to the routing daemon.
*
* XXX: A PIM-SM router needs the WRONGVIF detection so it
* can complete the SPT switch, regardless of the type
* of the iif (broadcast media, GRE tunnel, etc).
*/
if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
struct timeval now;
u_long delta;
if (ifp == &multicast_register_if)
pimstat.pims_rcv_registers_wrongiif++;
/* Get vifi for the incoming packet */
for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
;
if (vifi >= numvifs)
return 0; /* The iif is not found: ignore the packet. */
if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
return 0; /* WRONGVIF disabled: ignore the packet */
GET_TIME(now);
TV_DELTA(now, rt->mfc_last_assert, delta);
if (delta > ASSERT_MSG_TIME) {
struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
struct igmpmsg *im;
int hlen = ip->ip_hl << 2;
struct mbuf *mm = m_copy(m, 0, hlen);
if (mm && (M_HASCL(mm) || mm->m_len < hlen))
mm = m_pullup(mm, hlen);
if (mm == NULL)
return ENOBUFS;
rt->mfc_last_assert = now;
im = mtod(mm, struct igmpmsg *);
im->im_msgtype = IGMPMSG_WRONGVIF;
im->im_mbz = 0;
im->im_vif = vifi;
mrtstat.mrts_upcalls++;
k_igmpsrc.sin_addr = im->im_src;
if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
log(LOG_WARNING,
"ip_mforward: ip_mrouter socket queue full\n");
++mrtstat.mrts_upq_sockfull;
return ENOBUFS;
}
}
}
return 0;
}
/* If I sourced this packet, it counts as output, else it was input. */
if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
viftable[vifi].v_pkt_out++;
viftable[vifi].v_bytes_out += plen;
} else {
viftable[vifi].v_pkt_in++;
viftable[vifi].v_bytes_in += plen;
}
rt->mfc_pkt_cnt++;
rt->mfc_byte_cnt += plen;
/*
* For each vif, decide if a copy of the packet should be forwarded.
* Forward if:
* - the ttl exceeds the vif's threshold
* - there are group members downstream on interface
*/
for (vifi = 0; vifi < numvifs; vifi++)
if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
viftable[vifi].v_pkt_out++;
viftable[vifi].v_bytes_out += plen;
if (viftable[vifi].v_flags & VIFF_REGISTER)
pim_register_send(ip, viftable + vifi, m, rt);
else
phyint_send(ip, viftable + vifi, m);
}
/*
* Perform upcall-related bw measuring.
*/
if (rt->mfc_bw_meter != NULL) {
struct bw_meter *x;
struct timeval now;
GET_TIME(now);
MFC_LOCK_ASSERT();
for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
bw_meter_receive_packet(x, plen, &now);
}
return 0;
}
/*
* check if a vif number is legal/ok. This is used by ip_output.
*/
static int
X_legal_vif_num(int vif)
{
/* XXX unlocked, matter? */
return (vif >= 0 && vif < numvifs);
}
/*
* Return the local address used by this vif
*/
static u_long
X_ip_mcast_src(int vifi)
{
/* XXX unlocked, matter? */
if (vifi >= 0 && vifi < numvifs)
return viftable[vifi].v_lcl_addr.s_addr;
else
return INADDR_ANY;
}
static void
phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
struct mbuf *mb_copy;
int hlen = ip->ip_hl << 2;
VIF_LOCK_ASSERT();
/*
* Make a new reference to the packet; make sure that
* the IP header is actually copied, not just referenced,
* so that ip_output() only scribbles on the copy.
*/
mb_copy = m_copypacket(m, M_DONTWAIT);
if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
mb_copy = m_pullup(mb_copy, hlen);
if (mb_copy == NULL)
return;
send_packet(vifp, mb_copy);
}
static void
send_packet(struct vif *vifp, struct mbuf *m)
{
struct ip_moptions imo;
struct in_multi *imm[2];
int error;
VIF_LOCK_ASSERT();
imo.imo_multicast_ifp = vifp->v_ifp;
imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
imo.imo_multicast_loop = 1;
imo.imo_multicast_vif = -1;
imo.imo_num_memberships = 0;
imo.imo_max_memberships = 2;
imo.imo_membership = &imm[0];
/*
* Re-entrancy should not be a problem here, because
* the packets that we send out and are looped back at us
* should get rejected because they appear to come from
* the loopback interface, thus preventing looping.
*/
error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
if (mrtdebug & DEBUG_XMIT) {
log(LOG_DEBUG, "phyint_send on vif %td err %d\n",
vifp - viftable, error);
}
}
static int
X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt)
{
+ INIT_VNET_INET(curvnet);
int error, vifi;
if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
return EOPNOTSUPP;
error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
if (error)
return error;
VIF_LOCK();
if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */
VIF_UNLOCK();
return EADDRNOTAVAIL;
}
if (sopt->sopt_name == IP_RSVP_VIF_ON) {
/* Check if socket is available. */
if (viftable[vifi].v_rsvpd != NULL) {
VIF_UNLOCK();
return EADDRINUSE;
}
viftable[vifi].v_rsvpd = so;
/* This may seem silly, but we need to be sure we don't over-increment
* the RSVP counter, in case something slips up.
*/
if (!viftable[vifi].v_rsvp_on) {
viftable[vifi].v_rsvp_on = 1;
V_rsvp_on++;
}
} else { /* must be VIF_OFF */
/*
* XXX as an additional consistency check, one could make sure
* that viftable[vifi].v_rsvpd == so, otherwise passing so as
* first parameter is pretty useless.
*/
viftable[vifi].v_rsvpd = NULL;
/*
* This may seem silly, but we need to be sure we don't over-decrement
* the RSVP counter, in case something slips up.
*/
if (viftable[vifi].v_rsvp_on) {
viftable[vifi].v_rsvp_on = 0;
V_rsvp_on--;
}
}
VIF_UNLOCK();
return 0;
}
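/*
 * A minimal userland sketch of reaching this code (the descriptor name is
 * a placeholder; the socket must be SOCK_RAW/IPPROTO_RSVP, as checked
 * above):
 *
 *	int vifi = 0;
 *	setsockopt(rsvp_fd, IPPROTO_IP, IP_RSVP_VIF_ON,
 *	    &vifi, sizeof(vifi));
 */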
static void
X_ip_rsvp_force_done(struct socket *so)
{
+ INIT_VNET_INET(curvnet);
int vifi;
/* Don't bother if it is not the right type of socket. */
if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
return;
VIF_LOCK();
/* The socket may be attached to more than one vif...this
* is perfectly legal.
*/
for (vifi = 0; vifi < numvifs; vifi++) {
if (viftable[vifi].v_rsvpd == so) {
viftable[vifi].v_rsvpd = NULL;
/* This may seem silly, but we need to be sure we don't
* over-decrement the RSVP counter, in case something slips up.
*/
if (viftable[vifi].v_rsvp_on) {
viftable[vifi].v_rsvp_on = 0;
V_rsvp_on--;
}
}
}
VIF_UNLOCK();
}
static void
X_rsvp_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET(curvnet);
int vifi;
struct ip *ip = mtod(m, struct ip *);
struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
struct ifnet *ifp;
if (rsvpdebug)
printf("rsvp_input: rsvp_on %d\n", V_rsvp_on);
/* Can still get packets with rsvp_on = 0 if there is a local member
* of the group to which the RSVP packet is addressed. But in this
* case we want to throw the packet away.
*/
if (!V_rsvp_on) {
m_freem(m);
return;
}
if (rsvpdebug)
printf("rsvp_input: check vifs\n");
#ifdef DIAGNOSTIC
M_ASSERTPKTHDR(m);
#endif
ifp = m->m_pkthdr.rcvif;
VIF_LOCK();
/* Find which vif the packet arrived on. */
for (vifi = 0; vifi < numvifs; vifi++)
if (viftable[vifi].v_ifp == ifp)
break;
if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
/*
* Drop the lock here to avoid holding it across rip_input.
* This could make rsvpdebug printfs wrong. If you care,
* record the state of stuff before dropping the lock.
*/
VIF_UNLOCK();
/*
* If the old-style non-vif-associated socket is set,
* then use it. Otherwise, drop packet since there
* is no specific socket for this vif.
*/
if (V_ip_rsvpd != NULL) {
if (rsvpdebug)
printf("rsvp_input: Sending packet up old-style socket\n");
rip_input(m, off); /* xxx */
} else {
if (rsvpdebug && vifi == numvifs)
printf("rsvp_input: Can't find vif for packet.\n");
else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
printf("rsvp_input: No socket defined for vif %d\n",vifi);
m_freem(m);
}
return;
}
rsvp_src.sin_addr = ip->ip_src;
if (rsvpdebug && m)
printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));
if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
if (rsvpdebug)
printf("rsvp_input: Failed to append to socket\n");
} else {
if (rsvpdebug)
printf("rsvp_input: send packet up\n");
}
VIF_UNLOCK();
}
/*
* Code for bandwidth monitors
*/
/*
* Define common interface for timeval-related methods
*/
#define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
#define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
#define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
static uint32_t
compute_bw_meter_flags(struct bw_upcall *req)
{
uint32_t flags = 0;
if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
flags |= BW_METER_UNIT_PACKETS;
if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
flags |= BW_METER_UNIT_BYTES;
if (req->bu_flags & BW_UPCALL_GEQ)
flags |= BW_METER_GEQ;
if (req->bu_flags & BW_UPCALL_LEQ)
flags |= BW_METER_LEQ;
return flags;
}
/*
* Add a bw_meter entry
*/
static int
add_bw_upcall(struct bw_upcall *req)
{
struct mfc *mfc;
struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
struct timeval now;
struct bw_meter *x;
uint32_t flags;
if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
return EOPNOTSUPP;
/* Test if the flags are valid */
if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
return EINVAL;
if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
return EINVAL;
if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
== (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
return EINVAL;
/* Test if the threshold time interval is valid */
if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
return EINVAL;
flags = compute_bw_meter_flags(req);
/*
* Check whether we already have the same bw_meter entry
*/
MFC_LOCK();
mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
if (mfc == NULL) {
MFC_UNLOCK();
return EADDRNOTAVAIL;
}
for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
&req->bu_threshold.b_time, ==)) &&
(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
(x->bm_flags & BW_METER_USER_FLAGS) == flags) {
MFC_UNLOCK();
return 0; /* XXX Already installed */
}
}
/* Allocate the new bw_meter entry */
x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
if (x == NULL) {
MFC_UNLOCK();
return ENOBUFS;
}
/* Set the new bw_meter entry */
x->bm_threshold.b_time = req->bu_threshold.b_time;
GET_TIME(now);
x->bm_start_time = now;
x->bm_threshold.b_packets = req->bu_threshold.b_packets;
x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
x->bm_measured.b_packets = 0;
x->bm_measured.b_bytes = 0;
x->bm_flags = flags;
x->bm_time_next = NULL;
x->bm_time_hash = BW_METER_BUCKETS;
/* Add the new bw_meter entry to the front of entries for this MFC */
x->bm_mfc = mfc;
x->bm_mfc_next = mfc->mfc_bw_meter;
mfc->mfc_bw_meter = x;
schedule_bw_meter(x, &now);
MFC_UNLOCK();
return 0;
}
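/*
 * A minimal userland sketch of installing such an entry via setsockopt(2)
 * on the multicast routing socket (addresses and the descriptor name are
 * placeholders):
 *
 *	struct bw_upcall req;
 *	bzero(&req, sizeof(req));
 *	req.bu_src.s_addr = inet_addr("10.0.0.1");	(source S)
 *	req.bu_dst.s_addr = inet_addr("239.1.1.1");	(group G)
 *	req.bu_flags = BW_UPCALL_UNIT_PACKETS | BW_UPCALL_GEQ;
 *	req.bu_threshold.b_time.tv_sec = 3;
 *	req.bu_threshold.b_packets = 1000;
 *	setsockopt(mrouter_fd, IPPROTO_IP, MRT_ADD_BW_UPCALL,
 *	    &req, sizeof(req));
 */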
static void
free_bw_list(struct bw_meter *list)
{
while (list != NULL) {
struct bw_meter *x = list;
list = list->bm_mfc_next;
unschedule_bw_meter(x);
free(x, M_BWMETER);
}
}
/*
* Delete one or multiple bw_meter entries
*/
static int
del_bw_upcall(struct bw_upcall *req)
{
struct mfc *mfc;
struct bw_meter *x;
if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
return EOPNOTSUPP;
MFC_LOCK();
/* Find the corresponding MFC entry */
mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
if (mfc == NULL) {
MFC_UNLOCK();
return EADDRNOTAVAIL;
} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
/*
* Delete all bw_meter entries for this mfc
*/
struct bw_meter *list;
list = mfc->mfc_bw_meter;
mfc->mfc_bw_meter = NULL;
free_bw_list(list);
MFC_UNLOCK();
return 0;
} else { /* Delete a single bw_meter entry */
struct bw_meter *prev;
uint32_t flags = 0;
flags = compute_bw_meter_flags(req);
/* Find the bw_meter entry to delete */
for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
prev = x, x = x->bm_mfc_next) {
if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
&req->bu_threshold.b_time, ==)) &&
(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
(x->bm_flags & BW_METER_USER_FLAGS) == flags)
break;
}
if (x != NULL) { /* Delete entry from the list for this MFC */
if (prev != NULL)
prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/
else
x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
unschedule_bw_meter(x);
MFC_UNLOCK();
/* Free the bw_meter entry */
free(x, M_BWMETER);
return 0;
} else {
MFC_UNLOCK();
return EINVAL;
}
}
/* NOTREACHED */
}
/*
* Perform bandwidth measurement processing that may result in an upcall
*/
static void
bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
struct timeval delta;
MFC_LOCK_ASSERT();
delta = *nowp;
BW_TIMEVALDECR(&delta, &x->bm_start_time);
if (x->bm_flags & BW_METER_GEQ) {
/*
* Processing for ">=" type of bw_meter entry
*/
if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
/* Reset the bw_meter entry */
x->bm_start_time = *nowp;
x->bm_measured.b_packets = 0;
x->bm_measured.b_bytes = 0;
x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
}
/* Record that a packet is received */
x->bm_measured.b_packets++;
x->bm_measured.b_bytes += plen;
/*
* Test if we should deliver an upcall
*/
if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
(x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
((x->bm_flags & BW_METER_UNIT_BYTES) &&
(x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
/* Prepare an upcall for delivery */
bw_meter_prepare_upcall(x, nowp);
x->bm_flags |= BW_METER_UPCALL_DELIVERED;
}
}
} else if (x->bm_flags & BW_METER_LEQ) {
/*
* Processing for "<=" type of bw_meter entry
*/
if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
/*
* The periodic scan of the multicast forwarding table for "<="
* bw_meter entries is running behind, so test now whether we
* should deliver an upcall.
*/
if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
(x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
((x->bm_flags & BW_METER_UNIT_BYTES) &&
(x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
/* Prepare an upcall for delivery */
bw_meter_prepare_upcall(x, nowp);
}
/* Reschedule the bw_meter entry */
unschedule_bw_meter(x);
schedule_bw_meter(x, nowp);
}
/* Record that a packet is received */
x->bm_measured.b_packets++;
x->bm_measured.b_bytes += plen;
/*
* Test if we should restart the measuring interval
*/
if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
(x->bm_flags & BW_METER_UNIT_BYTES &&
x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
/* Don't restart the measuring interval */
} else {
/* Do restart the measuring interval */
/*
* XXX: note that we don't unschedule and schedule, because this
* might be too much overhead per packet. Instead, when we process
* all entries for a given timer hash bin, we check whether it is
* really a timeout. If not, we reschedule at that time.
*/
x->bm_start_time = *nowp;
x->bm_measured.b_packets = 0;
x->bm_measured.b_bytes = 0;
x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
}
}
}
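/*
 * To illustrate the two meter types: a ">=" (GEQ) meter fires from this
 * per-packet path the moment the measured count reaches its threshold
 * within the current window, whereas a "<=" (LEQ) meter can only be
 * judged once its window expires, which is why LEQ entries are also
 * scanned by the periodic bw_meter_process().
 */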
/*
* Prepare a bandwidth-related upcall
*/
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
struct timeval delta;
struct bw_upcall *u;
MFC_LOCK_ASSERT();
/*
* Compute the measured time interval
*/
delta = *nowp;
BW_TIMEVALDECR(&delta, &x->bm_start_time);
/*
* If there are too many pending upcalls, deliver them now
*/
if (bw_upcalls_n >= BW_UPCALLS_MAX)
bw_upcalls_send();
/*
* Set the bw_upcall entry
*/
u = &bw_upcalls[bw_upcalls_n++];
u->bu_src = x->bm_mfc->mfc_origin;
u->bu_dst = x->bm_mfc->mfc_mcastgrp;
u->bu_threshold.b_time = x->bm_threshold.b_time;
u->bu_threshold.b_packets = x->bm_threshold.b_packets;
u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
u->bu_measured.b_time = delta;
u->bu_measured.b_packets = x->bm_measured.b_packets;
u->bu_measured.b_bytes = x->bm_measured.b_bytes;
u->bu_flags = 0;
if (x->bm_flags & BW_METER_UNIT_PACKETS)
u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
if (x->bm_flags & BW_METER_UNIT_BYTES)
u->bu_flags |= BW_UPCALL_UNIT_BYTES;
if (x->bm_flags & BW_METER_GEQ)
u->bu_flags |= BW_UPCALL_GEQ;
if (x->bm_flags & BW_METER_LEQ)
u->bu_flags |= BW_UPCALL_LEQ;
}
/*
* Send the pending bandwidth-related upcalls
*/
static void
bw_upcalls_send(void)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *m;
int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
static struct igmpmsg igmpmsg = { 0, /* unused1 */
0, /* unused2 */
IGMPMSG_BW_UPCALL,/* im_msgtype */
0, /* im_mbz */
0, /* im_vif */
0, /* unused3 */
{ 0 }, /* im_src */
{ 0 } }; /* im_dst */
MFC_LOCK_ASSERT();
if (bw_upcalls_n == 0)
return; /* No pending upcalls */
bw_upcalls_n = 0;
/*
* Allocate a new mbuf, initialize it with the header and
* the payload for the pending calls.
*/
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
return;
}
m->m_len = m->m_pkthdr.len = 0;
m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);
/*
* Send the upcalls
* XXX do we need to set the address in k_igmpsrc ?
*/
mrtstat.mrts_upcalls++;
if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
++mrtstat.mrts_upq_sockfull;
}
}
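/*
 * What the daemon receives here is a struct igmpmsg header with
 * im_msgtype set to IGMPMSG_BW_UPCALL, immediately followed by the array
 * of pending struct bw_upcall records ("len" bytes, at most
 * BW_UPCALLS_MAX entries).
 */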
/*
* Compute the timeout hash value for the bw_meter entries
*/
#define BW_METER_TIMEHASH(bw_meter, hash) \
do { \
struct timeval next_timeval = (bw_meter)->bm_start_time; \
\
BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
(hash) = next_timeval.tv_sec; \
if (next_timeval.tv_usec) \
(hash)++; /* XXX: make sure we don't timeout early */ \
(hash) %= BW_METER_BUCKETS; \
} while (0)
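/*
 * Example: with bm_start_time = {100, 0} and a threshold b_time of
 * {3, 500000}, next_timeval is {103, 500000}; the nonzero tv_usec rounds
 * the hash up to 104 % BW_METER_BUCKETS, deliberately placing the entry
 * one bucket late rather than letting it time out early.
 */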
/*
* Schedule a timer to periodically process a bw_meter entry of type "<="
* by linking the entry into the proper hash bucket.
*/
static void
schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
{
int time_hash;
MFC_LOCK_ASSERT();
if (!(x->bm_flags & BW_METER_LEQ))
return; /* XXX: we schedule timers only for "<=" entries */
/*
* Reset the bw_meter entry
*/
x->bm_start_time = *nowp;
x->bm_measured.b_packets = 0;
x->bm_measured.b_bytes = 0;
x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
/*
* Compute the timeout hash value and insert the entry
*/
BW_METER_TIMEHASH(x, time_hash);
x->bm_time_next = bw_meter_timers[time_hash];
bw_meter_timers[time_hash] = x;
x->bm_time_hash = time_hash;
}
/*
* Unschedule the periodic timer that processes a bw_meter entry of type "<="
* by removing the entry from the proper hash bucket.
*/
static void
unschedule_bw_meter(struct bw_meter *x)
{
int time_hash;
struct bw_meter *prev, *tmp;
MFC_LOCK_ASSERT();
if (!(x->bm_flags & BW_METER_LEQ))
return; /* XXX: we schedule timers only for "<=" entries */
/*
* Compute the timeout hash value and delete the entry
*/
time_hash = x->bm_time_hash;
if (time_hash >= BW_METER_BUCKETS)
return; /* Entry was not scheduled */
for (prev = NULL, tmp = bw_meter_timers[time_hash];
tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
if (tmp == x)
break;
if (tmp == NULL)
panic("unschedule_bw_meter: bw_meter entry not found");
if (prev != NULL)
prev->bm_time_next = x->bm_time_next;
else
bw_meter_timers[time_hash] = x->bm_time_next;
x->bm_time_next = NULL;
x->bm_time_hash = BW_METER_BUCKETS;
}
/*
* Process all "<=" type bw_meter entries that should be processed now,
* and for each entry prepare an upcall if necessary. Each processed
* entry is rescheduled for the next round of (periodic) processing.
*
* This is run periodically (once per second normally). On each round,
* all the potentially matching entries are in the hash slot that we are
* looking at.
*/
static void
bw_meter_process(void)
{
static uint32_t last_tv_sec; /* last time we processed this */
uint32_t loops;
int i;
struct timeval now, process_endtime;
GET_TIME(now);
if (last_tv_sec == now.tv_sec)
return; /* nothing to do */
loops = now.tv_sec - last_tv_sec;
last_tv_sec = now.tv_sec;
if (loops > BW_METER_BUCKETS)
loops = BW_METER_BUCKETS;
MFC_LOCK();
/*
* Process all bins of bw_meter entries from the one after the last
* processed to the current one. On entry, i points to the last bucket
* visited, so we need to increment i at the beginning of the loop.
*/
for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
struct bw_meter *x, *tmp_list;
if (++i >= BW_METER_BUCKETS)
i = 0;
/* Disconnect the list of bw_meter entries from the bin */
tmp_list = bw_meter_timers[i];
bw_meter_timers[i] = NULL;
/* Process the list of bw_meter entries */
while (tmp_list != NULL) {
x = tmp_list;
tmp_list = tmp_list->bm_time_next;
/* Test if the time interval is over */
process_endtime = x->bm_start_time;
BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
/* Not yet: reschedule, but don't reset */
int time_hash;
BW_METER_TIMEHASH(x, time_hash);
if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
/*
* XXX: somehow the bin processing is a bit ahead of time.
* Put the entry in the next bin.
*/
if (++time_hash >= BW_METER_BUCKETS)
time_hash = 0;
}
x->bm_time_next = bw_meter_timers[time_hash];
bw_meter_timers[time_hash] = x;
x->bm_time_hash = time_hash;
continue;
}
/*
* Test if we should deliver an upcall
*/
if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
(x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
((x->bm_flags & BW_METER_UNIT_BYTES) &&
(x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
/* Prepare an upcall for delivery */
bw_meter_prepare_upcall(x, &now);
}
/*
* Reschedule for next processing
*/
schedule_bw_meter(x, &now);
}
}
/* Send all upcalls that are pending delivery */
bw_upcalls_send();
MFC_UNLOCK();
}
/*
* A periodic function for sending all upcalls that are pending delivery
*/
static void
expire_bw_upcalls_send(void *unused)
{
MFC_LOCK();
bw_upcalls_send();
MFC_UNLOCK();
callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
expire_bw_upcalls_send, NULL);
}
/*
* A periodic function for periodic scanning of the multicast forwarding
* table for processing all "<=" bw_meter entries.
*/
static void
expire_bw_meter_process(void *unused)
{
if (mrt_api_config & MRT_MFC_BW_UPCALL)
bw_meter_process();
callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL);
}
/*
* End of bandwidth monitoring code
*/
/*
* Send the packet up to the user-level daemon, or do the kernel-level
* PIM Register encapsulation, as appropriate.
*
*/
static int
pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
struct mfc *rt)
{
struct mbuf *mb_copy, *mm;
if (mrtdebug & DEBUG_PIM)
log(LOG_DEBUG, "pim_register_send: ");
/*
* Do not send IGMP_WHOLEPKT notifications to userland, if the
* rendezvous point was unspecified, and we were told not to.
*/
if (pim_squelch_wholepkt != 0 && (mrt_api_config & MRT_MFC_RP) &&
(rt->mfc_rp.s_addr == INADDR_ANY))
return 0;
mb_copy = pim_register_prepare(ip, m);
if (mb_copy == NULL)
return ENOBUFS;
/*
* Send all the fragments. Note that the mbuf for each fragment
* is freed by the sending machinery.
*/
for (mm = mb_copy; mm; mm = mb_copy) {
mb_copy = mm->m_nextpkt;
mm->m_nextpkt = 0;
mm = m_pullup(mm, sizeof(struct ip));
if (mm != NULL) {
ip = mtod(mm, struct ip *);
if ((mrt_api_config & MRT_MFC_RP) &&
(rt->mfc_rp.s_addr != INADDR_ANY)) {
pim_register_send_rp(ip, vifp, mm, rt);
} else {
pim_register_send_upcall(ip, vifp, mm, rt);
}
}
}
return 0;
}
/*
* Return a copy of the data packet that is ready for PIM Register
* encapsulation.
* XXX: Note that in the returned copy the IP header is a valid one.
*/
static struct mbuf *
pim_register_prepare(struct ip *ip, struct mbuf *m)
{
struct mbuf *mb_copy = NULL;
int mtu;
/* Take care of delayed checksums */
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
/*
* Copy the old packet & pullup its IP header into the
* new mbuf so we can modify it.
*/
mb_copy = m_copypacket(m, M_DONTWAIT);
if (mb_copy == NULL)
return NULL;
mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
if (mb_copy == NULL)
return NULL;
/* take care of the TTL */
ip = mtod(mb_copy, struct ip *);
--ip->ip_ttl;
/* Compute the MTU after the PIM Register encapsulation */
mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
if (ip->ip_len <= mtu) {
/* Turn the IP header into a valid one */
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
} else {
/* Fragment the packet */
if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) {
m_freem(mb_copy);
return NULL;
}
}
return mb_copy;
}
/*
* Send an upcall with the data packet to the user-level process.
*/
static int
pim_register_send_upcall(struct ip *ip, struct vif *vifp,
struct mbuf *mb_copy, struct mfc *rt)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *mb_first;
int len = ntohs(ip->ip_len);
struct igmpmsg *im;
struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
VIF_LOCK_ASSERT();
/*
* Add a new mbuf with an upcall header
*/
MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
if (mb_first == NULL) {
m_freem(mb_copy);
return ENOBUFS;
}
mb_first->m_data += max_linkhdr;
mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
mb_first->m_len = sizeof(struct igmpmsg);
mb_first->m_next = mb_copy;
/* Send message to routing daemon */
im = mtod(mb_first, struct igmpmsg *);
im->im_msgtype = IGMPMSG_WHOLEPKT;
im->im_mbz = 0;
im->im_vif = vifp - viftable;
im->im_src = ip->ip_src;
im->im_dst = ip->ip_dst;
k_igmpsrc.sin_addr = ip->ip_src;
mrtstat.mrts_upcalls++;
if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
if (mrtdebug & DEBUG_PIM)
log(LOG_WARNING,
"mcast: pim_register_send_upcall: ip_mrouter socket queue full");
++mrtstat.mrts_upq_sockfull;
return ENOBUFS;
}
/* Keep statistics */
pimstat.pims_snd_registers_msgs++;
pimstat.pims_snd_registers_bytes += len;
return 0;
}
/*
* Encapsulate the data packet in PIM Register message and send it to the RP.
*/
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
struct mfc *rt)
{
+ INIT_VNET_INET(curvnet);
struct mbuf *mb_first;
struct ip *ip_outer;
struct pim_encap_pimhdr *pimhdr;
int len = ntohs(ip->ip_len);
vifi_t vifi = rt->mfc_parent;
VIF_LOCK_ASSERT();
if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) {
m_freem(mb_copy);
return EADDRNOTAVAIL; /* The iif vif is invalid */
}
/*
* Add a new mbuf with the encapsulating header
*/
MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
if (mb_first == NULL) {
m_freem(mb_copy);
return ENOBUFS;
}
mb_first->m_data += max_linkhdr;
mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
mb_first->m_next = mb_copy;
mb_first->m_pkthdr.len = len + mb_first->m_len;
/*
* Fill in the encapsulating IP and PIM header
*/
ip_outer = mtod(mb_first, struct ip *);
*ip_outer = pim_encap_iphdr;
ip_outer->ip_id = ip_newid();
ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
ip_outer->ip_src = viftable[vifi].v_lcl_addr;
ip_outer->ip_dst = rt->mfc_rp;
/*
* Copy the inner header TOS to the outer header, and take care of the
* IP_DF bit.
*/
ip_outer->ip_tos = ip->ip_tos;
if (ntohs(ip->ip_off) & IP_DF)
ip_outer->ip_off |= IP_DF;
pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
+ sizeof(pim_encap_iphdr));
*pimhdr = pim_encap_pimhdr;
/* If the iif crosses a border, set the Border-bit */
if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
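/*
 * Temporarily advance m_data past the encapsulating IP header so that
 * in_cksum() below checksums only the PIM header, then restore it.
 */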
mb_first->m_data += sizeof(pim_encap_iphdr);
pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
mb_first->m_data -= sizeof(pim_encap_iphdr);
send_packet(vifp, mb_first);
/* Keep statistics */
pimstat.pims_snd_registers_msgs++;
pimstat.pims_snd_registers_bytes += len;
return 0;
}
/*
* pim_encapcheck() is called by the encap[46]_input() path at runtime to
* determine if a packet is for PIM, allowing PIM to be dynamically loaded
* into the kernel.
*/
static int
pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
#ifdef DIAGNOSTIC
KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
#endif
if (proto != IPPROTO_PIM)
return 0; /* not for us; reject the datagram. */
return 64; /* claim the datagram. */
}
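/*
 * The nonzero return is a match strength for the encap demux: by
 * convention it approximates the number of matched address bits
 * (32 + 32 for a full IPv4 src/dst match), so 64 outbids weaker
 * claimants, while 0 declines the datagram.
 */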
/*
* PIM-SMv2 and PIM-DM messages processing.
* Receives and verifies the PIM control messages, and passes them
* up to the listening socket, using rip_input().
* The only message with special processing is the PIM_REGISTER message
* (used by PIM-SM): the PIM header is stripped off, and the inner packet
* is passed to if_simloop().
*/
void
pim_input(struct mbuf *m, int off)
{
struct ip *ip = mtod(m, struct ip *);
struct pim *pim;
int minlen;
int datalen = ip->ip_len;
int ip_tos;
int iphlen = off;
/* Keep statistics */
pimstat.pims_rcv_total_msgs++;
pimstat.pims_rcv_total_bytes += datalen;
/*
* Validate lengths
*/
if (datalen < PIM_MINLEN) {
pimstat.pims_rcv_tooshort++;
log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
datalen, (u_long)ip->ip_src.s_addr);
m_freem(m);
return;
}
/*
* If the packet is at least as big as a REGISTER, go ahead
* and grab the PIM REGISTER header size, to avoid another
* possible m_pullup() later.
*
* PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8
* PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
*/
minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
/*
* Get the IP and PIM headers in contiguous memory, and
* possibly the PIM REGISTER header.
*/
if ((m->m_flags & M_EXT || m->m_len < minlen) &&
(m = m_pullup(m, minlen)) == 0) {
log(LOG_ERR, "pim_input: m_pullup failure\n");
return;
}
/* m_pullup() may have given us a new mbuf so reset ip. */
ip = mtod(m, struct ip *);
ip_tos = ip->ip_tos;
/* adjust mbuf to point to the PIM header */
m->m_data += iphlen;
m->m_len -= iphlen;
pim = mtod(m, struct pim *);
/*
* Validate checksum. If PIM REGISTER, exclude the data packet.
*
* XXX: some older PIMv2 implementations don't make this distinction,
* so for compatibility reasons perform the checksum over part of the
* message first and, if that fails, over the whole message.
*/
if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
/* do nothing, checksum okay */
} else if (in_cksum(m, datalen)) {
pimstat.pims_rcv_badsum++;
if (mrtdebug & DEBUG_PIM)
log(LOG_DEBUG, "pim_input: invalid checksum");
m_freem(m);
return;
}
/* PIM version check */
if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
pimstat.pims_rcv_badversion++;
log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
PIM_VT_V(pim->pim_vt), PIM_VERSION);
m_freem(m);
return;
}
/* restore mbuf back to the outer IP */
m->m_data -= iphlen;
m->m_len += iphlen;
if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
/*
* Since this is a REGISTER, we'll make a copy of the register
* headers ip + pim + u_int32 + encap_ip, to be passed up to the
* routing daemon.
*/
struct sockaddr_in dst = { sizeof(dst), AF_INET };
struct mbuf *mcp;
struct ip *encap_ip;
u_int32_t *reghdr;
struct ifnet *vifp;
VIF_LOCK();
if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
VIF_UNLOCK();
if (mrtdebug & DEBUG_PIM)
log(LOG_DEBUG,
"pim_input: register vif not set: %d\n", reg_vif_num);
m_freem(m);
return;
}
/* XXX need refcnt? */
vifp = viftable[reg_vif_num].v_ifp;
VIF_UNLOCK();
/*
* Validate length
*/
if (datalen < PIM_REG_MINLEN) {
pimstat.pims_rcv_tooshort++;
pimstat.pims_rcv_badregisters++;
log(LOG_ERR,
"pim_input: register packet size too small %d from %lx\n",
datalen, (u_long)ip->ip_src.s_addr);
m_freem(m);
return;
}
reghdr = (u_int32_t *)(pim + 1);
encap_ip = (struct ip *)(reghdr + 1);
if (mrtdebug & DEBUG_PIM) {
log(LOG_DEBUG,
"pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
(u_long)ntohl(encap_ip->ip_src.s_addr),
(u_long)ntohl(encap_ip->ip_dst.s_addr),
ntohs(encap_ip->ip_len));
}
/* verify the version number of the inner packet */
if (encap_ip->ip_v != IPVERSION) {
pimstat.pims_rcv_badregisters++;
if (mrtdebug & DEBUG_PIM) {
log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
"of the inner packet\n", encap_ip->ip_v);
}
m_freem(m);
return;
}
/* verify the inner packet is destined to a mcast group */
if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
pimstat.pims_rcv_badregisters++;
if (mrtdebug & DEBUG_PIM)
log(LOG_DEBUG,
"pim_input: inner packet of register is not "
"multicast %lx\n",
(u_long)ntohl(encap_ip->ip_dst.s_addr));
m_freem(m);
return;
}
/* If a NULL_REGISTER, pass it to the daemon */
if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
goto pim_input_to_daemon;
/*
* Copy the TOS from the outer IP header to the inner IP header.
*/
if (encap_ip->ip_tos != ip_tos) {
/* Outer TOS -> inner TOS */
encap_ip->ip_tos = ip_tos;
/* Recompute the inner header checksum. Sigh... */
/* adjust mbuf to point to the inner IP header */
m->m_data += (iphlen + PIM_MINLEN);
m->m_len -= (iphlen + PIM_MINLEN);
encap_ip->ip_sum = 0;
encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
/* restore mbuf to point back to the outer IP header */
m->m_data -= (iphlen + PIM_MINLEN);
m->m_len += (iphlen + PIM_MINLEN);
}
/*
* Decapsulate the inner IP packet and loopback to forward it
* as a normal multicast packet. Also, make a copy of the
* outer_iphdr + pimhdr + reghdr + encap_iphdr
* to pass to the daemon later, so it can take the appropriate
* actions (e.g., send back PIM_REGISTER_STOP).
* XXX: here m->m_data points to the outer IP header.
*/
mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
if (mcp == NULL) {
log(LOG_ERR,
"pim_input: pim register: could not copy register head\n");
m_freem(m);
return;
}
/* Keep statistics */
/* XXX: registers_bytes include only the encap. mcast pkt */
pimstat.pims_rcv_registers_msgs++;
pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);
/*
* forward the inner ip packet; point m_data at the inner ip.
*/
m_adj(m, iphlen + PIM_MINLEN);
if (mrtdebug & DEBUG_PIM) {
log(LOG_DEBUG,
"pim_input: forwarding decapsulated register: "
"src %lx, dst %lx, vif %d\n",
(u_long)ntohl(encap_ip->ip_src.s_addr),
(u_long)ntohl(encap_ip->ip_dst.s_addr),
reg_vif_num);
}
/* NB: vifp was collected above; can it change on us? */
if_simloop(vifp, m, dst.sin_family, 0);
/* prepare the register head to send to the mrouting daemon */
m = mcp;
}
pim_input_to_daemon:
/*
* Pass the PIM message up to the daemon; if it is a Register message,
* pass the 'head' only up to the daemon. This includes the
* outer IP header, PIM header, PIM-Register header and the
* inner IP header.
* XXX: the outer IP header pkt size of a Register is not adjusted to
* reflect the fact that the inner multicast data is truncated.
*/
rip_input(m, iphlen);
return;
}
/*
* XXX: This is common code for dealing with initialization for both
* the IPv4 and IPv6 multicast forwarding paths. It could do with cleanup.
*/
static int
ip_mroute_modevent(module_t mod, int type, void *unused)
{
+ INIT_VNET_INET(curvnet);
+
switch (type) {
case MOD_LOAD:
MROUTER_LOCK_INIT();
MFC_LOCK_INIT();
VIF_LOCK_INIT();
ip_mrouter_reset();
TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
&pim_squelch_wholepkt);
pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM,
pim_encapcheck, &in_pim_protosw, NULL);
if (pim_encap_cookie == NULL) {
printf("ip_mroute: unable to attach pim encap\n");
VIF_LOCK_DESTROY();
MFC_LOCK_DESTROY();
MROUTER_LOCK_DESTROY();
return (EINVAL);
}
#ifdef INET6
pim6_encap_cookie = encap_attach_func(AF_INET6, IPPROTO_PIM,
pim_encapcheck, (struct protosw *)&in6_pim_protosw, NULL);
if (pim6_encap_cookie == NULL) {
printf("ip_mroute: unable to attach pim6 encap\n");
if (pim_encap_cookie) {
encap_detach(pim_encap_cookie);
pim_encap_cookie = NULL;
}
VIF_LOCK_DESTROY();
MFC_LOCK_DESTROY();
MROUTER_LOCK_DESTROY();
return (EINVAL);
}
#endif
ip_mcast_src = X_ip_mcast_src;
ip_mforward = X_ip_mforward;
ip_mrouter_done = X_ip_mrouter_done;
ip_mrouter_get = X_ip_mrouter_get;
ip_mrouter_set = X_ip_mrouter_set;
#ifdef INET6
ip6_mforward = X_ip6_mforward;
ip6_mrouter_done = X_ip6_mrouter_done;
ip6_mrouter_get = X_ip6_mrouter_get;
ip6_mrouter_set = X_ip6_mrouter_set;
mrt6_ioctl = X_mrt6_ioctl;
#endif
ip_rsvp_force_done = X_ip_rsvp_force_done;
ip_rsvp_vif = X_ip_rsvp_vif;
legal_vif_num = X_legal_vif_num;
mrt_ioctl = X_mrt_ioctl;
rsvp_input_p = X_rsvp_input;
break;
case MOD_UNLOAD:
/*
* Typically module unload happens after the user-level
* process has shut down the kernel services (the check
* below ensures someone can't just yank the module out
* from under a running process). But if the module is
* just loaded and then unloaded without starting up a user
* process, we still need to clean up.
*/
if (V_ip_mrouter
#ifdef INET6
|| ip6_mrouter
#endif
)
return EINVAL;
#ifdef INET6
if (pim6_encap_cookie) {
encap_detach(pim6_encap_cookie);
pim6_encap_cookie = NULL;
}
X_ip6_mrouter_done();
ip6_mforward = NULL;
ip6_mrouter_done = NULL;
ip6_mrouter_get = NULL;
ip6_mrouter_set = NULL;
mrt6_ioctl = NULL;
#endif
if (pim_encap_cookie) {
encap_detach(pim_encap_cookie);
pim_encap_cookie = NULL;
}
X_ip_mrouter_done();
ip_mcast_src = NULL;
ip_mforward = NULL;
ip_mrouter_done = NULL;
ip_mrouter_get = NULL;
ip_mrouter_set = NULL;
ip_rsvp_force_done = NULL;
ip_rsvp_vif = NULL;
legal_vif_num = NULL;
mrt_ioctl = NULL;
rsvp_input_p = NULL;
VIF_LOCK_DESTROY();
MFC_LOCK_DESTROY();
MROUTER_LOCK_DESTROY();
break;
default:
return EOPNOTSUPP;
}
return 0;
}
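/*
 * The pattern above is worth noting: the base kernel always calls
 * through function pointers (ip_mforward, mrt_ioctl, rsvp_input_p,
 * etc.); MOD_LOAD points them at the X_*() implementations in this file
 * and MOD_UNLOAD nulls them out again, which is what lets the multicast
 * routing code live in a loadable module.
 */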
static moduledata_t ip_mroutemod = {
"ip_mroute",
ip_mroute_modevent,
0
};
DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
Index: head/sys/netinet/ip_options.c
===================================================================
--- head/sys/netinet/ip_options.c (revision 183549)
+++ head/sys/netinet/ip_options.c (revision 183550)
@@ -1,683 +1,684 @@
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipstealth.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip_icmp.h>
#include <machine/in_cksum.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
static int ip_dosourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW,
&ip_dosourceroute, 0, "Enable forwarding source routed IP packets");
static int ip_acceptsourceroute = 0;
SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
CTLFLAG_RW, &ip_acceptsourceroute, 0,
"Enable accepting source routed IP packets");
int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */
SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW,
&ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)");
static void save_rte(struct mbuf *m, u_char *, struct in_addr);
/*
* Do option processing on a datagram, possibly discarding it if bad options
* are encountered, or forwarding it if source-routed.
*
* The pass argument is used when operating in the IPSTEALTH mode to tell
* what options to process: [LS]SRR (pass 0) or the others (pass 1). The
* reason for as many as two passes is that when doing IPSTEALTH, non-routing
* options should be processed only if the packet is for us.
*
* Returns 1 if packet has been forwarded/freed, 0 if the packet should be
* processed further.
*/
int
ip_dooptions(struct mbuf *m, int pass)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = mtod(m, struct ip *);
u_char *cp;
struct in_ifaddr *ia;
int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0;
struct in_addr *sin, dst;
n_time ntime;
struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
/* Ignore or reject packets with IP options. */
if (ip_doopts == 0)
return 0;
else if (ip_doopts == 2) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_FILTER_PROHIB;
goto bad;
}
dst = ip->ip_dst;
cp = (u_char *)(ip + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
}
switch (opt) {
default:
break;
/*
* Source routing with record. Find interface with current
* destination address. If none on this machine then drop if
* strictly routed, or do nothing if loosely routed. Record
* interface address and bring up next address component. If
* strictly routed make sure next address is on directly
* accessible net.
*/
case IPOPT_LSRR:
case IPOPT_SSRR:
#ifdef IPSTEALTH
if (V_ipstealth && pass > 0)
break;
#endif
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
ipaddr.sin_addr = ip->ip_dst;
ia = (struct in_ifaddr *)
ifa_ifwithaddr((struct sockaddr *)&ipaddr);
if (ia == NULL) {
if (opt == IPOPT_SSRR) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
if (!ip_dosourceroute)
goto nosourcerouting;
/*
* Loose routing, and not at next destination
* yet; nothing to do except forward.
*/
break;
}
off--; /* 0 origin */
if (off > optlen - (int)sizeof(struct in_addr)) {
/*
* End of source route. Should be for us.
*/
if (!ip_acceptsourceroute)
goto nosourcerouting;
save_rte(m, cp, ip->ip_src);
break;
}
#ifdef IPSTEALTH
if (V_ipstealth)
goto dropit;
#endif
if (!ip_dosourceroute) {
if (V_ipforwarding) {
char buf[16]; /* aaa.bbb.ccc.ddd\0 */
/*
* Acting as a router, so generate
* ICMP
*/
nosourcerouting:
strcpy(buf, inet_ntoa(ip->ip_dst));
log(LOG_WARNING,
"attempted source route from %s to %s\n",
inet_ntoa(ip->ip_src), buf);
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
} else {
/*
* Not acting as a router, so
* silently drop.
*/
#ifdef IPSTEALTH
dropit:
#endif
V_ipstat.ips_cantforward++;
m_freem(m);
return (1);
}
}
/*
* locate outgoing interface
*/
(void)memcpy(&ipaddr.sin_addr, cp + off,
sizeof(ipaddr.sin_addr));
if (opt == IPOPT_SSRR) {
#define INA struct in_ifaddr *
#define SA struct sockaddr *
if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL)
ia = (INA)ifa_ifwithnet((SA)&ipaddr);
} else
/* XXX MRT 0 for routing */
ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m));
if (ia == NULL) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_SRCFAIL;
goto bad;
}
ip->ip_dst = ipaddr.sin_addr;
(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
/*
* Let ip_intr's mcast routing check handle mcast pkts
*/
forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
break;
case IPOPT_RR:
#ifdef IPSTEALTH
if (V_ipstealth && pass == 0)
break;
#endif
if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
/*
* If no space remains, ignore.
*/
off--; /* 0 origin */
if (off > optlen - (int)sizeof(struct in_addr))
break;
(void)memcpy(&ipaddr.sin_addr, &ip->ip_dst,
sizeof(ipaddr.sin_addr));
/*
* Locate outgoing interface; if we're the
* destination, use the incoming interface (should be
* same).
*/
if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL &&
(ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) {
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
goto bad;
}
(void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
break;
case IPOPT_TS:
#ifdef IPSTEALTH
if (V_ipstealth && pass == 0)
break;
#endif
code = cp - (u_char *)ip;
if (optlen < 4 || optlen > 40) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if ((off = cp[IPOPT_OFFSET]) < 5) {
code = &cp[IPOPT_OLEN] - (u_char *)ip;
goto bad;
}
if (off > optlen - (int)sizeof(int32_t)) {
cp[IPOPT_OFFSET + 1] += (1 << 4);
if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
break;
}
off--; /* 0 origin */
sin = (struct in_addr *)(cp + off);
switch (cp[IPOPT_OFFSET + 1] & 0x0f) {
case IPOPT_TS_TSONLY:
break;
case IPOPT_TS_TSANDADDR:
if (off + sizeof(n_time) +
sizeof(struct in_addr) > optlen) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
ipaddr.sin_addr = dst;
ia = (INA)ifaof_ifpforaddr((SA)&ipaddr,
m->m_pkthdr.rcvif);
if (ia == NULL)
continue;
(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
sizeof(struct in_addr));
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
case IPOPT_TS_PRESPEC:
if (off + sizeof(n_time) +
sizeof(struct in_addr) > optlen) {
code = &cp[IPOPT_OFFSET] - (u_char *)ip;
goto bad;
}
(void)memcpy(&ipaddr.sin_addr, sin,
sizeof(struct in_addr));
if (ifa_ifwithaddr((SA)&ipaddr) == NULL)
continue;
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
default:
code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip;
goto bad;
}
ntime = iptime();
(void)memcpy(cp + off, &ntime, sizeof(n_time));
cp[IPOPT_OFFSET] += sizeof(n_time);
}
}
if (forward && V_ipforwarding) {
ip_forward(m, 1);
return (1);
}
return (0);
bad:
icmp_error(m, type, code, 0, 0);
V_ipstat.ips_badoptions++;
return (1);
}
/*
* Save incoming source route for use in replies, to be picked up later by
* ip_srcroute if the receiver is interested.
*/
static void
save_rte(struct mbuf *m, u_char *option, struct in_addr dst)
{
unsigned olen;
struct ipopt_tag *opts;
opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS,
sizeof(struct ipopt_tag), M_NOWAIT);
if (opts == NULL)
return;
olen = option[IPOPT_OLEN];
if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) {
m_tag_free((struct m_tag *)opts);
return;
}
bcopy(option, opts->ip_srcrt.srcopt, olen);
opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr);
opts->ip_srcrt.dst = dst;
m_tag_prepend(m, (struct m_tag *)opts);
}
/*
* Retrieve incoming source route for use in replies, in the same form used
* by setsockopt. The first hop is placed before the options and will
* be removed later.
*/
struct mbuf *
ip_srcroute(struct mbuf *m0)
{
struct in_addr *p, *q;
struct mbuf *m;
struct ipopt_tag *opts;
opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL);
if (opts == NULL)
return (NULL);
if (opts->ip_nhops == 0)
return (NULL);
m = m_get(M_DONTWAIT, MT_DATA);
if (m == NULL)
return (NULL);
#define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt))
/* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */
m->m_len = opts->ip_nhops * sizeof(struct in_addr) +
sizeof(struct in_addr) + OPTSIZ;
/*
* First, save first hop for return route.
*/
p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]);
*(mtod(m, struct in_addr *)) = *p--;
/*
* Copy option fields and padding (nop) to mbuf.
*/
opts->ip_srcrt.nop = IPOPT_NOP;
opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
(void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
&(opts->ip_srcrt.nop), OPTSIZ);
q = (struct in_addr *)(mtod(m, caddr_t) +
sizeof(struct in_addr) + OPTSIZ);
#undef OPTSIZ
/*
* Record return path as an IP source route, reversing the path
* (pointers are now aligned).
*/
while (p >= opts->ip_srcrt.route) {
*q++ = *p--;
}
/*
* Last hop goes to final destination.
*/
*q = opts->ip_srcrt.dst;
m_tag_delete(m0, (struct m_tag *)opts);
return (m);
}
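/*
 * Example: a packet from S whose option recorded hops A, B, C leaves a
 * tag holding {A, B, C} with dst = S; the mbuf built above starts with
 * first hop C (kept ahead of the option), and the option itself lists
 * B, A and finally S.
 */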
/*
* Strip out IP options, at a higher-level protocol in the kernel. The
* second argument is the buffer to which the options will be moved, and
* the return value is
* their length.
*
* XXX should be deleted; last arg currently ignored.
*/
void
ip_stripoptions(struct mbuf *m, struct mbuf *mopt)
{
int i;
struct ip *ip = mtod(m, struct ip *);
caddr_t opts;
int olen;
olen = (ip->ip_hl << 2) - sizeof (struct ip);
opts = (caddr_t)(ip + 1);
i = m->m_len - (sizeof (struct ip) + olen);
bcopy(opts + olen, opts, (unsigned)i);
m->m_len -= olen;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len -= olen;
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(struct ip) >> 2;
}
/*
* Insert IP options into a preformed packet. Adjust the IP destination as
* required for IP source routing, as indicated by a non-zero in_addr at the
* start of the options.
*
* XXX This routine assumes that the packet has no options in place.
*/
struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned optlen;
optlen = opt->m_len - sizeof(p->ipopt_dst);
if (optlen + ip->ip_len > IP_MAXPACKET) {
*phlen = 0;
return (m); /* XXX should fail */
}
if (p->ipopt_dst.s_addr)
ip->ip_dst = p->ipopt_dst;
if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n == NULL) {
*phlen = 0;
return (m);
}
M_MOVE_PKTHDR(n, m);
n->m_pkthdr.rcvif = NULL;
n->m_pkthdr.len += optlen;
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
m = n;
m->m_len = optlen + sizeof(struct ip);
m->m_data += max_linkhdr;
bcopy(ip, mtod(m, void *), sizeof(struct ip));
} else {
m->m_data -= optlen;
m->m_len += optlen;
m->m_pkthdr.len += optlen;
bcopy(ip, mtod(m, void *), sizeof(struct ip));
}
ip = mtod(m, struct ip *);
bcopy(p->ipopt_list, ip + 1, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_v = IPVERSION;
ip->ip_hl = *phlen >> 2;
ip->ip_len += optlen;
return (m);
}
/*
* Copy options from ip to jp, omitting those not copied during
* fragmentation.
*/
int
ip_optcopy(struct ip *ip, struct ip *jp)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ip + 1);
dp = (u_char *)(jp + 1);
cnt = (ip->ip_hl << 2) - sizeof (struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp),
("ip_optcopy: malformed ipv4 option"));
optlen = cp[IPOPT_OLEN];
KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt,
("ip_optcopy: malformed ipv4 option"));
/* Bogus lengths should have been caught by ip_dooptions. */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
bcopy(cp, dp, optlen);
dp += optlen;
}
}
for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
*dp++ = IPOPT_EOL;
return (optlen);
}
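/*
 * IPOPT_COPIED() tests the "copied" bit (0x80) of the option number:
 * LSRR (0x83) is therefore replicated into every fragment, while RR
 * (0x07) and TS (0x44) travel only in the first fragment.
 */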
/*
* Set up IP options in pcb for insertion in output packets. Store in mbuf
* with pointer in pcbopt, adding pseudo-option with destination address if
* source routed.
*/
int
ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m)
{
int cnt, optlen;
u_char *cp;
struct mbuf **pcbopt;
u_char opt;
INP_WLOCK_ASSERT(inp);
pcbopt = &inp->inp_options;
/* turn off any old options */
if (*pcbopt)
(void)m_free(*pcbopt);
*pcbopt = 0;
if (m == NULL || m->m_len == 0) {
/*
* Only turning off any previous options.
*/
if (m != NULL)
(void)m_free(m);
return (0);
}
if (m->m_len % sizeof(int32_t))
goto bad;
/*
* IP first-hop destination address will be stored before actual
* options; move other options back and clear it when none present.
*/
if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
goto bad;
cnt = m->m_len;
m->m_len += sizeof(struct in_addr);
cp = mtod(m, u_char *) + sizeof(struct in_addr);
bcopy(mtod(m, void *), cp, (unsigned)cnt);
bzero(mtod(m, void *), sizeof(struct in_addr));
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[IPOPT_OPTVAL];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP)
optlen = 1;
else {
if (cnt < IPOPT_OLEN + sizeof(*cp))
goto bad;
optlen = cp[IPOPT_OLEN];
if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
goto bad;
}
switch (opt) {
default:
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/*
* User process specifies route as:
*
* ->A->B->C->D
*
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear
* in actual IP option, but is stored before the
* options.
*/
/* XXX-BZ PRIV_NETINET_SETHDROPTS? */
if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
goto bad;
m->m_len -= sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
optlen -= sizeof(struct in_addr);
cp[IPOPT_OLEN] = optlen;
/*
* Move first hop before start of options.
*/
bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
sizeof(struct in_addr));
/*
* Then copy rest of options back
* to close up the deleted entry.
*/
bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)),
&cp[IPOPT_OFFSET+1],
(unsigned)cnt - (IPOPT_MINOFF - 1));
break;
}
}
if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
*pcbopt = m;
return (0);
bad:
(void)m_free(m);
return (EINVAL);
}
Index: head/sys/netinet/ip_output.c
===================================================================
--- head/sys/netinet/ip_output.c (revision 183549)
+++ head/sys/netinet/ip_output.c (revision 183550)
@@ -1,1183 +1,1186 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipfw.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/ucred.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/pfil.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#include <netipsec/ipsec.h>
#endif /* IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\
x, (ntohl(a.s_addr)>>24)&0xFF,\
(ntohl(a.s_addr)>>16)&0xFF,\
(ntohl(a.s_addr)>>8)&0xFF,\
(ntohl(a.s_addr))&0xFF, y);
u_short ip_id;
#ifdef MBUF_STRESS_TEST
int mbuf_frag_size = 0;
SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
static void ip_mloopback
(struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
extern struct protosw inetsw[];
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
* In the IP forwarding case, the packet will arrive with options already
* inserted, so must have a NULL opt pointer.
*/
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET(curvnet);
struct ip *ip;
struct ifnet *ifp = NULL; /* keep compiler happy */
struct mbuf *m0;
int hlen = sizeof (struct ip);
int mtu;
int len, error = 0;
struct sockaddr_in *dst = NULL; /* keep compiler happy */
struct in_ifaddr *ia = NULL;
int isbroadcast, sw_csum;
struct route iproute;
struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag = NULL;
#endif
M_ASSERTPKTHDR(m);
if (ro == NULL) {
ro = &iproute;
bzero(ro, sizeof (*ro));
}
if (inp != NULL)
INP_LOCK_ASSERT(inp);
if (opt) {
len = 0;
m = ip_insertoptions(m, opt, &len);
if (len != 0)
hlen = len;
}
ip = mtod(m, struct ip *);
/*
* Fill in IP header. If we are not allowing fragmentation,
* then the ip_id field is meaningless, but we don't set it
* to zero. Doing so causes various problems when devices along
* the path (routers, load balancers, firewalls, etc.) illegally
* disable DF on our packet. Note that a 16-bit counter
* will wrap around in less than 10 seconds at 100 Mbit/s on a
* medium with MTU 1500. See Steven M. Bellovin, "A Technique
* for Counting NATted Hosts", Proc. IMW'02, available at
* <http://www.cs.columbia.edu/~smb/papers/fnat.pdf>.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_hl = hlen >> 2;
ip->ip_id = ip_newid();
V_ipstat.ips_localout++;
} else {
hlen = ip->ip_hl << 2;
}
dst = (struct sockaddr_in *)&ro->ro_dst;
again:
/*
* If there is a cached route,
* check that it is to the same destination
* and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing the
* cache with IPv6.
*/
if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
dst->sin_family != AF_INET ||
dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)NULL;
}
#ifdef IPFIREWALL_FORWARD
if (ro->ro_rt == NULL && fwd_tag == NULL) {
#else
if (ro->ro_rt == NULL) {
#endif
bzero(dst, sizeof(*dst));
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = ip->ip_dst;
}
/*
* If routing to interface only, short circuit routing lookup.
* The use of an all-ones broadcast address implies this; an
* interface is specified by the broadcast address of an interface,
* or the destination address of a ptp interface.
*/
if (flags & IP_SENDONES) {
if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL &&
(ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
V_ipstat.ips_noroute++;
error = ENETUNREACH;
goto bad;
}
ip->ip_dst.s_addr = INADDR_BROADCAST;
dst->sin_addr = ip->ip_dst;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = 1;
} else if (flags & IP_ROUTETOIF) {
if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL &&
(ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == NULL) {
V_ipstat.ips_noroute++;
error = ENETUNREACH;
goto bad;
}
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = in_broadcast(dst->sin_addr, ifp);
} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
imo != NULL && imo->imo_multicast_ifp != NULL) {
/*
* Bypass the normal routing lookup for multicast
* packets if the interface is specified.
*/
ifp = imo->imo_multicast_ifp;
IFP_TO_IA(ifp, ia);
isbroadcast = 0; /* fool gcc */
} else {
/*
* We want to do any cloning requested by the link layer,
* as this is probably required in all cases for correct
* operation (as it is for ARP).
*/
if (ro->ro_rt == NULL)
#ifdef RADIX_MPATH
rtalloc_mpath_fib(ro,
ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#else
in_rtalloc_ign(ro, 0,
inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
#endif
if (ro->ro_rt == NULL) {
V_ipstat.ips_noroute++;
error = EHOSTUNREACH;
goto bad;
}
ia = ifatoia(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
if (ro->ro_rt->rt_flags & RTF_HOST)
isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
else
isbroadcast = in_broadcast(dst->sin_addr, ifp);
}
/*
* Calculate MTU. If we have a route that is up, use that,
* otherwise use the interface's MTU.
*/
if (ro->ro_rt != NULL && (ro->ro_rt->rt_flags & (RTF_UP|RTF_HOST))) {
/*
* This case can happen if the user changed the MTU
* of an interface after enabling IP on it. Because
* most netifs don't keep track of routes pointing to
* them, there is no way for one to update all its
* routes when the MTU is changed.
*/
if (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
mtu = ro->ro_rt->rt_rmx.rmx_mtu;
} else {
mtu = ifp->if_mtu;
}
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
struct in_multi *inm;
m->m_flags |= M_MCAST;
/*
* IP destination address is multicast. Make sure "dst"
* still points to the address in "ro". (It may have been
* changed to point to a gateway address, above.)
*/
dst = (struct sockaddr_in *)&ro->ro_dst;
/*
* See if the caller provided any multicast options
*/
if (imo != NULL) {
ip->ip_ttl = imo->imo_multicast_ttl;
if (imo->imo_multicast_vif != -1)
ip->ip_src.s_addr =
ip_mcast_src ?
ip_mcast_src(imo->imo_multicast_vif) :
INADDR_ANY;
} else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* Confirm that the outgoing interface supports multicast.
*/
if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
V_ipstat.ips_noroute++;
error = ENETUNREACH;
goto bad;
}
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
/* Interface may have no addresses. */
if (ia != NULL)
ip->ip_src = IA_SIN(ia)->sin_addr;
}
IN_MULTI_LOCK();
IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm);
if (inm != NULL &&
(imo == NULL || imo->imo_multicast_loop)) {
IN_MULTI_UNLOCK();
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
ip_mloopback(ifp, m, dst, hlen);
}
else {
IN_MULTI_UNLOCK();
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
/*
* If rsvp daemon is not running, do not
* set ip_moptions. This ensures that the packet
* is multicast and not just sent down one link
* as prescribed by rsvpd.
*/
if (!V_rsvp_on)
imo = NULL;
if (ip_mforward &&
ip_mforward(ip, ifp, m, imo) != 0) {
m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
m_freem(m);
goto done;
}
goto sendit;
}
/*
* If the source address is not specified yet, use the address
* of the outgoing interface.
*/
if (ip->ip_src.s_addr == INADDR_ANY) {
/* Interface may have no addresses. */
if (ia != NULL) {
ip->ip_src = IA_SIN(ia)->sin_addr;
}
}
/*
* Verify that we have any chance at all of being able to queue the
* packet or packet fragments, unless ALTQ is enabled on the given
* interface, in which case packet drops are handled by the queueing
* discipline itself.
*/
#ifdef ALTQ
if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) &&
((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
ifp->if_snd.ifq_maxlen))
#else
if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >=
ifp->if_snd.ifq_maxlen)
#endif /* ALTQ */
{
error = ENOBUFS;
V_ipstat.ips_odropped++;
ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1);
goto bad;
}
/*
* Look for broadcast address and
* verify user is allowed to send
* such a packet.
*/
if (isbroadcast) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
error = EACCES;
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if (ip->ip_len > mtu) {
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else {
m->m_flags &= ~M_BCAST;
}
sendit:
#ifdef IPSEC
switch (ip_ipsec_output(&m, inp, &flags, &error, &ro, &iproute, &dst,
    &ia, &ifp)) {
case 1:
goto bad;
case -1:
goto done;
case 0:
default:
break; /* Continue with packet processing. */
}
/* Update variables that are affected by ipsec4_output(). */
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
#endif /* IPSEC */
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&inet_pfil_hook))
goto passout;
/* Run through list of hooks for output packets. */
odst.s_addr = ip->ip_dst.s_addr;
error = pfil_run_hooks(&inet_pfil_hook, &m, ifp, PFIL_OUT, inp);
if (error != 0 || m == NULL)
goto done;
ip = mtod(m, struct ip *);
/* See if destination IP address was changed by packet filter. */
if (odst.s_addr != ip->ip_dst.s_addr) {
m->m_flags |= M_SKIP_FIREWALL;
/* If destination is now ourself drop to ip_input(). */
if (in_localip(ip->ip_dst)) {
m->m_flags |= M_FASTFWD_OURS;
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
error = netisr_queue(NETISR_IP, m);
goto done;
} else
goto again; /* Redo the routing table lookup. */
}
#ifdef IPFIREWALL_FORWARD
/* See if local, if yes, send it to netisr with M_FASTFWD_OURS set. */
if (m->m_flags & M_FASTFWD_OURS) {
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
error = netisr_queue(NETISR_IP, m);
goto done;
}
/* Or forward to some other address? */
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (fwd_tag) {
dst = (struct sockaddr_in *)&ro->ro_dst;
bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
m->m_flags |= M_SKIP_FIREWALL;
m_tag_delete(m, fwd_tag);
goto again;
}
#endif /* IPFIREWALL_FORWARD */
passout:
/* 127/8 must not appear on wire - RFC1122. */
if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
V_ipstat.ips_badaddr++;
error = EADDRNOTAVAIL;
goto bad;
}
}
m->m_pkthdr.csum_flags |= CSUM_IP;
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist;
if (sw_csum & CSUM_DELAY_DATA) {
in_delayed_cksum(m);
sw_csum &= ~CSUM_DELAY_DATA;
}
m->m_pkthdr.csum_flags &= ifp->if_hwassist;
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
*/
if (ip->ip_len <= mtu ||
(m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m, hlen);
/*
* Record statistics for this interface address.
* With CSUM_TSO the byte/packet count will be slightly
* incorrect because we count the IP+TCP headers only
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
if (m->m_pkthdr.csum_flags & CSUM_TSO)
ia->ia_ifa.if_opackets +=
m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
else
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
#ifdef MBUF_STRESS_TEST
if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
m = m_fragment(m, M_DONTWAIT, mbuf_frag_size);
#endif
/*
* Reset layer specific mbuf flags
* to avoid confusing lower layers.
*/
m->m_flags &= ~(M_PROTOFLAGS);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro->ro_rt);
goto done;
}
/* Balk when the DF bit is set or the interface doesn't support TSO. */
if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
error = EMSGSIZE;
V_ipstat.ips_cantfrag++;
goto bad;
}
/*
* Too large for interface; fragment if possible. If successful,
* on return, m will point to a list of packets to be sent.
*/
error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum);
if (error)
goto bad;
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
/* Record statistics for this interface address. */
if (ia != NULL) {
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
/*
* Reset layer specific mbuf flags
* to avoid confusing upper layers.
*/
m->m_flags &= ~(M_PROTOFLAGS);
error = (*ifp->if_output)(ifp, m,
(struct sockaddr *)dst, ro->ro_rt);
} else
m_freem(m);
}
if (error == 0)
V_ipstat.ips_fragmented++;
done:
if (ro == &iproute && ro->ro_rt) {
RTFREE(ro->ro_rt);
}
return (error);
bad:
m_freem(m);
goto done;
}
/*
* Create a chain of fragments which fit the given mtu. m_frag points to the
* mbuf to be fragmented; on return it points to the chain with the fragments.
* Return 0 if no error. If error, m_frag may contain a partially built
* chain of fragments that should be freed by the caller.
*
* if_hwassist_flags holds the hardware offload capabilities (see
* if_data.ifi_hwassist); sw_csum contains the delayed checksum flags
* (e.g., CSUM_DELAY_IP).
*/
int
ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
u_long if_hwassist_flags, int sw_csum)
{
+ INIT_VNET_INET(curvnet);
int error = 0;
int hlen = ip->ip_hl << 2;
int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
int off;
struct mbuf *m0 = *m_frag; /* the original packet */
int firstlen;
struct mbuf **mnext;
int nfrags;
if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */
V_ipstat.ips_cantfrag++;
return EMSGSIZE;
}
/*
* Must be able to put at least 8 bytes per fragment.
*/
if (len < 8)
return EMSGSIZE;
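/*
* Worked example (a sketch): with mtu 1500 and a 20-byte header,
* len = (1500 - 20) & ~7 = 1480. A 4000-byte datagram thus yields
* payloads of 1480 + 1480 + 1020 bytes across three fragments.
*/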
/*
* If the interface will not calculate checksums on
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
(if_hwassist_flags & CSUM_IP_FRAGS) == 0) {
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
if (len > PAGE_SIZE) {
/*
* Fragment large datagrams such that each segment
* contains a multiple of PAGE_SIZE amount of data,
* plus headers. This enables a receiver to perform
* page-flipping zero-copy optimizations.
*
* XXX When does this help given that sender and receiver
* could have different page sizes, and also mtu could
* be less than the receiver's page size?
*/
int newlen;
struct mbuf *m;
for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next)
off += m->m_len;
/*
* firstlen (off - hlen) must be aligned on an
* 8-byte boundary
*/
if (off < hlen)
goto smart_frag_failure;
off = ((off - hlen) & ~7) + hlen;
newlen = (~PAGE_MASK) & mtu;
if ((newlen + sizeof (struct ip)) > mtu) {
/* we failed, fall back to the default */
smart_frag_failure:
newlen = len;
off = hlen + len;
}
len = newlen;
} else {
off = hlen + len;
}
firstlen = off - hlen;
mnext = &m0->m_nextpkt; /* pointer to next packet */
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
* Here, m0 is the original packet, m is the fragment being created.
* The fragments are linked off the m_nextpkt of the original
* packet, which after processing serves as the first fragment.
*/
for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) {
struct ip *mhip; /* ip header on the fragment */
struct mbuf *m;
int mhlen = sizeof (struct ip);
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
V_ipstat.ips_odropped++;
goto done;
}
m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
/*
* In the first mbuf, leave room for the link header, then
* copy the original IP header including options. The payload
* goes into an additional mbuf chain returned by m_copy().
*/
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
if (hlen > sizeof (struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
mhip->ip_v = IPVERSION;
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
/* XXX do we need to add ip->ip_off below ? */
mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off;
if (off + len >= ip->ip_len) { /* last fragment */
len = ip->ip_len - off;
m->m_flags |= M_LASTFRAG;
} else
mhip->ip_off |= IP_MF;
mhip->ip_len = htons((u_short)(len + mhlen));
m->m_next = m_copy(m0, off, len);
if (m->m_next == NULL) { /* copy failed */
m_free(m);
error = ENOBUFS; /* ??? */
V_ipstat.ips_odropped++;
goto done;
}
m->m_pkthdr.len = mhlen + len;
m->m_pkthdr.rcvif = NULL;
#ifdef MAC
mac_netinet_fragment(m0, m);
#endif
m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
mhip->ip_off = htons(mhip->ip_off);
mhip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
mhip->ip_sum = in_cksum(m, mhlen);
*mnext = m;
mnext = &m->m_nextpkt;
}
V_ipstat.ips_ofragments += nfrags;
/* set first marker for fragment chain */
m0->m_flags |= M_FIRSTFRAG | M_FRAG;
m0->m_pkthdr.csum_data = nfrags;
/*
* Update first fragment by trimming what's been copied out
* and updating header.
*/
m_adj(m0, hlen + firstlen - ip->ip_len);
m0->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_short)m0->m_pkthdr.len);
ip->ip_off |= IP_MF;
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
if (sw_csum & CSUM_DELAY_IP)
ip->ip_sum = in_cksum(m0, hlen);
done:
*m_frag = m0;
return error;
}
void
in_delayed_cksum(struct mbuf *m)
{
struct ip *ip;
u_short csum, offset;
ip = mtod(m, struct ip *);
offset = ip->ip_hl << 2;
csum = in_cksum_skip(m, ip->ip_len, offset);
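/*
* Note (editorial): RFC 768 requires a computed UDP checksum of zero
* to be sent as all ones, since zero on the wire means "no checksum";
* the test below implements that rule. csum_data then supplies the
* checksum field's offset within the transport header, e.g.
* offsetof(struct udphdr, uh_sum) == 6 for UDP and
* offsetof(struct tcphdr, th_sum) == 16 for TCP.
*/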
if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
csum = 0xffff;
offset += m->m_pkthdr.csum_data; /* checksum offset */
if (offset + sizeof(u_short) > m->m_len) {
printf("delayed m_pullup, m->len: %d off: %d p: %d\n",
m->m_len, offset, ip->ip_p);
/*
* XXX
* this shouldn't happen, but if it does, the
* correct behavior may be to insert the checksum
* in the appropriate next mbuf in the chain.
*/
return;
}
*(u_short *)(m->m_data + offset) = csum;
}
/*
* IP socket option processing.
*/
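/*
* Userland view (an illustrative sketch, not code from this file):
* these cases back setsockopt(2)/getsockopt(2) calls made at the
* IPPROTO_IP level, e.g.
*
*	int ttl = 64;
*	setsockopt(s, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
*
* which lands in the SOPT_SET / IP_TTL case below.
*/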
int
ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
int error, optval;
error = optval = 0;
if (sopt->sopt_level != IPPROTO_IP) {
return (EINVAL);
}
switch (sopt->sopt_dir) {
case SOPT_SET:
switch (sopt->sopt_name) {
case IP_OPTIONS:
#ifdef notyet
case IP_RETOPTS:
#endif
{
struct mbuf *m;
if (sopt->sopt_valsize > MLEN) {
error = EMSGSIZE;
break;
}
MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
break;
}
m->m_len = sopt->sopt_valsize;
error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
m->m_len);
if (error) {
m_free(m);
break;
}
INP_WLOCK(inp);
error = ip_pcbopts(inp, sopt->sopt_name, m);
INP_WUNLOCK(inp);
return (error);
}
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
switch (sopt->sopt_name) {
case IP_TOS:
inp->inp_ip_tos = optval;
break;
case IP_TTL:
inp->inp_ip_ttl = optval;
break;
case IP_MINTTL:
if (optval > 0 && optval <= MAXTTL)
inp->inp_ip_minttl = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) do { \
INP_WLOCK(inp); \
if (optval) \
inp->inp_flags |= bit; \
else \
inp->inp_flags &= ~bit; \
INP_WUNLOCK(inp); \
} while (0)
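/*
* OPTSET(bit) sets or clears the given inp_flags bit according to the
* optval copied in above, taking the inpcb write lock around the
* update; each boolean option below expands to one such invocation.
*/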
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_FAITH:
OPTSET(INP_FAITH);
break;
case IP_ONESBCAST:
OPTSET(INP_ONESBCAST);
break;
case IP_DONTFRAG:
OPTSET(INP_DONTFRAG);
break;
}
break;
#undef OPTSET
/*
* Multicast socket options are processed by the in_mcast
* module.
*/
case IP_MULTICAST_IF:
case IP_MULTICAST_VIF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
case IP_ADD_SOURCE_MEMBERSHIP:
case IP_DROP_SOURCE_MEMBERSHIP:
case IP_BLOCK_SOURCE:
case IP_UNBLOCK_SOURCE:
case IP_MSFILTER:
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
error = inp_setmoptions(inp, sopt);
break;
case IP_PORTRANGE:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
INP_WLOCK(inp);
switch (optval) {
case IP_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags &= ~(INP_HIGHPORT);
break;
case IP_PORTRANGE_HIGH:
inp->inp_flags &= ~(INP_LOWPORT);
inp->inp_flags |= INP_HIGHPORT;
break;
case IP_PORTRANGE_LOW:
inp->inp_flags &= ~(INP_HIGHPORT);
inp->inp_flags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
INP_WUNLOCK(inp);
break;
#ifdef IPSEC
case IP_IPSEC_POLICY:
{
caddr_t req;
struct mbuf *m;
if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
break;
if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
break;
req = mtod(m, caddr_t);
error = ipsec4_set_policy(inp, sopt->sopt_name, req,
m->m_len, (sopt->sopt_td != NULL) ?
sopt->sopt_td->td_ucred : NULL);
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case SOPT_GET:
switch (sopt->sopt_name) {
case IP_OPTIONS:
case IP_RETOPTS:
if (inp->inp_options)
error = sooptcopyout(sopt,
mtod(inp->inp_options,
char *),
inp->inp_options->m_len);
else
sopt->sopt_valsize = 0;
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVTTL:
case IP_RECVIF:
case IP_PORTRANGE:
case IP_FAITH:
case IP_ONESBCAST:
case IP_DONTFRAG:
switch (sopt->sopt_name) {
case IP_TOS:
optval = inp->inp_ip_tos;
break;
case IP_TTL:
optval = inp->inp_ip_ttl;
break;
case IP_MINTTL:
optval = inp->inp_ip_minttl;
break;
#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_PORTRANGE:
if (inp->inp_flags & INP_HIGHPORT)
optval = IP_PORTRANGE_HIGH;
else if (inp->inp_flags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = 0;
break;
case IP_FAITH:
optval = OPTBIT(INP_FAITH);
break;
case IP_ONESBCAST:
optval = OPTBIT(INP_ONESBCAST);
break;
case IP_DONTFRAG:
optval = OPTBIT(INP_DONTFRAG);
break;
}
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
/*
* Multicast socket options are processed by the in_mcast
* module.
*/
case IP_MULTICAST_IF:
case IP_MULTICAST_VIF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_MSFILTER:
error = inp_getmoptions(inp, sopt);
break;
#ifdef IPSEC
case IP_IPSEC_POLICY:
{
struct mbuf *m = NULL;
caddr_t req = NULL;
size_t len = 0;
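/*
* Note (editorial): m is initialized to NULL and never assigned
* before the test below, so req/len always remain NULL/0 here and
* the conditional is effectively dead code kept from older revisions.
*/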
if (m != NULL) {
req = mtod(m, caddr_t);
len = m->m_len;
}
error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
if (error == 0)
error = soopt_mcopyout(sopt, m); /* XXX */
if (error == 0)
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be a loopback interface -- evil, but easier than
* replicating that code here.
*/
static void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst,
int hlen)
{
register struct ip *ip;
struct mbuf *copym;
/*
* Make a deep copy of the packet because we're going to
* modify the packet in order to generate checksums.
*/
copym = m_dup(m, M_DONTWAIT);
if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
copym = m_pullup(copym, hlen);
if (copym != NULL) {
/* If needed, compute the checksum and mark it as valid. */
if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(copym);
copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
copym->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
copym->m_pkthdr.csum_data = 0xffff;
}
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
ip->ip_len = htons(ip->ip_len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, hlen);
#if 1 /* XXX */
if (dst->sin_family != AF_INET) {
printf("ip_mloopback: bad address family %d\n",
dst->sin_family);
dst->sin_family = AF_INET;
}
#endif
if_simloop(ifp, copym, dst->sin_family, 0);
}
}
Index: head/sys/netinet/raw_ip.c
===================================================================
--- head/sys/netinet/raw_ip.c (revision 183549)
+++ head/sys/netinet/raw_ip.c (revision 183550)
@@ -1,991 +1,1006 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.7 (Berkeley) 5/15/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#endif /*IPSEC*/
#include <security/mac/mac_framework.h>
struct inpcbhead ripcb;
struct inpcbinfo ripcbinfo;
/* control hooks for ipfw and dummynet */
ip_fw_ctl_t *ip_fw_ctl_ptr = NULL;
ip_dn_ctl_t *ip_dn_ctl_ptr = NULL;
/*
* Hooks for multicast routing. They all default to NULL, so leave them
* uninitialized and rely on the BSS being zeroed.
*/
/*
* The socket used to communicate with the multicast routing daemon.
*/
struct socket *ip_mrouter;
/*
* The various mrouter and rsvp functions.
*/
int (*ip_mrouter_set)(struct socket *, struct sockopt *);
int (*ip_mrouter_get)(struct socket *, struct sockopt *);
int (*ip_mrouter_done)(void);
int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
struct ip_moptions *);
int (*mrt_ioctl)(int, caddr_t, int);
int (*legal_vif_num)(int);
u_long (*ip_mcast_src)(int);
void (*rsvp_input_p)(struct mbuf *m, int off);
int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
void (*ip_rsvp_force_done)(struct socket *);
/*
* Hash functions
*/
#define INP_PCBHASH_RAW_SIZE 256
#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
(((proto) + (laddr) + (faddr)) % (mask) + 1)
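/*
* The "+ 1" keeps fully-specified PCBs out of bucket 0, which is
* reserved for wildcard entries (those lacking a protocol, local, or
* foreign address); rip_input() therefore scans the computed bucket
* first and bucket 0 second.
*/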
static void
rip_inshash(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbhead *pcbhash;
int hash;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_ip_p != 0 &&
inp->inp_laddr.s_addr != INADDR_ANY &&
inp->inp_faddr.s_addr != INADDR_ANY) {
hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
} else
hash = 0;
pcbhash = &pcbinfo->ipi_hashbase[hash];
LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
}
static void
rip_delhash(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
LIST_REMOVE(inp, inp_hash);
}
/*
* Raw interface to IP protocol.
*/
/*
* Initialize the raw connection block queue.
*/
static void
rip_zone_change(void *tag)
{
+ INIT_VNET_INET(curvnet);
uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
}
static int
rip_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp = mem;
INP_LOCK_INIT(inp, "inp", "rawinp");
return (0);
}
void
rip_init(void)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip");
LIST_INIT(&V_ripcb);
V_ripcbinfo.ipi_listhead = &V_ripcb;
V_ripcbinfo.ipi_hashbase =
hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask);
V_ripcbinfo.ipi_porthashbase =
hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask);
V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb),
NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
static int
rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n,
struct sockaddr_in *ripsrc)
{
int policyfail = 0;
INP_RLOCK_ASSERT(last);
#ifdef IPSEC
/* check AH/ESP integrity. */
if (ipsec4_in_reject(n, last)) {
policyfail = 1;
}
#endif /* IPSEC */
#ifdef MAC
if (!policyfail && mac_inpcb_check_deliver(last, n) != 0)
policyfail = 1;
#endif
/* Check the minimum TTL for socket. */
if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl)
policyfail = 1;
if (!policyfail) {
struct mbuf *opts = NULL;
struct socket *so;
so = last->inp_socket;
if ((last->inp_flags & INP_CONTROLOPTS) ||
(so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
ip_savecontrol(last, &opts, ip, n);
SOCKBUF_LOCK(&so->so_rcv);
if (sbappendaddr_locked(&so->so_rcv,
(struct sockaddr *)ripsrc, n, opts) == 0) {
/* should notify about lost packet */
m_freem(n);
if (opts)
m_freem(opts);
SOCKBUF_UNLOCK(&so->so_rcv);
} else
sorwakeup_locked(so);
} else
m_freem(n);
return (policyfail);
}
/*
* Set up the generic address and protocol structures for the raw_input
* routine, then pass them along with the mbuf chain.
*/
void
rip_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = mtod(m, struct ip *);
int proto = ip->ip_p;
struct inpcb *inp, *last;
struct sockaddr_in ripsrc;
int hash;
bzero(&ripsrc, sizeof(ripsrc));
ripsrc.sin_len = sizeof(ripsrc);
ripsrc.sin_family = AF_INET;
ripsrc.sin_addr = ip->ip_src;
last = NULL;
hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr,
ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
INP_INFO_RLOCK(&V_ripcbinfo);
LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) {
if (inp->inp_ip_p != proto)
continue;
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
continue;
if (inp->inp_faddr.s_addr != ip->ip_src.s_addr)
continue;
INP_RLOCK(inp);
if (jailed(inp->inp_socket->so_cred) &&
(htonl(prison_getip(inp->inp_socket->so_cred)) !=
ip->ip_dst.s_addr)) {
INP_RUNLOCK(inp);
continue;
}
if (last) {
struct mbuf *n;
n = m_copy(m, 0, (int)M_COPYALL);
if (n != NULL)
(void) rip_append(last, ip, n, &ripsrc);
/* XXX count dropped packet */
INP_RUNLOCK(last);
}
last = inp;
}
LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) {
if (inp->inp_ip_p && inp->inp_ip_p != proto)
continue;
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_laddr.s_addr &&
inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
continue;
if (inp->inp_faddr.s_addr &&
inp->inp_faddr.s_addr != ip->ip_src.s_addr)
continue;
INP_RLOCK(inp);
if (jailed(inp->inp_socket->so_cred) &&
(htonl(prison_getip(inp->inp_socket->so_cred)) !=
ip->ip_dst.s_addr)) {
INP_RUNLOCK(inp);
continue;
}
if (last) {
struct mbuf *n;
n = m_copy(m, 0, (int)M_COPYALL);
if (n != NULL)
(void) rip_append(last, ip, n, &ripsrc);
/* XXX count dropped packet */
INP_RUNLOCK(last);
}
last = inp;
}
INP_INFO_RUNLOCK(&V_ripcbinfo);
if (last != NULL) {
if (rip_append(last, ip, m, &ripsrc) != 0)
V_ipstat.ips_delivered--;
INP_RUNLOCK(last);
} else {
m_freem(m);
V_ipstat.ips_noproto++;
V_ipstat.ips_delivered--;
}
}
/*
* Generate an IP header and pass the packet to ip_output. Tack on
* options the user may have set up with a control call.
*/
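/*
* Illustrative userland counterpart (a sketch of standard raw-socket
* usage rather than code from this file): a process that enables
* IP_HDRINCL supplies the complete header itself and takes the second
* branch below.
*
*	int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
*	int on = 1;
*	setsockopt(s, IPPROTO_IP, IP_HDRINCL, &on, sizeof(on));
*/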
int
rip_output(struct mbuf *m, struct socket *so, u_long dst)
{
+ INIT_VNET_INET(so->so_vnet);
struct ip *ip;
int error;
struct inpcb *inp = sotoinpcb(so);
int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
IP_ALLOWBROADCAST;
/*
* If the user handed us a complete IP packet, use it. Otherwise,
* allocate an mbuf for a header and fill it in.
*/
if ((inp->inp_flags & INP_HDRINCL) == 0) {
if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
m_freem(m);
return (EMSGSIZE);
}
M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
if (m == NULL)
return (ENOBUFS);
INP_RLOCK(inp);
ip = mtod(m, struct ip *);
ip->ip_tos = inp->inp_ip_tos;
if (inp->inp_flags & INP_DONTFRAG)
ip->ip_off = IP_DF;
else
ip->ip_off = 0;
ip->ip_p = inp->inp_ip_p;
ip->ip_len = m->m_pkthdr.len;
if (jailed(inp->inp_socket->so_cred))
ip->ip_src.s_addr =
htonl(prison_getip(inp->inp_socket->so_cred));
else
ip->ip_src = inp->inp_laddr;
ip->ip_dst.s_addr = dst;
ip->ip_ttl = inp->inp_ip_ttl;
} else {
if (m->m_pkthdr.len > IP_MAXPACKET) {
m_freem(m);
return (EMSGSIZE);
}
INP_RLOCK(inp);
ip = mtod(m, struct ip *);
if (jailed(inp->inp_socket->so_cred)) {
if (ip->ip_src.s_addr !=
htonl(prison_getip(inp->inp_socket->so_cred))) {
INP_RUNLOCK(inp);
m_freem(m);
return (EPERM);
}
}
/*
* Don't allow both user specified and setsockopt options,
* and don't allow packet length sizes that will crash.
*/
if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options)
|| (ip->ip_len > m->m_pkthdr.len)
|| (ip->ip_len < (ip->ip_hl << 2))) {
INP_RUNLOCK(inp);
m_freem(m);
return (EINVAL);
}
if (ip->ip_id == 0)
ip->ip_id = ip_newid();
/*
* XXX prevent ip_output from overwriting header fields.
*/
flags |= IP_RAWOUTPUT;
V_ipstat.ips_rawout++;
}
if (inp->inp_flags & INP_ONESBCAST)
flags |= IP_SENDONES;
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
error = ip_output(m, inp->inp_options, NULL, flags,
inp->inp_moptions, inp);
INP_RUNLOCK(inp);
return (error);
}
/*
* Raw IP socket option processing.
*
* IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
* only be created by a privileged process, and as such, socket option
* operations to manage system properties on any raw socket were allowed to
* take place without explicit additional access control checks. However,
* raw sockets can now also be created in jail(), and therefore explicit
* checks are now required. Likewise, raw sockets can be used by a process
* after it gives up privilege, so some caution is required. For options
* passed down to the IP layer via ip_ctloutput(), checks are assumed to be
* performed in ip_ctloutput() and therefore no check occurs here.
* Unilaterally checking priv_check() here breaks normal IP socket option
* operations on raw sockets.
*
* When adding new socket options here, make sure to add access control
* checks here as necessary.
*/
int
rip_ctloutput(struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
int error, optval;
if (sopt->sopt_level != IPPROTO_IP)
return (EINVAL);
error = 0;
switch (sopt->sopt_dir) {
case SOPT_GET:
switch (sopt->sopt_name) {
case IP_HDRINCL:
optval = inp->inp_flags & INP_HDRINCL;
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
case IP_FW_ADD: /* ADD actually returns the body... */
case IP_FW_GET:
case IP_FW_TABLE_GETSIZE:
case IP_FW_TABLE_LIST:
case IP_FW_NAT_GET_CONFIG:
case IP_FW_NAT_GET_LOG:
if (ip_fw_ctl_ptr != NULL)
error = ip_fw_ctl_ptr(sopt);
else
error = ENOPROTOOPT;
break;
case IP_DUMMYNET_GET:
if (ip_dn_ctl_ptr != NULL)
error = ip_dn_ctl_ptr(sopt);
else
error = ENOPROTOOPT;
break;
case MRT_INIT:
case MRT_DONE:
case MRT_ADD_VIF:
case MRT_DEL_VIF:
case MRT_ADD_MFC:
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error != 0)
return (error);
error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
EOPNOTSUPP;
break;
default:
error = ip_ctloutput(so, sopt);
break;
}
break;
case SOPT_SET:
switch (sopt->sopt_name) {
case IP_HDRINCL:
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
break;
if (optval)
inp->inp_flags |= INP_HDRINCL;
else
inp->inp_flags &= ~INP_HDRINCL;
break;
case IP_FW_ADD:
case IP_FW_DEL:
case IP_FW_FLUSH:
case IP_FW_ZERO:
case IP_FW_RESETLOG:
case IP_FW_TABLE_ADD:
case IP_FW_TABLE_DEL:
case IP_FW_TABLE_FLUSH:
case IP_FW_NAT_CFG:
case IP_FW_NAT_DEL:
if (ip_fw_ctl_ptr != NULL)
error = ip_fw_ctl_ptr(sopt);
else
error = ENOPROTOOPT;
break;
case IP_DUMMYNET_CONFIGURE:
case IP_DUMMYNET_DEL:
case IP_DUMMYNET_FLUSH:
if (ip_dn_ctl_ptr != NULL)
error = ip_dn_ctl_ptr(sopt);
else
error = ENOPROTOOPT;
break;
case IP_RSVP_ON:
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error != 0)
return (error);
error = ip_rsvp_init(so);
break;
case IP_RSVP_OFF:
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error != 0)
return (error);
error = ip_rsvp_done();
break;
case IP_RSVP_VIF_ON:
case IP_RSVP_VIF_OFF:
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error != 0)
return (error);
error = ip_rsvp_vif ?
ip_rsvp_vif(so, sopt) : EINVAL;
break;
case MRT_INIT:
case MRT_DONE:
case MRT_ADD_VIF:
case MRT_DEL_VIF:
case MRT_ADD_MFC:
case MRT_DEL_MFC:
case MRT_VERSION:
case MRT_ASSERT:
case MRT_API_SUPPORT:
case MRT_API_CONFIG:
case MRT_ADD_BW_UPCALL:
case MRT_DEL_BW_UPCALL:
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error != 0)
return (error);
error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
EOPNOTSUPP;
break;
default:
error = ip_ctloutput(so, sopt);
break;
}
break;
}
return (error);
}
/*
* This function exists solely to receive the PRC_IFDOWN messages which are
* sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls
* in_ifadown() to remove all routes corresponding to that address. It also
* receives the PRC_IFUP messages from if_up() and reinstalls the interface
* routes.
*/
void
rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
+ INIT_VNET_INET(curvnet);
struct in_ifaddr *ia;
struct ifnet *ifp;
int err;
int flags;
switch (cmd) {
case PRC_IFDOWN:
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifa.ifa_addr == sa
&& (ia->ia_flags & IFA_ROUTE)) {
/*
* in_ifscrub kills the interface route.
*/
in_ifscrub(ia->ia_ifp, ia);
/*
* in_ifadown gets rid of all the rest of the
* routes. This is not quite the right thing
* to do, but at least if we are running a
* routing process they will come back.
*/
in_ifadown(&ia->ia_ifa, 0);
break;
}
}
break;
case PRC_IFUP:
TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifa.ifa_addr == sa)
break;
}
if (ia == NULL || (ia->ia_flags & IFA_ROUTE))
return;
flags = RTF_UP;
ifp = ia->ia_ifa.ifa_ifp;
if ((ifp->if_flags & IFF_LOOPBACK)
|| (ifp->if_flags & IFF_POINTOPOINT))
flags |= RTF_HOST;
err = rtinit(&ia->ia_ifa, RTM_ADD, flags);
if (err == 0)
ia->ia_flags |= IFA_ROUTE;
break;
}
}
u_long rip_sendspace = 9216;
u_long rip_recvspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
&rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
&rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
static int
rip_attach(struct socket *so, int proto, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
error = priv_check(td, PRIV_NETINET_RAW);
if (error)
return (error);
if (proto >= IPPROTO_MAX || proto < 0)
return EPROTONOSUPPORT;
error = soreserve(so, rip_sendspace, rip_recvspace);
if (error)
return (error);
INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
inp->inp_vflag |= INP_IPV4;
inp->inp_ip_p = proto;
inp->inp_ip_ttl = V_ip_defttl;
rip_inshash(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
return (0);
}
static void
rip_detach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("rip_detach: not closed"));
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
rip_delhash(inp);
if (so == V_ip_mrouter && ip_mrouter_done)
ip_mrouter_done();
if (ip_rsvp_force_done)
ip_rsvp_force_done(so);
if (so == V_ip_rsvpd)
ip_rsvp_done();
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static void
rip_dodisconnect(struct socket *so, struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
rip_delhash(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
rip_inshash(inp);
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED;
SOCK_UNLOCK(so);
}
static void
rip_abort(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
rip_dodisconnect(so, inp);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static void
rip_close(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_close: inp == NULL"));
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
rip_dodisconnect(so, inp);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
}
static int
rip_disconnect(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
if ((so->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
rip_dodisconnect(so, inp);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_NET(so->so_vnet);
+ INIT_VNET_INET(so->so_vnet);
struct sockaddr_in *addr = (struct sockaddr_in *)nam;
struct inpcb *inp;
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if (jailed(td->td_ucred)) {
if (addr->sin_addr.s_addr == INADDR_ANY)
addr->sin_addr.s_addr =
htonl(prison_getip(td->td_ucred));
if (htonl(prison_getip(td->td_ucred)) != addr->sin_addr.s_addr)
return (EADDRNOTAVAIL);
}
if (TAILQ_EMPTY(&V_ifnet) ||
(addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
(addr->sin_addr.s_addr &&
ifa_ifwithaddr((struct sockaddr *)addr) == 0))
return (EADDRNOTAVAIL);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
rip_delhash(inp);
inp->inp_laddr = addr->sin_addr;
rip_inshash(inp);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_NET(so->so_vnet);
+ INIT_VNET_INET(so->so_vnet);
struct sockaddr_in *addr = (struct sockaddr_in *)nam;
struct inpcb *inp;
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if (TAILQ_EMPTY(&V_ifnet))
return (EADDRNOTAVAIL);
if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
return (EAFNOSUPPORT);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
rip_delhash(inp);
inp->inp_faddr = addr->sin_addr;
rip_inshash(inp);
soisconnected(so);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip_shutdown(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
INP_WLOCK(inp);
socantsendmore(so);
INP_WUNLOCK(inp);
return (0);
}
static int
rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
struct inpcb *inp;
u_long dst;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_send: inp == NULL"));
/*
* Note: 'dst' reads below are unlocked.
*/
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
m_freem(m);
return (EISCONN);
}
dst = inp->inp_faddr.s_addr; /* Unlocked read. */
} else {
if (nam == NULL) {
m_freem(m);
return (ENOTCONN);
}
dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr;
}
return (rip_output(m, so, dst));
}
static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
int error, i, n;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
/*
* The process of preparing the PCB list is too time-consuming and
* resource-intensive to repeat twice on every request.
*/
if (req->oldptr == 0) {
n = V_ripcbinfo.ipi_count;
req->oldidx = 2 * (sizeof xig)
+ (n + n/8) * sizeof(struct xinpcb);
return (0);
}
if (req->newptr != 0)
return (EPERM);
/*
* OK, now we're committed to doing something.
*/
INP_INFO_RLOCK(&V_ripcbinfo);
gencnt = V_ripcbinfo.ipi_gencnt;
n = V_ripcbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_ripcbinfo);
xig.xig_len = sizeof xig;
xig.xig_count = n;
xig.xig_gen = gencnt;
xig.xig_sogen = so_gencnt;
error = SYSCTL_OUT(req, &xig, sizeof xig);
if (error)
return (error);
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
if (inp_list == NULL)
return (ENOMEM);
INP_INFO_RLOCK(&V_ripcbinfo);
for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
inp = LIST_NEXT(inp, inp_list)) {
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt &&
cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) {
/* XXX held references? */
inp_list[i++] = inp;
}
INP_RUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_ripcbinfo);
n = i;
error = 0;
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt) {
struct xinpcb xi;
bzero(&xi, sizeof(xi));
xi.xi_len = sizeof xi;
/* XXX should avoid extra copy */
bcopy(inp, &xi.xi_inp, sizeof *inp);
if (inp->inp_socket)
sotoxsocket(inp->inp_socket, &xi.xi_socket);
INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
} else
INP_RUNLOCK(inp);
}
if (!error) {
/*
* Give the user an updated idea of our state. If the
* generation differs from what we told her before, she knows
* that something happened while we were processing this
* request, and it might be necessary to retry.
*/
INP_INFO_RLOCK(&V_ripcbinfo);
xig.xig_gen = V_ripcbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_ripcbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_ripcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
free(inp_list, M_TEMP);
return (error);
}
SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0,
rip_pcblist, "S,xinpcb", "List of active raw IP sockets");
struct pr_usrreqs rip_usrreqs = {
.pru_abort = rip_abort,
.pru_attach = rip_attach,
.pru_bind = rip_bind,
.pru_connect = rip_connect,
.pru_control = in_control,
.pru_detach = rip_detach,
.pru_disconnect = rip_disconnect,
.pru_peeraddr = in_getpeeraddr,
.pru_send = rip_send,
.pru_shutdown = rip_shutdown,
.pru_sockaddr = in_getsockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = rip_close,
};
Index: head/sys/netinet/tcp_hostcache.c
===================================================================
--- head/sys/netinet/tcp_hostcache.c (revision 183549)
+++ head/sys/netinet/tcp_hostcache.c (revision 183550)
@@ -1,689 +1,704 @@
/*-
* Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* The tcp_hostcache moves the tcp-specific cached metrics from the routing
* table to a dedicated structure indexed by the remote IP address. It keeps
* information on the measured TCP parameters of past TCP sessions to allow
* better initial start values to be used with later connections to/from the
* same source. Depending on the network parameters (delay, bandwidth, max
* MTU, congestion window) between local and remote sites, this can lead to
* significant speed-ups for new TCP connections after the first one.
*
* Due to the tcp_hostcache, all TCP-specific metrics information in the
* routing table has been removed. The inpcb no longer keeps a pointer to
* the routing entry, and protocol-initiated route cloning has been removed
* as well. With these changes, the routing table has gone back to being
* more lightweight and only carries information related to packet forwarding.
*
* tcp_hostcache is designed for multiple concurrent access in SMP
* environments and high contention. All bucket rows have their own lock and
* thus multiple lookups and modifies can be done at the same time as long as
* they are in different bucket rows. If a request for insertion of a new
* record can't be satisfied, it simply returns an empty structure. Nobody
* and nothing outside of tcp_hostcache.c will ever point directly to any
* entry in the tcp_hostcache. All communication is done in an
* object-oriented way and only functions of tcp_hostcache will manipulate
* hostcache entries. Otherwise, we are unable to achieve good behaviour in
* concurrent access situations. Since tcp_hostcache is only caching
* information, there are no fatal consequences if we either can't satisfy
* any particular request or have to drop/overwrite an existing entry because
* of bucket-limit memory constraints.
*/
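/*
* Typical consumer pattern (a sketch; tcp_mss() in tcp_input.c is a
* real caller): fetch cached metrics for a peer before completing
* connection setup, e.g.
*
*	struct hc_metrics_lite metrics;
*	tcp_hc_get(&inp->inp_inc, &metrics);
*	if (metrics.rmx_mtu != 0)
*		... seed the MSS from the cached path MTU ...
*/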
/*
* Many thanks to jlemon for basic structure of tcp_syncache which is being
* followed here.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <vm/uma.h>
TAILQ_HEAD(hc_qhead, hc_metrics);
struct hc_head {
struct hc_qhead hch_bucket;
u_int hch_length;
struct mtx hch_mtx;
};
struct hc_metrics {
/* housekeeping */
TAILQ_ENTRY(hc_metrics) rmx_q;
struct hc_head *rmx_head; /* head of bucket tail queue */
struct in_addr ip4; /* IP address */
struct in6_addr ip6; /* IP6 address */
/* endpoint specific values for TCP */
u_long rmx_mtu; /* MTU for this path */
u_long rmx_ssthresh; /* outbound gateway buffer limit */
u_long rmx_rtt; /* estimated round trip time */
u_long rmx_rttvar; /* estimated rtt variance */
u_long rmx_bandwidth; /* estimated bandwidth */
u_long rmx_cwnd; /* congestion window */
u_long rmx_sendpipe; /* outbound delay-bandwidth product */
u_long rmx_recvpipe; /* inbound delay-bandwidth product */
/* TCP hostcache internal data */
int rmx_expire; /* lifetime for object */
u_long rmx_hits; /* number of hits */
u_long rmx_updates; /* number of updates */
};
/* Arbitrary values */
#define TCP_HOSTCACHE_HASHSIZE 512
#define TCP_HOSTCACHE_BUCKETLIMIT 30
#define TCP_HOSTCACHE_EXPIRE (60*60) /* one hour */
#define TCP_HOSTCACHE_PRUNE (5*60) /* every 5 minutes */
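/*
* Both intervals are in seconds; tcp_hc_init() converts the prune
* interval to callout ticks (prune * hz) when arming the purge timer.
*/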
struct tcp_hostcache {
struct hc_head *hashbase;
uma_zone_t zone;
u_int hashsize;
u_int hashmask;
u_int bucket_limit;
u_int cache_count;
u_int cache_limit;
int expire;
int prune;
int purgeall;
};
static struct tcp_hostcache tcp_hostcache;
static struct callout tcp_hc_callout;
static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
static void tcp_hc_purge(void *);
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0,
"TCP Host cache");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
- &tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, cachelimit,
+ CTLFLAG_RDTUN, tcp_hostcache.cache_limit, 0,
+ "Overall entry limit for hostcache");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
- &tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, hashsize,
+ CTLFLAG_RDTUN, tcp_hostcache.hashsize, 0,
+ "Size of TCP hostcache hashtable");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
- &tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
+ CTLFLAG_RDTUN, tcp_hostcache.bucket_limit, 0,
+ "Per-bucket hash limit for hostcache");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
- &tcp_hostcache.cache_count, 0, "Current number of entries in hostcache");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, count,
+ CTLFLAG_RD, tcp_hostcache.cache_count, 0,
+ "Current number of entries in hostcache");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
- &tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, expire,
+ CTLFLAG_RW, tcp_hostcache.expire, 0,
+ "Expire time of TCP hostcache entries");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW,
- &tcp_hostcache.prune, 0, "Time between purge runs");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, prune,
+ CTLFLAG_RW, tcp_hostcache.prune, 0, "Time between purge runs");
-SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
- &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_hostcache, OID_AUTO, purge,
+ CTLFLAG_RW, tcp_hostcache.purgeall, 0,
+ "Expire all entires on next purge run");
SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
sysctl_tcp_hc_list, "A", "List of all hostcache entries");
static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
#define HOSTCACHE_HASH(ip) \
(((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \
V_tcp_hostcache.hashmask)
/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
#define HOSTCACHE_HASH6(ip6) \
(((ip6)->s6_addr32[0] ^ \
(ip6)->s6_addr32[1] ^ \
(ip6)->s6_addr32[2] ^ \
(ip6)->s6_addr32[3]) & \
V_tcp_hostcache.hashmask)
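/*
* Example (a sketch): both hash macros XOR-fold the address and mask
* the result with hashmask, which is hashsize - 1 and therefore only
* a valid modulus because tcp_hc_init() forces hashsize to a power
* of 2.
*/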
#define THC_LOCK(lp) mtx_lock(lp)
#define THC_UNLOCK(lp) mtx_unlock(lp)
void
tcp_hc_init(void)
{
+ INIT_VNET_INET(curvnet);
int i;
/*
* Initialize hostcache structures.
*/
V_tcp_hostcache.cache_count = 0;
V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
V_tcp_hostcache.cache_limit =
V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
&V_tcp_hostcache.hashsize);
TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
&V_tcp_hostcache.cache_limit);
TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
&V_tcp_hostcache.bucket_limit);
if (!powerof2(V_tcp_hostcache.hashsize)) {
printf("WARNING: hostcache hash size is not a power of 2.\n");
V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
}
V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;
/*
* Allocate the hash table.
*/
V_tcp_hostcache.hashbase = (struct hc_head *)
malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
M_HOSTCACHE, M_WAITOK | M_ZERO);
/*
* Initialize the hash buckets.
*/
for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
V_tcp_hostcache.hashbase[i].hch_length = 0;
mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
NULL, MTX_DEF);
}
/*
* Allocate the hostcache entries.
*/
V_tcp_hostcache.zone =
uma_zcreate("hostcache", sizeof(struct hc_metrics),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
/*
* Set up periodic cache cleanup.
*/
callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE);
callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
tcp_hc_purge, 0);
}
/*
* Internal function: look up an entry in the hostcache or return NULL.
*
* If an entry has been returned, the caller becomes responsible for
* unlocking the bucket row after he is done reading/modifying the entry.
*/
static struct hc_metrics *
tcp_hc_lookup(struct in_conninfo *inc)
{
+ INIT_VNET_INET(curvnet);
int hash;
struct hc_head *hc_head;
struct hc_metrics *hc_entry;
KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
/*
* Hash the foreign ip address.
*/
if (inc->inc_isipv6)
hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
else
hash = HOSTCACHE_HASH(&inc->inc_faddr);
hc_head = &V_tcp_hostcache.hashbase[hash];
/*
* Acquire lock for this bucket row; we release the lock if we don't
* find an entry, otherwise the caller has to unlock after he is
* done.
*/
THC_LOCK(&hc_head->hch_mtx);
/*
* Iterate through entries in bucket row looking for a match.
*/
TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
if (inc->inc_isipv6) {
if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
sizeof(inc->inc6_faddr)) == 0)
return hc_entry;
} else {
if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
sizeof(inc->inc_faddr)) == 0)
return hc_entry;
}
}
/*
* We were unsuccessful and didn't find anything.
*/
THC_UNLOCK(&hc_head->hch_mtx);
return NULL;
}
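/*
 * Editor's illustration (hypothetical caller, not in this commit): a
 * non-NULL return from tcp_hc_lookup() transfers ownership of the
 * bucket-row mutex, so the caller must always pair it with THC_UNLOCK()
 * on the entry's row once it has finished with the entry.
 */
static void __unused
example_hc_reader(struct in_conninfo *inc)
{
	struct hc_metrics *hc_entry;

	hc_entry = tcp_hc_lookup(inc);
	if (hc_entry == NULL)
		return;		/* row was already unlocked for us */
	/* ... read or modify hc_entry while the row stays locked ... */
	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}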
/*
* Internal function: insert an entry into the hostcache or return NULL if
* unable to allocate a new one.
*
* If an entry has been returned, the caller becomes responsible for
* unlocking the bucket row when done reading or modifying the entry.
*/
static struct hc_metrics *
tcp_hc_insert(struct in_conninfo *inc)
{
+ INIT_VNET_INET(curvnet);
int hash;
struct hc_head *hc_head;
struct hc_metrics *hc_entry;
KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
/*
* Hash the foreign ip address.
*/
if (inc->inc_isipv6)
hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
else
hash = HOSTCACHE_HASH(&inc->inc_faddr);
hc_head = &V_tcp_hostcache.hashbase[hash];
/*
* Acquire lock for this bucket row; we release the lock only if we
* fail to allocate or reuse an entry, otherwise the caller has to
* unlock when done.
*/
THC_LOCK(&hc_head->hch_mtx);
/*
* If the bucket limit is reached, reuse the least-used element.
*/
if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) {
hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
/*
* At first we were dropping the last element, just to
* reacquire it in the next two lines again, which isn't very
* efficient. Instead just reuse the least used element.
* We may drop something that is still "in-use" but we can be
* "lossy".
* Just give up if this bucket row is empty and we don't have
* anything to replace.
*/
if (hc_entry == NULL) {
THC_UNLOCK(&hc_head->hch_mtx);
return NULL;
}
TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
V_tcp_hostcache.hashbase[hash].hch_length--;
V_tcp_hostcache.cache_count--;
V_tcpstat.tcps_hc_bucketoverflow++;
#if 0
uma_zfree(V_tcp_hostcache.zone, hc_entry);
#endif
} else {
/*
* Allocate a new entry, or balk if not possible.
*/
hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
if (hc_entry == NULL) {
THC_UNLOCK(&hc_head->hch_mtx);
return NULL;
}
}
/*
* Initialize basic information of hostcache entry.
*/
bzero(hc_entry, sizeof(*hc_entry));
if (inc->inc_isipv6)
bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
else
hc_entry->ip4 = inc->inc_faddr;
hc_entry->rmx_head = hc_head;
hc_entry->rmx_expire = V_tcp_hostcache.expire;
/*
* Put it at the front of the bucket row.
*/
TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
V_tcp_hostcache.hashbase[hash].hch_length++;
V_tcp_hostcache.cache_count++;
V_tcpstat.tcps_hc_added++;
return hc_entry;
}
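/*
 * Editor's sketch (user-space, not part of this commit): the reuse
 * policy above is a cheap LRU -- entries enter at the head and victims
 * are taken from the tail, so TAILQ_LAST() yields the least recently
 * inserted or touched element.
 */
#include <sys/queue.h>
#include <stdio.h>

struct node {
	int id;
	TAILQ_ENTRY(node) q;
};
TAILQ_HEAD(qhead, node);

int
main(void)
{
	struct qhead head = TAILQ_HEAD_INITIALIZER(head);
	struct node a = { .id = 1 }, b = { .id = 2 };

	TAILQ_INSERT_HEAD(&head, &a, q);
	TAILQ_INSERT_HEAD(&head, &b, q);	/* b is now most recent */
	printf("victim: %d\n", TAILQ_LAST(&head, qhead)->id);	/* 1 */
	return (0);
}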
/*
* External function: look up an entry in the hostcache and fill out the
* supplied TCP metrics structure. Fills in 0 when no entry was found or
* a value is not set.
*/
void
tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
{
+ INIT_VNET_INET(curvnet);
struct hc_metrics *hc_entry;
/*
* Find the right bucket.
*/
hc_entry = tcp_hc_lookup(inc);
/*
* If we don't have an existing object.
*/
if (hc_entry == NULL) {
bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
return;
}
hc_entry->rmx_hits++;
hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
/*
* Unlock bucket row.
*/
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* External function: look up an entry in the hostcache and return the
* discovered path MTU. Returns 0 if no entry is found or the value is not
* set.
*/
u_long
tcp_hc_getmtu(struct in_conninfo *inc)
{
+ INIT_VNET_INET(curvnet);
struct hc_metrics *hc_entry;
u_long mtu;
hc_entry = tcp_hc_lookup(inc);
if (hc_entry == NULL) {
return 0;
}
hc_entry->rmx_hits++;
hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
mtu = hc_entry->rmx_mtu;
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
return mtu;
}
/*
* External function: update the MTU value of an entry in the hostcache.
* Creates a new entry if none was found.
*/
void
tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
{
+ INIT_VNET_INET(curvnet);
struct hc_metrics *hc_entry;
/*
* Find the right bucket.
*/
hc_entry = tcp_hc_lookup(inc);
/*
* If we don't have an existing object, try to insert a new one.
*/
if (hc_entry == NULL) {
hc_entry = tcp_hc_insert(inc);
if (hc_entry == NULL)
return;
}
hc_entry->rmx_updates++;
hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
hc_entry->rmx_mtu = mtu;
/*
* Move it to the front so we find it faster next time.
*/
TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
/*
* Unlock bucket row.
*/
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
/*
* External function: update the TCP metrics of an entry in the hostcache.
* Creates a new entry if none was found.
*/
void
tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
{
+ INIT_VNET_INET(curvnet);
struct hc_metrics *hc_entry;
hc_entry = tcp_hc_lookup(inc);
if (hc_entry == NULL) {
hc_entry = tcp_hc_insert(inc);
if (hc_entry == NULL)
return;
}
hc_entry->rmx_updates++;
hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */
if (hcml->rmx_rtt != 0) {
if (hc_entry->rmx_rtt == 0)
hc_entry->rmx_rtt = hcml->rmx_rtt;
else
hc_entry->rmx_rtt =
(hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
V_tcpstat.tcps_cachedrtt++;
}
if (hcml->rmx_rttvar != 0) {
if (hc_entry->rmx_rttvar == 0)
hc_entry->rmx_rttvar = hcml->rmx_rttvar;
else
hc_entry->rmx_rttvar =
(hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
V_tcpstat.tcps_cachedrttvar++;
}
if (hcml->rmx_ssthresh != 0) {
if (hc_entry->rmx_ssthresh == 0)
hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
else
hc_entry->rmx_ssthresh =
(hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
V_tcpstat.tcps_cachedssthresh++;
}
if (hcml->rmx_bandwidth != 0) {
if (hc_entry->rmx_bandwidth == 0)
hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
else
hc_entry->rmx_bandwidth =
(hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
/* V_tcpstat.tcps_cachedbandwidth++; */
}
if (hcml->rmx_cwnd != 0) {
if (hc_entry->rmx_cwnd == 0)
hc_entry->rmx_cwnd = hcml->rmx_cwnd;
else
hc_entry->rmx_cwnd =
(hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
/* V_tcpstat.tcps_cachedcwnd++; */
}
if (hcml->rmx_sendpipe != 0) {
if (hc_entry->rmx_sendpipe == 0)
hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
else
hc_entry->rmx_sendpipe =
(hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2;
/* V_tcpstat.tcps_cachedsendpipe++; */
}
if (hcml->rmx_recvpipe != 0) {
if (hc_entry->rmx_recvpipe == 0)
hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
else
hc_entry->rmx_recvpipe =
(hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2;
/* V_tcpstat.tcps_cachedrecvpipe++; */
}
TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
}
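/*
 * Editor's note (not from this commit): every metric above is blended
 * with a fixed 1/2 weight -- effectively an EWMA with alpha = 0.5 -- so
 * a single stale sample decays quickly.  Worked example: an old RTT of
 * 100 updated with samples 60, 60, 60 moves to 80, 70, then 65.
 */
static u_long
hc_blend(u_long old, u_long sample)
{
	return (old == 0 ? sample : (old + sample) / 2);
}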
/*
* Sysctl function: prints the list and values of all hostcache entries in
* unsorted order.
*/
static int
sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
int bufsize;
int linesize = 128;
char *p, *buf;
int len, i, error;
struct hc_metrics *hc_entry;
#ifdef INET6
char ip6buf[INET6_ADDRSTRLEN];
#endif
bufsize = linesize * (V_tcp_hostcache.cache_count + 1);
p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
len = snprintf(p, linesize,
"\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH "
" CWND SENDPIPE RECVPIPE HITS UPD EXP\n");
p += len;
#define msec(u) (((u) + 500) / 1000)
for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
rmx_q) {
len = snprintf(p, linesize,
"%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
"%4lu %4lu %4i\n",
hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
#ifdef INET6
ip6_sprintf(ip6buf, &hc_entry->ip6),
#else
"IPv6?",
#endif
hc_entry->rmx_mtu,
hc_entry->rmx_ssthresh,
msec(hc_entry->rmx_rtt *
(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
msec(hc_entry->rmx_rttvar *
(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
hc_entry->rmx_bandwidth * 8,
hc_entry->rmx_cwnd,
hc_entry->rmx_sendpipe,
hc_entry->rmx_recvpipe,
hc_entry->rmx_hits,
hc_entry->rmx_updates,
hc_entry->rmx_expire);
p += len;
}
THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
}
#undef msec
error = SYSCTL_OUT(req, buf, p - buf);
free(buf, M_TEMP);
return(error);
}
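/*
 * Editor's sketch (not part of this commit): the msec() macro used
 * above is a round-to-nearest microseconds-to-milliseconds conversion;
 * the (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)) factor first expands the
 * scaled RTT ticks into microseconds.  Standalone check:
 */
#include <stdio.h>

#define	MSEC(u)	(((u) + 500) / 1000)

int
main(void)
{
	printf("%lu %lu\n", MSEC(1499UL), MSEC(1500UL));	/* 1 2 */
	return (0);
}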
/*
* Expire and purge (old|all) entries in the tcp_hostcache. Runs
* periodically from the callout.
*/
static void
tcp_hc_purge(void *arg)
{
+ INIT_VNET_INET(curvnet);
struct hc_metrics *hc_entry, *hc_next;
int all = (intptr_t)arg;
int i;
if (V_tcp_hostcache.purgeall) {
all = 1;
V_tcp_hostcache.purgeall = 0;
}
for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
TAILQ_FOREACH_SAFE(hc_entry,
&V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) {
if (all || hc_entry->rmx_expire <= 0) {
TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket,
hc_entry, rmx_q);
uma_zfree(V_tcp_hostcache.zone, hc_entry);
V_tcp_hostcache.hashbase[i].hch_length--;
V_tcp_hostcache.cache_count--;
} else
hc_entry->rmx_expire -= V_tcp_hostcache.prune;
}
THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
}
callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
tcp_hc_purge, arg);
}
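/*
 * Editor's illustration (hypothetical numbers, not in this commit): the
 * purge above subtracts "prune" seconds from rmx_expire on each pass
 * and frees the entry once the counter reaches zero, so an idle entry
 * created with expire = 3600 and prune = 300 is reclaimed after about
 * expire / prune = 12 passes; any hit or update resets the countdown.
 */
#include <stdio.h>

int
main(void)
{
	int expire = 3600, prune = 300, passes = 0;

	while (expire > 0) {		/* mirrors the rmx_expire <= 0 test */
		expire -= prune;
		passes++;
	}
	printf("reclaimed after ~%d passes (%d s idle)\n", passes,
	    passes * prune);
	return (0);
}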
Index: head/sys/netinet/tcp_input.c
===================================================================
--- head/sys/netinet/tcp_input.c (revision 183549)
+++ head/sys/netinet/tcp_input.c (revision 183550)
@@ -1,3172 +1,3188 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipfw.h" /* for ipfw_fwd */
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#define TCPSTATES /* for logging */
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_syncache.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /*IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
static const int tcprexmtthresh = 3;
struct tcpstat tcpstat;
-SYSCTL_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
- &tcpstat , tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_STATS, stats,
+ CTLFLAG_RW, tcpstat , tcpstat,
+ "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
int tcp_log_in_vain = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
&tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports");
static int blackhole = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
- &blackhole, 0, "Do not send RST on segments to closed ports");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+ blackhole, 0, "Do not send RST on segments to closed ports");
int tcp_delack_enabled = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
- &tcp_delack_enabled, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, delayed_ack,
+ CTLFLAG_RW, tcp_delack_enabled, 0,
"Delay ACK to try and piggyback it onto a data packet");
static int drop_synfin = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
- &drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, drop_synfin,
+ CTLFLAG_RW, drop_synfin, 0, "Drop TCP packets with SYN+FIN set");
static int tcp_do_rfc3042 = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
- &tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
+ tcp_do_rfc3042, 0, "Enable RFC 3042 (Limited Transmit)");
static int tcp_do_rfc3390 = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
- &tcp_do_rfc3390, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
+ tcp_do_rfc3390, 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
int tcp_do_ecn = 0;
int tcp_ecn_maxretries = 1;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
-SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
- &tcp_do_ecn, 0, "TCP ECN support");
-SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
- &tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, enable,
+ CTLFLAG_RW, tcp_do_ecn, 0, "TCP ECN support");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_ecn, OID_AUTO, maxretries,
+ CTLFLAG_RW, tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");
static int tcp_insecure_rst = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
- &tcp_insecure_rst, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, insecure_rst,
+ CTLFLAG_RW, tcp_insecure_rst, 0,
"Follow the old (insecure) criteria for accepting RST packets");
int tcp_do_autorcvbuf = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
- &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_auto,
+ CTLFLAG_RW, tcp_do_autorcvbuf, 0,
+ "Enable automatic receive buffer sizing");
int tcp_autorcvbuf_inc = 16*1024;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
- &tcp_autorcvbuf_inc, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_inc,
+ CTLFLAG_RW, tcp_autorcvbuf_inc, 0,
"Incrementor step size of automatic receive buffer");
int tcp_autorcvbuf_max = 256*1024;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
- &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, recvbuf_max,
+ CTLFLAG_RW, tcp_autorcvbuf_max, 0,
+ "Max size of automatic receive buffer");
struct inpcbhead tcb;
#define tcb6 tcb /* for KAME src sync over BSD*'s */
struct inpcbinfo tcbinfo;
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void tcp_do_segment(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, int, int, uint8_t);
static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
struct tcpcb *, int, int);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
static void inline
tcp_congestion_exp(struct tcpcb *);
static void inline
tcp_congestion_exp(struct tcpcb *tp)
{
u_int win;
win = min(tp->snd_wnd, tp->snd_cwnd) /
2 / tp->t_maxseg;
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_maxseg;
ENTER_FASTRECOVERY(tp);
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_ECN_PERMIT)
tp->t_flags |= TF_ECN_SND_CWR;
}
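/*
 * Editor's worked example (not from this commit): with snd_wnd = 64 kB,
 * snd_cwnd = 32 kB and t_maxseg = 1460, tcp_congestion_exp() computes
 * min(64k, 32k) / 2 / 1460 = 11 segments, so snd_ssthresh becomes
 * 11 * 1460 = 16060 bytes; the floor of two segments keeps ssthresh
 * sane after very small windows.
 */
static u_int
halved_ssthresh(u_int snd_wnd, u_int snd_cwnd, u_int t_maxseg)
{
	u_int win;

	win = ((snd_wnd < snd_cwnd ? snd_wnd : snd_cwnd) / 2) / t_maxseg;
	if (win < 2)
		win = 2;
	return (win * t_maxseg);
}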
/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
#endif
/*
* Indicate whether this ack should be delayed. We can delay the ack if
* - there is no delayed ack timer in progress and
* - our last ack wasn't a 0-sized window. We never want to delay
* the ack that opens up a 0-sized window and
* - delayed acks are enabled or
* - this is a half-synchronized T/TCP connection.
*/
#define DELAY_ACK(tp) \
((!tcp_timer_active(tp, TT_DELACK) && \
(tp->t_flags & TF_RXWIN0SENT) == 0) && \
(V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
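/*
 * Editor's restatement (plain C, not in this commit): DELAY_ACK() above
 * delays an ACK only when no delack timer is already pending, the last
 * ACK did not advertise a zero-sized window, and either delayed ACKs
 * are enabled or the connection is half-synchronized (TF_NEEDSYN).
 */
static int
delay_ack(int delack_pending, int last_win_was_zero, int delack_enabled,
    int needsyn)
{
	return (!delack_pending && !last_win_was_zero &&
	    (delack_enabled || needsyn));
}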
/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended
* ip6_protox[] call format in ip6_input
* tcp_input handles primary segment validation, inpcb lookup and
* SYN processing on listen sockets
* tcp_do_segment processes the ACK and text of the segment for
* establishing, established and closing connections
*/
#ifdef INET6
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
struct in6_ifaddr *ia6;
IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);
/*
* draft-itojun-ipv6-tcp-to-anycast
* better place to put this in?
*/
ia6 = ip6_getdstifaddr(m);
if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
(caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
return IPPROTO_DONE;
}
tcp_input(m, *offp);
return IPPROTO_DONE;
}
#endif
void
tcp_input(struct mbuf *m, int off0)
{
+ INIT_VNET_INET(curvnet);
+#ifdef INET6
+ INIT_VNET_INET6(curvnet);
+#endif
+#ifdef IPSEC
+ INIT_VNET_IPSEC(curvnet);
+#endif
struct tcphdr *th;
struct ip *ip = NULL;
struct ipovly *ipov;
struct inpcb *inp = NULL;
struct tcpcb *tp = NULL;
struct socket *so = NULL;
u_char *optp = NULL;
int optlen = 0;
int len, tlen, off;
int drop_hdrlen;
int thflags;
int rstreason = 0; /* For badport_bandlim accounting purposes */
uint8_t iptos;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
#endif
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
#else
const void *ip6 = NULL;
const int isipv6 = 0;
#endif
struct tcpopt to; /* options in this segment */
char *s = NULL; /* address and port logging */
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
* now IPv6.
*/
u_char tcp_saveipgen[IP6_HDR_LEN];
struct tcphdr tcp_savetcp;
short ostate = 0;
#endif
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
to.to_flags = 0;
V_tcpstat.tcps_rcvtotal++;
if (isipv6) {
#ifdef INET6
/* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
ip6 = mtod(m, struct ip6_hdr *);
tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
V_tcpstat.tcps_rcvbadsum++;
goto drop;
}
th = (struct tcphdr *)((caddr_t)ip6 + off0);
/*
* Be proactive about unspecified IPv6 address in source.
* As we use all-zero to indicate unbounded/unconnected pcb,
* unspecified IPv6 address can be used to confuse us.
*
* Note that packets with an unspecified IPv6 destination are
* already dropped in ip6_input.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
/* XXX stat */
goto drop;
}
#else
th = NULL; /* XXX: Avoid compiler warning. */
#endif
} else {
/*
* Get IP and TCP header together in first mbuf.
* Note: IP leaves IP header in first mbuf.
*/
if (off0 > sizeof (struct ip)) {
ip_stripoptions(m, (struct mbuf *)0);
off0 = sizeof(struct ip);
}
if (m->m_len < sizeof (struct tcpiphdr)) {
if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
== NULL) {
V_tcpstat.tcps_rcvshort++;
return;
}
}
ip = mtod(m, struct ip *);
ipov = (struct ipovly *)ip;
th = (struct tcphdr *)((caddr_t)ip + off0);
tlen = ip->ip_len;
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
else
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr,
htonl(m->m_pkthdr.csum_data +
ip->ip_len +
IPPROTO_TCP));
th->th_sum ^= 0xffff;
#ifdef TCPDEBUG
ipov->ih_len = (u_short)tlen;
ipov->ih_len = htons(ipov->ih_len);
#endif
} else {
/*
* Checksum extended TCP header and data.
*/
len = sizeof (struct ip) + tlen;
bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
ipov->ih_len = (u_short)tlen;
ipov->ih_len = htons(ipov->ih_len);
th->th_sum = in_cksum(m, len);
}
if (th->th_sum) {
V_tcpstat.tcps_rcvbadsum++;
goto drop;
}
/* Re-initialization for later version check */
ip->ip_v = IPVERSION;
}
#ifdef INET6
if (isipv6)
iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
else
#endif
iptos = ip->ip_tos;
/*
* Check that TCP offset makes sense,
* pull out TCP options and adjust length. XXX
*/
off = th->th_off << 2;
if (off < sizeof (struct tcphdr) || off > tlen) {
V_tcpstat.tcps_rcvbadoff++;
goto drop;
}
tlen -= off; /* tlen is used instead of ti->ti_len */
if (off > sizeof (struct tcphdr)) {
if (isipv6) {
#ifdef INET6
IP6_EXTHDR_CHECK(m, off0, off, );
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)((caddr_t)ip6 + off0);
#endif
} else {
if (m->m_len < sizeof(struct ip) + off) {
if ((m = m_pullup(m, sizeof (struct ip) + off))
== NULL) {
V_tcpstat.tcps_rcvshort++;
return;
}
ip = mtod(m, struct ip *);
ipov = (struct ipovly *)ip;
th = (struct tcphdr *)((caddr_t)ip + off0);
}
}
optlen = off - sizeof (struct tcphdr);
optp = (u_char *)(th + 1);
}
thflags = th->th_flags;
/*
* Convert TCP protocol specific fields to host format.
*/
th->th_seq = ntohl(th->th_seq);
th->th_ack = ntohl(th->th_ack);
th->th_win = ntohs(th->th_win);
th->th_urp = ntohs(th->th_urp);
/*
* Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
*/
drop_hdrlen = off0 + off;
/*
* Locate pcb for segment.
*/
INP_INFO_WLOCK(&V_tcbinfo);
findpcb:
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef IPFIREWALL_FORWARD
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */
struct sockaddr_in *next_hop;
next_hop = (struct sockaddr_in *)(fwd_tag+1);
/*
* Transparently forwarded. Pretend to be the destination.
* already got one like this?
*/
inp = in_pcblookup_hash(&V_tcbinfo,
ip->ip_src, th->th_sport,
ip->ip_dst, th->th_dport,
0, m->m_pkthdr.rcvif);
if (!inp) {
/* It's new. Try to find the ambushing socket. */
inp = in_pcblookup_hash(&V_tcbinfo,
ip->ip_src, th->th_sport,
next_hop->sin_addr,
next_hop->sin_port ?
ntohs(next_hop->sin_port) :
th->th_dport,
INPLOOKUP_WILDCARD,
m->m_pkthdr.rcvif);
}
/* Remove the tag from the packet. We don't need it anymore. */
m_tag_delete(m, fwd_tag);
} else
#endif /* IPFIREWALL_FORWARD */
{
if (isipv6) {
#ifdef INET6
inp = in6_pcblookup_hash(&V_tcbinfo,
&ip6->ip6_src, th->th_sport,
&ip6->ip6_dst, th->th_dport,
INPLOOKUP_WILDCARD,
m->m_pkthdr.rcvif);
#endif
} else
inp = in_pcblookup_hash(&V_tcbinfo,
ip->ip_src, th->th_sport,
ip->ip_dst, th->th_dport,
INPLOOKUP_WILDCARD,
m->m_pkthdr.rcvif);
}
/*
* If the INPCB does not exist then all data in the incoming
* segment is discarded and an appropriate RST is sent back.
* XXX MRT Send RST using which routing table?
*/
if (inp == NULL) {
/*
* Log communication attempts to ports that are not
* in use.
*/
if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
tcp_log_in_vain == 2) {
if ((s = tcp_log_addrs(NULL, th, (void *)ip, ip6)))
log(LOG_INFO, "%s; %s: Connection attempt "
"to closed port\n", s, __func__);
}
/*
* When blackholing do not respond with a RST but
* completely ignore the segment and drop it.
*/
if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
V_blackhole == 2)
goto dropunlock;
rstreason = BANDLIM_RST_CLOSEDPORT;
goto dropwithreset;
}
INP_WLOCK(inp);
#ifdef IPSEC
#ifdef INET6
if (isipv6 && ipsec6_in_reject(m, inp)) {
V_ipsec6stat.in_polvio++;
goto dropunlock;
} else
#endif /* INET6 */
if (ipsec4_in_reject(m, inp) != 0) {
V_ipsec4stat.in_polvio++;
goto dropunlock;
}
#endif /* IPSEC */
/*
* Check the minimum TTL for socket.
*/
if (inp->inp_ip_minttl != 0) {
#ifdef INET6
if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
goto dropunlock;
else
#endif
if (inp->inp_ip_minttl > ip->ip_ttl)
goto dropunlock;
}
/*
* A previous connection in TIMEWAIT state is supposed to catch
* stray or duplicate segments arriving late. If this segment
* was a legitimate new connection attempt the old INPCB gets
* removed and we can try again to find a listening socket.
*/
if (inp->inp_vflag & INP_TIMEWAIT) {
if (thflags & TH_SYN)
tcp_dooptions(&to, optp, optlen, TO_SYN);
/*
* NB: tcp_twcheck unlocks the INP and frees the mbuf.
*/
if (tcp_twcheck(inp, &to, th, m, tlen))
goto findpcb;
INP_INFO_WUNLOCK(&V_tcbinfo);
return;
}
/*
* The TCPCB may no longer exist if the connection is winding
* down or it is in the CLOSED state. Either way we drop the
* segment and send an appropriate response.
*/
tp = intotcpcb(inp);
if (tp == NULL || tp->t_state == TCPS_CLOSED) {
rstreason = BANDLIM_RST_CLOSEDPORT;
goto dropwithreset;
}
#ifdef MAC
INP_WLOCK_ASSERT(inp);
if (mac_inpcb_check_deliver(inp, m))
goto dropunlock;
#endif
so = inp->inp_socket;
KASSERT(so != NULL, ("%s: so == NULL", __func__));
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG) {
ostate = tp->t_state;
if (isipv6) {
#ifdef INET6
bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
#endif
} else
bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
tcp_savetcp = *th;
}
#endif
/*
* When the socket is accepting connections (the INPCB is in LISTEN
* state) we look into the SYN cache if this is a new connection
* attempt or the completion of a previous one.
*/
if (so->so_options & SO_ACCEPTCONN) {
struct in_conninfo inc;
KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
"tp not listening", __func__));
bzero(&inc, sizeof(inc));
inc.inc_isipv6 = isipv6;
#ifdef INET6
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
} else
#endif
{
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
/*
* Check for an existing connection attempt in syncache if
* the flag is only ACK. A successful lookup creates a new
* socket appended to the listen queue in SYN_RECEIVED state.
*/
if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
/*
* Parse the TCP options here because
* syncookies need access to the reflected
* timestamp.
*/
tcp_dooptions(&to, optp, optlen, 0);
/*
* NB: syncache_expand() doesn't unlock
* inp and tcpinfo locks.
*/
if (!syncache_expand(&inc, &to, th, &so, m)) {
/*
* No syncache entry or ACK was not
* for our SYN/ACK. Send a RST.
* NB: syncache did its own logging
* of the failure cause.
*/
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
if (so == NULL) {
/*
* We completed the 3-way handshake
* but could not allocate a socket
* either due to memory shortage,
* listen queue length limits or
* global socket limits. Send RST
* or wait and have the remote end
* retransmit the ACK for another
* try.
*/
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Socket allocation failed due to "
"limits or memory shortage, %s\n",
s, __func__,
V_tcp_sc_rst_sock_fail ?
"sending RST" : "try again");
if (V_tcp_sc_rst_sock_fail) {
rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
} else
goto dropunlock;
}
/*
* Socket is created in state SYN_RECEIVED.
* Unlock the listen socket, lock the newly
* created socket and update the tp variable.
*/
INP_WUNLOCK(inp); /* listen socket */
inp = sotoinpcb(so);
INP_WLOCK(inp); /* new connection */
tp = intotcpcb(inp);
KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
("%s: ", __func__));
/*
* Process the segment and the data it
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
iptos);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return;
}
/*
* Segment flag validation for new connection attempts:
*
* Our (SYN|ACK) response was rejected.
* Check with syncache and remove entry to prevent
* retransmits.
*
* NB: syncache_chkrst does its own logging of failure
* causes.
*/
if (thflags & TH_RST) {
syncache_chkrst(&inc, th);
goto dropunlock;
}
/*
* We can't do anything without SYN.
*/
if ((thflags & TH_SYN) == 0) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"SYN is missing, segment ignored\n",
s, __func__);
V_tcpstat.tcps_badsyn++;
goto dropunlock;
}
/*
* (SYN|ACK) is bogus on a listen socket.
*/
if (thflags & TH_ACK) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"SYN|ACK invalid, segment rejected\n",
s, __func__);
syncache_badack(&inc); /* XXX: Not needed! */
V_tcpstat.tcps_badsyn++;
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
/*
* If the drop_synfin option is enabled, drop all
* segments with both the SYN and FIN bits set.
* This prevents e.g. nmap from identifying the
* TCP/IP stack.
* XXX: Poor reasoning. nmap has other methods
* and is constantly refining its stack detection
* strategies.
* XXX: This is a violation of the TCP specification
* and was used by RFC1644.
*/
if ((thflags & TH_FIN) && V_drop_synfin) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"SYN|FIN segment ignored (based on "
"sysctl setting)\n", s, __func__);
V_tcpstat.tcps_badsyn++;
goto dropunlock;
}
/*
* Segment's flags are (SYN) or (SYN|FIN).
*
* TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
* as they do not affect the state of the TCP FSM.
* The data pointed to by TH_URG and th_urp is ignored.
*/
KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
("%s: Listen socket: TH_RST or TH_ACK set", __func__));
KASSERT(thflags & (TH_SYN),
("%s: Listen socket: TH_SYN not set", __func__));
#ifdef INET6
/*
* If deprecated address is forbidden,
* we do not accept SYN to deprecated interface
* address to prevent any new inbound connection from
* getting established.
* When we do not accept SYN, we send a TCP RST,
* with deprecated source address (instead of dropping
* it). This is a compromise: it is much better for the peer
* to receive a RST, and the RST will be the final packet
* of the exchange.
*
* If we do not forbid deprecated addresses, we accept
* the SYN packet. RFC2462 does not suggest dropping
* SYN in this case.
* Reading RFC2462 5.5.4 closely, it says the following:
* 1. use of deprecated addr with existing
* communication is okay - "SHOULD continue to be
* used"
* 2. use of it with new communication:
* (2a) "SHOULD NOT be used if alternate address
* with sufficient scope is available"
* (2b) nothing mentioned otherwise.
* Here we fall into (2b) case as we have no choice in
* our source address selection - we must obey the peer.
*
* The wording in RFC2462 is confusing, and there are
* multiple descriptions of deprecated address
* handling; worse, they are not exactly the same.
* I believe 5.5.4 is the best one, so we follow 5.5.4.
*/
if (isipv6 && !V_ip6_use_deprecated) {
struct in6_ifaddr *ia6;
if ((ia6 = ip6_getdstifaddr(m)) &&
(ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Connection attempt to deprecated "
"IPv6 address rejected\n",
s, __func__);
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
}
#endif
/*
* Basic sanity checks on incoming SYN requests:
* Don't respond if the destination is a link layer
* broadcast according to RFC1122 4.2.3.10, p. 104.
* If it is from this socket it must be forged.
* Don't respond if the source or destination is a
* global or subnet broad- or multicast address.
* Note that it is quite possible to receive unicast
* link-layer packets with a broadcast IP address. Use
* in_broadcast() to find them.
*/
if (m->m_flags & (M_BCAST|M_MCAST)) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Connection attempt from broad- or multicast "
"link layer address ignored\n", s, __func__);
goto dropunlock;
}
if (isipv6) {
#ifdef INET6
if (th->th_dport == th->th_sport &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Connection attempt to/from self "
"ignored\n", s, __func__);
goto dropunlock;
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Connection attempt from/to multicast "
"address ignored\n", s, __func__);
goto dropunlock;
}
#endif
} else {
if (th->th_dport == th->th_sport &&
ip->ip_dst.s_addr == ip->ip_src.s_addr) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Connection attempt from/to self "
"ignored\n", s, __func__);
goto dropunlock;
}
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Listen socket: "
"Connection attempt from/to broad- "
"or multicast address ignored\n",
s, __func__);
goto dropunlock;
}
}
/*
* SYN appears to be valid. Create compressed TCP state
* for syncache.
*/
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
tcp_dooptions(&to, optp, optlen, TO_SYN);
syncache_add(&inc, &to, th, inp, &so, m);
/*
* Entry added to syncache and mbuf consumed.
* Everything already unlocked by syncache_add().
*/
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return;
}
/*
* Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
return;
dropwithreset:
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
/*
* If inp is non-NULL, we call tcp_dropwithreset() holding both inpcb
* and global locks. However, if NULL, we must hold neither as
* firewalls may acquire the global lock in order to look for a
* matching inpcb.
*/
if (inp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
INP_WUNLOCK(inp);
}
INP_INFO_WUNLOCK(&V_tcbinfo);
if (inp == NULL)
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
m = NULL; /* mbuf chain got consumed. */
goto drop;
dropunlock:
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
if (inp != NULL)
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
drop:
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
if (s != NULL)
free(s, M_TCPLOG);
if (m != NULL)
m_freem(m);
return;
}
static void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
+ INIT_VNET_INET(tp->t_vnet);
int thflags, acked, ourfinisacked, needoutput = 0;
int headlocked = 1;
int rstreason, todrop, win;
u_long tiwin;
struct tcpopt to;
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
* now IPv6.
*/
u_char tcp_saveipgen[IP6_HDR_LEN];
struct tcphdr tcp_savetcp;
short ostate = 0;
#endif
thflags = th->th_flags;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
__func__));
KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
__func__));
/*
* Segment received on connection.
* Reset idle time and keep-alive timer.
* XXX: This should be done after segment
* validation to ignore broken/spoofed segs.
*/
tp->t_rcvtime = ticks;
if (TCPS_HAVEESTABLISHED(tp->t_state))
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
/*
* Unscale the window into a 32-bit value.
* For the SYN_SENT state the scale is zero.
*/
tiwin = th->th_win << tp->snd_scale;
/*
* TCP ECN processing.
*/
if (tp->t_flags & TF_ECN_PERMIT) {
switch (iptos & IPTOS_ECN_MASK) {
case IPTOS_ECN_CE:
tp->t_flags |= TF_ECN_SND_ECE;
V_tcpstat.tcps_ecn_ce++;
break;
case IPTOS_ECN_ECT0:
V_tcpstat.tcps_ecn_ect0++;
break;
case IPTOS_ECN_ECT1:
V_tcpstat.tcps_ecn_ect1++;
break;
}
if (thflags & TH_CWR)
tp->t_flags &= ~TF_ECN_SND_ECE;
/*
* Congestion experienced.
* Ignore if we are already trying to recover.
*/
if ((thflags & TH_ECE) &&
SEQ_LEQ(th->th_ack, tp->snd_recover)) {
V_tcpstat.tcps_ecn_rcwnd++;
tcp_congestion_exp(tp);
}
}
/*
* Parse options on any incoming segment.
*/
tcp_dooptions(&to, (u_char *)(th + 1),
(th->th_off << 2) - sizeof(struct tcphdr),
(thflags & TH_SYN) ? TO_SYN : 0);
/*
* If echoed timestamp is later than the current time,
* fall back to non RFC1323 RTT calculation. Normalize
* timestamp if syncookies were used when this connection
* was established.
*/
if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
to.to_tsecr -= tp->ts_offset;
if (TSTMP_GT(to.to_tsecr, ticks))
to.to_tsecr = 0;
}
/*
* Process options only when we get SYN/ACK back. The SYN case
* for incoming connections is handled in tcp_syncache.
* According to RFC1323 the window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
* XXX this is traditional behavior, may need to be cleaned up.
*/
if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
if ((to.to_flags & TOF_SCALE) &&
(tp->t_flags & TF_REQ_SCALE)) {
tp->t_flags |= TF_RCVD_SCALE;
tp->snd_scale = to.to_wscale;
}
/*
* Initial send window. It will be updated with
* the next incoming segment to the scaled value.
*/
tp->snd_wnd = th->th_win;
if (to.to_flags & TOF_TS) {
tp->t_flags |= TF_RCVD_TSTMP;
tp->ts_recent = to.to_tsval;
tp->ts_recent_age = ticks;
}
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
if ((tp->t_flags & TF_SACK_PERMIT) &&
(to.to_flags & TOF_SACKPERM) == 0)
tp->t_flags &= ~TF_SACK_PERMIT;
}
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
* no control flags, is in-sequence, the window didn't
* change and we're not retransmitting, it's a
* candidate. If the length is zero and the ack moved
* forward, we're the sender side of the xfer. Just
* free the data acked & wake any higher level process
* that was blocked waiting for space. If the length
* is non-zero and the ack didn't move, we're the
* receiver side. If we're getting packets in-order
* (the reassembly queue is empty), add the data to
* the socket buffer and note that we need a delayed ack.
* Make sure that the hidden state-flags are also off.
* Since we check for TCPS_ESTABLISHED first, it can only
* be TH_NEEDSYN.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
th->th_seq == tp->rcv_nxt &&
(thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
tp->snd_nxt == tp->snd_max &&
tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
LIST_EMPTY(&tp->t_segq) &&
((to.to_flags & TOF_TS) == 0 ||
TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
/*
* If last ACK falls within this segment's sequence numbers,
* record the timestamp.
* NOTE that the test is modified according to the latest
* proposal of the tcplw@cray.com list (Braden 1993/04/26).
*/
if ((to.to_flags & TOF_TS) != 0 &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
tp->ts_recent_age = ticks;
tp->ts_recent = to.to_tsval;
}
if (tlen == 0) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
((!V_tcp_do_newreno &&
!(tp->t_flags & TF_SACK_PERMIT) &&
tp->t_dupacks < tcprexmtthresh) ||
((V_tcp_do_newreno ||
(tp->t_flags & TF_SACK_PERMIT)) &&
!IN_FASTRECOVERY(tp) &&
(to.to_flags & TOF_SACK) == 0 &&
TAILQ_EMPTY(&tp->snd_holes)))) {
KASSERT(headlocked,
("%s: headlocked", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
/*
* This is a pure ack for outstanding data.
*/
++V_tcpstat.tcps_predack;
/*
* "bad retransmit" recovery.
*/
if (tp->t_rxtshift == 1 &&
ticks < tp->t_badrxtwin) {
++V_tcpstat.tcps_sndrexmitbad;
tp->snd_cwnd = tp->snd_cwnd_prev;
tp->snd_ssthresh =
tp->snd_ssthresh_prev;
tp->snd_recover = tp->snd_recover_prev;
if (tp->t_flags & TF_WASFRECOVERY)
ENTER_FASTRECOVERY(tp);
tp->snd_nxt = tp->snd_max;
tp->t_badrxtwin = 0;
}
/*
* Recalculate the transmit timer / rtt.
*
* Some boxes send broken timestamp replies
* during the SYN+ACK phase, ignore
* timestamps of 0 or we could calculate a
* huge RTT and blow up the retransmit timer.
*/
if ((to.to_flags & TOF_TS) != 0 &&
to.to_tsecr) {
if (!tp->t_rttlow ||
tp->t_rttlow > ticks - to.to_tsecr)
tp->t_rttlow = ticks - to.to_tsecr;
tcp_xmit_timer(tp,
ticks - to.to_tsecr + 1);
} else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq)) {
if (!tp->t_rttlow ||
tp->t_rttlow > ticks - tp->t_rtttime)
tp->t_rttlow = ticks - tp->t_rtttime;
tcp_xmit_timer(tp,
ticks - tp->t_rtttime);
}
tcp_xmit_bandwidth_limit(tp, th->th_ack);
acked = th->th_ack - tp->snd_una;
V_tcpstat.tcps_rcvackpack++;
V_tcpstat.tcps_rcvackbyte += acked;
sbdrop(&so->so_snd, acked);
if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
tp->snd_una = th->th_ack;
/*
* Pull snd_wl2 up to prevent seq wrap relative
* to th_ack.
*/
tp->snd_wl2 = th->th_ack;
tp->t_dupacks = 0;
m_freem(m);
ND6_HINT(tp); /* Some progress has been made. */
/*
* If all outstanding data are acked, stop
* retransmit timer, otherwise restart timer
* using current (possibly backed-off) value.
* If process is waiting for space,
* wakeup/selwakeup/signal. If data
* are ready to send, let tcp_output
* decide between more output or persist.
*/
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
if (tp->snd_una == tp->snd_max)
tcp_timer_activate(tp, TT_REXMT, 0);
else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT,
tp->t_rxtcur);
sowwakeup(so);
if (so->so_snd.sb_cc)
(void) tcp_output(tp);
goto check_delack;
}
} else if (th->th_ack == tp->snd_una &&
tlen <= sbspace(&so->so_rcv)) {
int newsize = 0; /* automatic sockbuf scaling */
KASSERT(headlocked, ("%s: headlocked", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
/*
* This is a pure, in-sequence data packet
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
/* Clean receiver SACK report if present */
if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
tcp_clean_sackreport(tp);
++V_tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
/*
* Pull snd_wl1 up to prevent seq wrap relative to
* th_seq.
*/
tp->snd_wl1 = th->th_seq;
/*
* Pull rcv_up up to prevent seq wrap relative to
* rcv_nxt.
*/
tp->rcv_up = tp->rcv_nxt;
V_tcpstat.tcps_rcvpack++;
V_tcpstat.tcps_rcvbyte += tlen;
ND6_HINT(tp); /* Some progress has been made */
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
/*
* Automatic sizing of receive socket buffer. Often the send
* buffer size is not optimally adjusted to the actual network
* conditions at hand (delay bandwidth product). Setting the
* buffer size too small limits throughput on links with high
* bandwidth and high delay (e.g., transcontinental/oceanic links).
*
* On the receive side the socket buffer memory is only rarely
* used to any significant extent. This allows us to be much
* more aggressive in scaling the receive socket buffer. For
* the case that the buffer space is actually used to a large
* extent and we run out of kernel memory we can simply drop
* the new segments; TCP on the sender will just retransmit them
* later. Setting the buffer size too big may only consume too
* much kernel memory if the application doesn't read() from
* the socket or packet loss or reordering makes use of the
* reassembly queue.
*
* The criteria to step up the receive buffer one notch are:
* 1. the number of bytes received during the time it takes
* one timestamp to be reflected back to us (the RTT);
* 2. received bytes per RTT is within seven eighths of the
* current socket buffer size;
* 3. receive buffer size has not hit maximal automatic size;
*
* This algorithm does one step per RTT at most and only if
* we receive a bulk stream w/o packet losses or reorderings.
* Shrinking the buffer during idle times is not necessary as
* it doesn't consume any memory when idle.
*
* TODO: Only step up if the application is actually serving
* the buffer to better manage the socket buffer resources.
*/
if (V_tcp_do_autorcvbuf &&
to.to_tsecr &&
(so->so_rcv.sb_flags & SB_AUTOSIZE)) {
if (to.to_tsecr > tp->rfbuf_ts &&
to.to_tsecr - tp->rfbuf_ts < hz) {
if (tp->rfbuf_cnt >
(so->so_rcv.sb_hiwat / 8 * 7) &&
so->so_rcv.sb_hiwat <
V_tcp_autorcvbuf_max) {
newsize =
min(so->so_rcv.sb_hiwat +
V_tcp_autorcvbuf_inc,
V_tcp_autorcvbuf_max);
}
/* Start over with next RTT. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
} else
tp->rfbuf_cnt += tlen; /* add up */
}
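		/*
		 * Editor's worked example (hypothetical numbers, not part
		 * of this commit): with sb_hiwat = 64 kB the step-up
		 * threshold above is 64k / 8 * 7 = 57344 bytes per RTT;
		 * with recvbuf_inc = 16 kB and recvbuf_max = 256 kB the
		 * buffer can grow 64k -> 80k -> ... -> 256k over at most
		 * (256k - 64k) / 16k = 12 bulk-receive RTTs.
		 */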
/* Add data to socket buffer. */
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
m_freem(m);
} else {
/*
* Set new socket buffer size.
* Give up when limit is reached.
*/
if (newsize)
if (!sbreserve_locked(&so->so_rcv,
newsize, so, curthread))
so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
m_adj(m, drop_hdrlen); /* delayed header drop */
sbappendstream_locked(&so->so_rcv, m);
}
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
if (DELAY_ACK(tp)) {
tp->t_flags |= TF_DELACK;
} else {
tp->t_flags |= TF_ACKNOW;
tcp_output(tp);
}
goto check_delack;
}
}
/*
* Calculate amount of space in receive window,
* and then do TCP input processing.
* Receive window is amount of space in rcv queue,
* but not less than advertised window.
*/
win = sbspace(&so->so_rcv);
if (win < 0)
win = 0;
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
/* Reset receive buffer auto scaling when not in bulk receive mode. */
tp->rfbuf_ts = 0;
tp->rfbuf_cnt = 0;
switch (tp->t_state) {
/*
* If the state is SYN_RECEIVED:
* if seg contains an ACK, but not for our SYN/ACK, send a RST.
*/
case TCPS_SYN_RECEIVED:
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->snd_una) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
break;
/*
* If the state is SYN_SENT:
* if seg contains an ACK, but not for our SYN, drop the input.
* if seg contains a RST, then drop the connection.
* if seg does not contain SYN, then drop it.
* Otherwise this is an acceptable SYN segment
* initialize tp->rcv_nxt and tp->irs
* if seg contains ack then advance tp->snd_una
* if seg contains an ECE and ECN support is enabled, the stream
* is ECN capable.
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
* arrange for segment to be acked (eventually)
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
SEQ_GT(th->th_ack, tp->snd_max))) {
rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
}
if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
tp = tcp_drop(tp, ECONNREFUSED);
if (thflags & TH_RST)
goto drop;
if (!(thflags & TH_SYN))
goto drop;
tp->irs = th->th_seq;
tcp_rcvseqinit(tp);
if (thflags & TH_ACK) {
V_tcpstat.tcps_connects++;
soisconnected(so);
#ifdef MAC
SOCK_LOCK(so);
mac_socketpeer_set_from_mbuf(m, so);
SOCK_UNLOCK(so);
#endif
/* Do window scaling on this connection? */
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
}
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
/*
* If there's data, delay ACK; if there's also a FIN
* ACKNOW will be turned on later.
*/
if (DELAY_ACK(tp) && tlen != 0)
tcp_timer_activate(tp, TT_DELACK,
tcp_delacktime);
else
tp->t_flags |= TF_ACKNOW;
if ((thflags & TH_ECE) && V_tcp_do_ecn) {
tp->t_flags |= TF_ECN_PERMIT;
V_tcpstat.tcps_ecn_shs++;
}
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
* SYN_SENT --> ESTABLISHED
* SYN_SENT* --> FIN_WAIT_1
*/
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
tp->t_flags &= ~TF_NEEDFIN;
thflags &= ~TH_SYN;
} else {
tp->t_state = TCPS_ESTABLISHED;
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
}
} else {
/*
* Received initial SYN in SYN-SENT[*] state =>
* simultaneous open. If segment contains CC option
* and there is a cached CC, apply TAO test.
* If it succeeds, the connection is half-synchronized.
* Otherwise, do 3-way handshake:
* SYN-SENT -> SYN-RECEIVED
* SYN-SENT* -> SYN-RECEIVED*
* If there was no CC option, clear cached CC value.
*/
tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_state = TCPS_SYN_RECEIVED;
}
KASSERT(headlocked, ("%s: trimthenstep6: head not locked",
__func__));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* Advance th->th_seq to correspond to first data byte.
* If data, trim to stay within window,
* dropping FIN if necessary.
*/
th->th_seq++;
if (tlen > tp->rcv_wnd) {
todrop = tlen - tp->rcv_wnd;
m_adj(m, -todrop);
tlen = tp->rcv_wnd;
thflags &= ~TH_FIN;
V_tcpstat.tcps_rcvpackafterwin++;
V_tcpstat.tcps_rcvbyteafterwin += todrop;
}
tp->snd_wl1 = th->th_seq - 1;
tp->rcv_up = th->th_seq;
/*
* Client side of transaction: already sent SYN and data.
* If the remote host used T/TCP to validate the SYN,
* our data will be ACK'd; if so, enter normal data segment
* processing in the middle of step 5, ack processing.
* Otherwise, goto step 6.
*/
if (thflags & TH_ACK)
goto process_ACK;
goto step6;
/*
* If the state is LAST_ACK or CLOSING or TIME_WAIT:
* do normal processing.
*
* NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
*/
case TCPS_LAST_ACK:
case TCPS_CLOSING:
break; /* continue normal processing */
}
/*
* States other than LISTEN or SYN_SENT.
* First check the RST flag and sequence number since reset segments
* are exempt from the timestamp and connection count tests. This
* fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
* below which allowed reset segments in half the sequence space
* to fall through and be processed (which gives forged reset
* segments with a random sequence number a 50 percent chance of
* killing a connection).
* Then check timestamp, if present.
* Then check the connection count, if present.
* Then check that at least some bytes of segment are within
* receive window. If segment begins before rcv_nxt,
* drop leading data (and SYN); if nothing left, just ack.
*
*
* If the RST bit is set, check the sequence number to see
* if this is a valid reset segment.
* RFC 793 page 37:
* In all states except SYN-SENT, all reset (RST) segments
* are validated by checking their SEQ-fields. A reset is
* valid if its sequence number is in the window.
* Note: this does not take into account delayed ACKs, so
* we should test against last_ack_sent instead of rcv_nxt.
* The sequence number in the reset segment is normally an
* echo of our outgoing acknowledgement numbers, but some hosts
* send a reset with the sequence number at the rightmost edge
* of our receive window, and we have to handle this case.
* Note 2: Paul Watson's paper "Slipping in the Window" has shown
* that brute force RST attacks are possible. To combat this,
* we use a much stricter check while in the ESTABLISHED state,
* only accepting RSTs where the sequence number is equal to
* last_ack_sent. In all other states (the states in which a
* RST is more likely), the more permissive check is used.
* If we have multiple segments in flight, the initial reset
* segment sequence numbers will be to the left of last_ack_sent,
* but they will eventually catch up.
* In any case, it never made sense to trim reset segments to
* fit the receive window since RFC 1122 says:
* 4.2.2.12 RST Segment: RFC-793 Section 3.4
*
* A TCP SHOULD allow a received RST segment to include data.
*
* DISCUSSION
* It has been suggested that a RST segment could contain
* ASCII text that encoded and explained the cause of the
* RST. No standard has yet been established for such
* data.
*
* If the reset segment passes the sequence number test examine
* the state:
* SYN_RECEIVED STATE:
* If passive open, return to LISTEN state.
* If active open, inform user that connection was refused.
* ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
* Inform user that connection was reset, and close tcb.
* CLOSING, LAST_ACK STATES:
* Close the tcb.
* TIME_WAIT STATE:
* Drop the segment - see Stevens, vol. 2, p. 964 and
* RFC 1337.
*/
if (thflags & TH_RST) {
if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
so->so_error = ECONNREFUSED;
goto close;
case TCPS_ESTABLISHED:
if (V_tcp_insecure_rst == 0 &&
!(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
!(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
V_tcpstat.tcps_badrst++;
goto drop;
}
/* FALLTHROUGH */
case TCPS_FIN_WAIT_1:
case TCPS_FIN_WAIT_2:
case TCPS_CLOSE_WAIT:
so->so_error = ECONNRESET;
close:
tp->t_state = TCPS_CLOSED;
V_tcpstat.tcps_drops++;
KASSERT(headlocked, ("%s: trimthenstep6: "
"tcp_close: head not locked", __func__));
tp = tcp_close(tp);
break;
case TCPS_CLOSING:
case TCPS_LAST_ACK:
KASSERT(headlocked, ("%s: trimthenstep6: "
"tcp_close.2: head not locked", __func__));
tp = tcp_close(tp);
break;
}
}
goto drop;
}
/*
* RFC 1323 PAWS: If we have a timestamp reply on this segment
* and it's less than ts_recent, drop it.
*/
if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to.to_tsval, tp->ts_recent)) {
/* Check to see if ts_recent is over 24 days old. */
if ((int)(ticks - tp->ts_recent_age) > TCP_PAWS_IDLE) {
/*
* Invalidate ts_recent. If this segment updates
* ts_recent, the age will be reset later and ts_recent
* will get a valid value. If it does not, setting
* ts_recent to zero will at least satisfy the
* requirement that zero be placed in the timestamp
* echo reply when ts_recent isn't valid. The
* age isn't reset until we get a valid ts_recent
* because we don't want out-of-order segments to be
* dropped when ts_recent is old.
*/
tp->ts_recent = 0;
} else {
V_tcpstat.tcps_rcvduppack++;
V_tcpstat.tcps_rcvdupbyte += tlen;
V_tcpstat.tcps_pawsdrop++;
if (tlen)
goto dropafterack;
goto drop;
}
}
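	/*
	 * Editor's note (not part of this commit): TSTMP_LT() is a
	 * wrap-safe modular comparison, e.g. with 32-bit timestamps
	 * (int32_t)(0x00000005 - 0xfffffff0) is positive, so a tsval
	 * that recently wrapped past ts_recent is still seen as newer
	 * and is not PAWS-dropped.
	 */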
/*
* In the SYN-RECEIVED state, validate that the packet belongs to
* this connection before trimming the data to fit the receive
* window. Check the sequence number versus IRS since we know
* the sequence numbers haven't wrapped. This is a partial fix
* for the "LAND" DoS attack.
*/
if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
todrop = tp->rcv_nxt - th->th_seq;
if (todrop > 0) {
if (thflags & TH_SYN) {
thflags &= ~TH_SYN;
th->th_seq++;
if (th->th_urp > 1)
th->th_urp--;
else
thflags &= ~TH_URG;
todrop--;
}
/*
* Following if statement from Stevens, vol. 2, p. 960.
*/
if (todrop > tlen
|| (todrop == tlen && (thflags & TH_FIN) == 0)) {
/*
* Any valid FIN must be to the left of the window.
* At this point the FIN must be a duplicate or out
* of sequence; drop it.
*/
thflags &= ~TH_FIN;
/*
* Send an ACK to resynchronize and drop any data.
* But keep on processing for RST or ACK.
*/
tp->t_flags |= TF_ACKNOW;
todrop = tlen;
V_tcpstat.tcps_rcvduppack++;
V_tcpstat.tcps_rcvdupbyte += todrop;
} else {
V_tcpstat.tcps_rcvpartduppack++;
V_tcpstat.tcps_rcvpartdupbyte += todrop;
}
drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
tlen -= todrop;
if (th->th_urp > todrop)
th->th_urp -= todrop;
else {
thflags &= ~TH_URG;
th->th_urp = 0;
}
}
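/*
 * Trimming example (illustrative numbers): with rcv_nxt = 2000,
 * th_seq = 1500 and tlen = 1000, todrop = 500. The first 500 bytes
 * are a partial duplicate, so drop_hdrlen grows by 500, th_seq
 * advances to 2000 and tlen shrinks to 500. Had todrop covered the
 * whole segment, it would be counted as a complete duplicate and
 * only an ACK would be scheduled.
 */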
/*
* If new data are received on a connection after the
* user processes are gone, then RST the other end.
*/
if ((so->so_state & SS_NOFDREF) &&
tp->t_state > TCPS_CLOSE_WAIT && tlen) {
char *s;
KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head "
"not locked", __func__));
if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
"was closed, sending RST and removing tcpcb\n",
s, __func__, tcpstates[tp->t_state], tlen);
free(s, M_TCPLOG);
}
tp = tcp_close(tp);
V_tcpstat.tcps_rcvafterclose++;
rstreason = BANDLIM_UNLIMITED;
goto dropwithreset;
}
/*
* If segment ends after window, drop trailing data
* (and PUSH and FIN); if nothing left, just ACK.
*/
todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
if (todrop > 0) {
V_tcpstat.tcps_rcvpackafterwin++;
if (todrop >= tlen) {
V_tcpstat.tcps_rcvbyteafterwin += tlen;
/*
* If window is closed can only take segments at
* window edge, and have to drop data and PUSH from
* incoming segments. Continue processing, but
* remember to ack. Otherwise, drop segment
* and ack.
*/
if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
tp->t_flags |= TF_ACKNOW;
V_tcpstat.tcps_rcvwinprobe++;
} else
goto dropafterack;
} else
V_tcpstat.tcps_rcvbyteafterwin += todrop;
m_adj(m, -todrop);
tlen -= todrop;
thflags &= ~(TH_PUSH|TH_FIN);
}
/*
* If last ACK falls within this segment's sequence numbers,
* record its timestamp.
* NOTE:
* 1) That the test incorporates suggestions from the latest
* proposal of the tcplw@cray.com list (Braden 1993/04/26).
* 2) That updating only on newer timestamps interferes with
* our earlier PAWS tests, so this check should be solely
* predicated on the sequence space of this segment.
* 3) That we modify the segment boundary check to be
* Last.ACK.Sent <= SEG.SEQ + SEG.Len
* instead of RFC1323's
* Last.ACK.Sent < SEG.SEQ + SEG.Len,
* This modified check allows us to overcome RFC1323's
* limitations as described in Stevens TCP/IP Illustrated
* Vol. 2 p.869. In such cases, we can still calculate the
* RTT correctly when RCV.NXT == Last.ACK.Sent.
*/
if ((to.to_flags & TOF_TS) != 0 &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN|TH_FIN)) != 0))) {
tp->ts_recent_age = ticks;
tp->ts_recent = to.to_tsval;
}
/*
* If a SYN is in the window, then this is an
* error and we send an RST and drop the connection.
*/
if (thflags & TH_SYN) {
KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: "
"head not locked", __func__));
tp = tcp_drop(tp, ECONNRESET);
rstreason = BANDLIM_UNLIMITED;
goto drop;
}
/*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
* flag is on (half-synchronized state), then queue data for
* later processing; else drop segment and return.
*/
if ((thflags & TH_ACK) == 0) {
if (tp->t_state == TCPS_SYN_RECEIVED ||
(tp->t_flags & TF_NEEDSYN))
goto step6;
else if (tp->t_flags & TF_ACKNOW)
goto dropafterack;
else
goto drop;
}
/*
* Ack processing.
*/
switch (tp->t_state) {
/*
* In SYN_RECEIVED state, the ack ACKs our SYN, so enter
* ESTABLISHED state and continue processing.
* The ACK was checked above.
*/
case TCPS_SYN_RECEIVED:
V_tcpstat.tcps_connects++;
soisconnected(so);
/* Do window scaling? */
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
tp->snd_wnd = tiwin;
}
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
* SYN-RECEIVED* -> FIN-WAIT-1
*/
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
tp->t_flags &= ~TF_NEEDFIN;
} else {
tp->t_state = TCPS_ESTABLISHED;
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
}
/*
* If segment contains data or ACK, will call tcp_reass()
* later; if not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
(void) tcp_reass(tp, (struct tcphdr *)0, 0,
(struct mbuf *)0);
tp->snd_wl1 = th->th_seq - 1;
/* FALLTHROUGH */
/*
* In ESTABLISHED state: drop duplicate ACKs; ACK out of range
* ACKs. If the ack is in the range
* tp->snd_una < th->th_ack <= tp->snd_max
* then advance tp->snd_una to th->th_ack and drop
* data from the retransmission queue. If this ACK reflects
* more up-to-date window information, we update our window information.
*/
case TCPS_ESTABLISHED:
case TCPS_FIN_WAIT_1:
case TCPS_FIN_WAIT_2:
case TCPS_CLOSE_WAIT:
case TCPS_CLOSING:
case TCPS_LAST_ACK:
if (SEQ_GT(th->th_ack, tp->snd_max)) {
V_tcpstat.tcps_rcvacktoomuch++;
goto dropafterack;
}
if ((tp->t_flags & TF_SACK_PERMIT) &&
((to.to_flags & TOF_SACK) ||
!TAILQ_EMPTY(&tp->snd_holes)))
tcp_sack_doack(tp, &to, th->th_ack);
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
if (tlen == 0 && tiwin == tp->snd_wnd) {
V_tcpstat.tcps_rcvdupack++;
/*
* If we have outstanding data (other than
* a window probe), this is a completely
* duplicate ack (i.e., window info didn't
* change), the ack is the biggest we've
* seen and we've seen exactly our rexmt
* threshold of them, assume a packet
* has been dropped and retransmit it.
* Kludge snd_nxt & the congestion
* window so we send only this one
* packet.
*
* We know we're losing at the current
* window size so do congestion avoidance
* (set ssthresh to half the current window
* and pull our congestion window back to
* the new ssthresh).
*
* Dup acks mean that packets have left the
* network (they're now cached at the receiver)
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
*
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
if (!tcp_timer_active(tp, TT_REXMT) ||
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
((V_tcp_do_newreno ||
(tp->t_flags & TF_SACK_PERMIT)) &&
IN_FASTRECOVERY(tp))) {
if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp)) {
int awnd;
/*
* Compute the amount of data in flight first.
* We can inject new data into the pipe iff
* we have less than 1/2 the original window's
* worth of data in flight.
*/
awnd = (tp->snd_nxt - tp->snd_fack) +
tp->sackhint.sack_bytes_rexmit;
if (awnd < tp->snd_ssthresh) {
tp->snd_cwnd += tp->t_maxseg;
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
}
} else
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
/*
* If we're doing sack, check to
* see if we're already in sack
* recovery. If we're not doing sack,
* check to see if we're in newreno
* recovery.
*/
if (tp->t_flags & TF_SACK_PERMIT) {
if (IN_FASTRECOVERY(tp)) {
tp->t_dupacks = 0;
break;
}
} else if (V_tcp_do_newreno ||
V_tcp_do_ecn) {
if (SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
break;
}
}
tcp_congestion_exp(tp);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
if (tp->t_flags & TF_SACK_PERMIT) {
V_tcpstat.tcps_sack_recovery_episode++;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
goto drop;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
KASSERT(tp->snd_limited <= 2,
("%s: tp->snd_limited too big",
__func__));
tp->snd_cwnd = tp->snd_ssthresh +
tp->t_maxseg *
(tp->t_dupacks - tp->snd_limited);
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
goto drop;
} else if (V_tcp_do_rfc3042) {
u_long oldcwnd = tp->snd_cwnd;
tcp_seq oldsndmax = tp->snd_max;
u_int sent;
KASSERT(tp->t_dupacks == 1 ||
tp->t_dupacks == 2,
("%s: dupacks not 1 or 2",
__func__));
if (tp->t_dupacks == 1)
tp->snd_limited = 0;
tp->snd_cwnd =
(tp->snd_nxt - tp->snd_una) +
(tp->t_dupacks - tp->snd_limited) *
tp->t_maxseg;
(void) tcp_output(tp);
sent = tp->snd_max - oldsndmax;
if (sent > tp->t_maxseg) {
KASSERT((tp->t_dupacks == 2 &&
tp->snd_limited == 0) ||
(sent == tp->t_maxseg + 1 &&
tp->t_flags & TF_SENTFIN),
("%s: sent too much",
__func__));
tp->snd_limited = 2;
} else if (sent > 0)
++tp->snd_limited;
tp->snd_cwnd = oldcwnd;
goto drop;
}
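/*
 * Limited transmit example (illustrative numbers): with
 * snd_nxt - snd_una = 10 * t_maxseg in flight and the first
 * duplicate ACK (t_dupacks == 1, snd_limited == 0), cwnd is
 * temporarily set to 11 * t_maxseg so tcp_output() may send one
 * previously unsent segment, then restored to oldcwnd. This keeps
 * the ACK clock running per RFC 3042 without inflating cwnd.
 */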
} else
tp->t_dupacks = 0;
break;
}
KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
("%s: th_ack <= snd_una", __func__));
/*
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
if (IN_FASTRECOVERY(tp)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
if (tp->t_flags & TF_SACK_PERMIT)
tcp_sack_partialack(tp, th);
else
tcp_newreno_partial_ack(tp, th);
} else {
/*
* Out of fast recovery.
* Window inflation should have left us
* with approximately snd_ssthresh
* outstanding data.
* But in case we would be inclined to
* send a burst, better to do it via
* the slow start mechanism.
*/
if (SEQ_GT(th->th_ack +
tp->snd_ssthresh,
tp->snd_max))
tp->snd_cwnd = tp->snd_max -
th->th_ack +
tp->t_maxseg;
else
tp->snd_cwnd = tp->snd_ssthresh;
}
}
} else {
if (tp->t_dupacks >= tcprexmtthresh &&
tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
}
tp->t_dupacks = 0;
/*
* If we reach this point, ACK is not a duplicate,
* i.e., it ACKs something we sent.
*/
if (tp->t_flags & TF_NEEDSYN) {
/*
* T/TCP: Connection was half-synchronized, and our
* SYN has been ACK'd (so connection is now fully
* synchronized). Go to non-starred state,
* increment snd_una for ACK of SYN, and check if
* we can do window scaling.
*/
tp->t_flags &= ~TF_NEEDSYN;
tp->snd_una++;
/* Do window scaling? */
if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
/* Send window already scaled. */
}
}
process_ACK:
KASSERT(headlocked, ("%s: process_ACK: head not locked",
__func__));
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = th->th_ack - tp->snd_una;
V_tcpstat.tcps_rcvackpack++;
V_tcpstat.tcps_rcvackbyte += acked;
/*
* If we just performed our first retransmit, and the ACK
* arrives within our recovery window, then it was a mistake
* to do the retransmit in the first place. Recover our
* original cwnd and ssthresh, and proceed to transmit where
* we left off.
*/
if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
++V_tcpstat.tcps_sndrexmitbad;
tp->snd_cwnd = tp->snd_cwnd_prev;
tp->snd_ssthresh = tp->snd_ssthresh_prev;
tp->snd_recover = tp->snd_recover_prev;
if (tp->t_flags & TF_WASFRECOVERY)
ENTER_FASTRECOVERY(tp);
tp->snd_nxt = tp->snd_max;
tp->t_badrxtwin = 0; /* XXX probably not required */
}
/*
* If we have a timestamp reply, update smoothed
* round trip time. If no timestamp is present but
* transmit timer is running and timed sequence
* number was acked, update smoothed round trip time.
* Since we now have an rtt measurement, cancel the
* timer backoff (cf., Phil Karn's retransmit alg.).
* Recompute the initial retransmit timer.
*
* Some boxes send broken timestamp replies
* during the SYN+ACK phase; ignore
* timestamps of 0 or we could calculate a
* huge RTT and blow up the retransmit timer.
*/
if ((to.to_flags & TOF_TS) != 0 &&
to.to_tsecr) {
if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr)
tp->t_rttlow = ticks - to.to_tsecr;
tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
tp->t_rttlow = ticks - tp->t_rtttime;
tcp_xmit_timer(tp, ticks - tp->t_rtttime);
}
tcp_xmit_bandwidth_limit(tp, th->th_ack);
/*
* If all outstanding data is acked, stop retransmit
* timer and remember to restart (more output or persist).
* If there is more data to be acked, restart retransmit
* timer, using current (possibly backed-off) value.
*/
if (th->th_ack == tp->snd_max) {
tcp_timer_activate(tp, TT_REXMT, 0);
needoutput = 1;
} else if (!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
/*
* If no data (only SYN) was ACK'd,
* skip rest of ACK processing.
*/
if (acked == 0)
goto step6;
/*
* When new data is acked, open the congestion window.
* If the window gives us less than ssthresh packets
* in flight, open exponentially (maxseg per packet).
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
* If cwnd > maxseg^2, fix the cwnd increment at 1 byte
* to avoid capping cwnd (as suggested in RFC2581).
*/
if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
!IN_FASTRECOVERY(tp)) {
u_int cw = tp->snd_cwnd;
u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
incr = max((incr * incr / cw), 1);
tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
}
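/*
 * Growth example (illustrative numbers, t_maxseg = 1460): in slow
 * start (cw <= ssthresh) each ACK adds a full 1460 bytes. In
 * congestion avoidance with cw = 73000, incr =
 * max(1460 * 1460 / 73000, 1) = 29 bytes per ACK, i.e. roughly one
 * segment per window's worth of ACKs. The result is capped at
 * TCP_MAXWIN << snd_scale.
 */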
SOCKBUF_LOCK(&so->so_snd);
if (acked > so->so_snd.sb_cc) {
tp->snd_wnd -= so->so_snd.sb_cc;
sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
ourfinisacked = 1;
} else {
sbdrop_locked(&so->so_snd, acked);
tp->snd_wnd -= acked;
ourfinisacked = 0;
}
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
/* Detect una wraparound. */
if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
!IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
IN_FASTRECOVERY(tp) &&
SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
if (tp->t_flags & TF_SACK_PERMIT) {
if (SEQ_GT(tp->snd_una, tp->snd_recover))
tp->snd_recover = tp->snd_una;
}
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
switch (tp->t_state) {
/*
* In FIN_WAIT_1 STATE in addition to the processing
* for the ESTABLISHED state if our FIN is now acknowledged
* then enter FIN_WAIT_2.
*/
case TCPS_FIN_WAIT_1:
if (ourfinisacked) {
/*
* If we can't receive any more
* data, then closing user can proceed.
* Starting the timer is contrary to the
* specification, but if we don't get a FIN
* we'll hang forever.
*
* XXXjl:
* we should release the tp also, and use a
* compressed state.
*/
if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
int timeout;
soisdisconnected(so);
timeout = (tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : tcp_maxidle;
tcp_timer_activate(tp, TT_2MSL, timeout);
}
tp->t_state = TCPS_FIN_WAIT_2;
}
break;
/*
* In CLOSING STATE in addition to the processing for
* the ESTABLISHED state if the ACK acknowledges our FIN
* then enter the TIME-WAIT state, otherwise ignore
* the segment.
*/
case TCPS_CLOSING:
if (ourfinisacked) {
KASSERT(headlocked, ("%s: process_ACK: "
"head not locked", __func__));
tcp_twstart(tp);
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
m_freem(m);
return;
}
break;
/*
* In LAST_ACK, we may still be waiting for data to drain
* and/or to be acked, as well as for the ack of our FIN.
* If our FIN is now acknowledged, delete the TCB,
* enter the closed state and return.
*/
case TCPS_LAST_ACK:
if (ourfinisacked) {
KASSERT(headlocked, ("%s: process_ACK: "
"tcp_close: head not locked", __func__));
tp = tcp_close(tp);
goto drop;
}
break;
}
}
step6:
KASSERT(headlocked, ("%s: step6: head not locked", __func__));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* Update window information.
* Don't look at window if no ACK: TAC's send garbage on first SYN.
*/
if ((thflags & TH_ACK) &&
(SEQ_LT(tp->snd_wl1, th->th_seq) ||
(tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
(tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
/* keep track of pure window updates */
if (tlen == 0 &&
tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
V_tcpstat.tcps_rcvwinupd++;
tp->snd_wnd = tiwin;
tp->snd_wl1 = th->th_seq;
tp->snd_wl2 = th->th_ack;
if (tp->snd_wnd > tp->max_sndwnd)
tp->max_sndwnd = tp->snd_wnd;
needoutput = 1;
}
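/*
 * The three-part test above implements a "newer information wins"
 * rule: accept the advertised window if the segment carries a newer
 * sequence number than the last window update (wl1), or the same
 * sequence number with a newer ACK (wl2), or the same (seq, ack)
 * pair with a strictly larger window. This keeps reordered segments
 * from shrinking snd_wnd with stale data.
 */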
/*
* Process segments with URG.
*/
if ((thflags & TH_URG) && th->th_urp &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
/*
* This is a kludge, but if we receive and accept
* random urgent pointers, we'll crash in
* soreceive. It's hard to imagine someone
* actually wanting to send this much urgent data.
*/
SOCKBUF_LOCK(&so->so_rcv);
if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
th->th_urp = 0; /* XXX */
thflags &= ~TH_URG; /* XXX */
SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */
goto dodata; /* XXX */
}
/*
* If this segment advances the known urgent pointer,
* then mark the data stream. This should not happen
* in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
* a FIN has been received from the remote side.
* In these states we ignore the URG.
*
* According to RFC961 (Assigned Protocols),
* the urgent pointer points to the last octet
* of urgent data. We continue, however,
* to consider it to indicate the first octet
* of data past the urgent section as the original
* spec states (in one of two places).
*/
if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
tp->rcv_up = th->th_seq + th->th_urp;
so->so_oobmark = so->so_rcv.sb_cc +
(tp->rcv_up - tp->rcv_nxt) - 1;
if (so->so_oobmark == 0)
so->so_rcv.sb_state |= SBS_RCVATMARK;
sohasoutofband(so);
tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
}
SOCKBUF_UNLOCK(&so->so_rcv);
/*
* Remove out-of-band data so it doesn't get presented to the user.
* This can happen independent of advancing the URG pointer,
* but if two URG's are pending at once, some out-of-band
* data may creep in... ick.
*/
if (th->th_urp <= (u_long)tlen &&
!(so->so_options & SO_OOBINLINE)) {
/* hdr drop is delayed */
tcp_pulloutofband(so, th, m, drop_hdrlen);
}
} else {
/*
* If no out of band data is expected,
* pull receive urgent pointer along
* with the receive window.
*/
if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
tp->rcv_up = tp->rcv_nxt;
}
dodata: /* XXX */
KASSERT(headlocked, ("%s: dodata: head not locked", __func__));
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* Process the segment text, merging it into the TCP sequencing queue,
* and arranging for acknowledgment of receipt if necessary.
* This process logically involves adjusting tp->rcv_wnd as data
* is presented to the user (this happens in tcp_usrreq.c,
* case PRU_RCVD). If a FIN has already been received on this
* connection then we just ignore the text.
*/
if ((tlen || (thflags & TH_FIN)) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
m_adj(m, drop_hdrlen); /* delayed header drop */
/*
* Insert segment which includes th into TCP reassembly queue
* with control block tp. Set thflags to whether reassembly now
* includes a segment with FIN. This handles the common case
* inline (segment is the next to be received on an established
* connection, and the queue is empty), avoiding linkage into
* and removal from the queue and repetition of various
* conversions.
* Set DELACK for segments received in order, but ack
* immediately when segments are out of order (so
* fast retransmit can work).
*/
if (th->th_seq == tp->rcv_nxt &&
LIST_EMPTY(&tp->t_segq) &&
TCPS_HAVEESTABLISHED(tp->t_state)) {
if (DELAY_ACK(tp))
tp->t_flags |= TF_DELACK;
else
tp->t_flags |= TF_ACKNOW;
tp->rcv_nxt += tlen;
thflags = th->th_flags & TH_FIN;
V_tcpstat.tcps_rcvpack++;
V_tcpstat.tcps_rcvbyte += tlen;
ND6_HINT(tp);
SOCKBUF_LOCK(&so->so_rcv);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
m_freem(m);
else
sbappendstream_locked(&so->so_rcv, m);
/* NB: sorwakeup_locked() does an implicit unlock. */
sorwakeup_locked(so);
} else {
/*
* XXX: Due to the header drop above "th" is
* theoretically invalid by now. Fortunately
* m_adj() doesn't actually free any mbufs
* when trimming from the head.
*/
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
tcp_update_sack_list(tp, save_start, save_start + tlen);
#if 0
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
* buffer size.
* XXX: Unused.
*/
len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
#endif
} else {
m_freem(m);
thflags &= ~TH_FIN;
}
/*
* If FIN is received ACK the FIN and let the user know
* that the connection is closing.
*/
if (thflags & TH_FIN) {
if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
socantrcvmore(so);
/*
* If connection is half-synchronized
* (ie NEEDSYN flag on) then delay ACK,
* so it may be piggybacked when SYN is sent.
* Otherwise, since we received a FIN then no
* more input can be expected, send ACK now.
*/
if (tp->t_flags & TF_NEEDSYN)
tp->t_flags |= TF_DELACK;
else
tp->t_flags |= TF_ACKNOW;
tp->rcv_nxt++;
}
switch (tp->t_state) {
/*
* In SYN_RECEIVED and ESTABLISHED STATES
* enter the CLOSE_WAIT state.
*/
case TCPS_SYN_RECEIVED:
tp->t_starttime = ticks;
/* FALLTHROUGH */
case TCPS_ESTABLISHED:
tp->t_state = TCPS_CLOSE_WAIT;
break;
/*
* If still in FIN_WAIT_1 STATE FIN has not been acked so
* enter the CLOSING state.
*/
case TCPS_FIN_WAIT_1:
tp->t_state = TCPS_CLOSING;
break;
/*
* In FIN_WAIT_2 state enter the TIME_WAIT state,
* starting the time-wait timer, turning off the other
* standard timers.
*/
case TCPS_FIN_WAIT_2:
KASSERT(headlocked == 1, ("%s: dodata: "
"TCP_FIN_WAIT_2: head not locked", __func__));
tcp_twstart(tp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return;
}
}
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
/*
* Return any desired output.
*/
if (needoutput || (tp->t_flags & TF_ACKNOW))
(void) tcp_output(tp);
check_delack:
KASSERT(headlocked == 0, ("%s: check_delack: head locked",
__func__));
INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (tp->t_flags & TF_DELACK) {
tp->t_flags &= ~TF_DELACK;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
INP_WUNLOCK(tp->t_inpcb);
return;
dropafterack:
KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__));
/*
* Generate an ACK dropping incoming segment if it occupies
* sequence space, where the ACK reflects our state.
*
* We can now skip the test for the RST flag since all
* paths to this code happen after packets containing
* RST have been dropped.
*
* In the SYN-RECEIVED state, don't send an ACK unless the
* segment we received passes the SYN-RECEIVED ACK test.
* If it fails send a RST. This breaks the loop in the
* "LAND" DoS attack, and also prevents an ACK storm
* between two listening ports that have been sent forged
* SYN segments, each with the source address of the other.
*/
if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
(SEQ_GT(tp->snd_una, th->th_ack) ||
SEQ_GT(th->th_ack, tp->snd_max)) ) {
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
}
#ifdef TCPDEBUG
if (so->so_options & SO_DEBUG)
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
KASSERT(headlocked, ("%s: headlocked should be 1", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
INP_WUNLOCK(tp->t_inpcb);
m_freem(m);
return;
dropwithreset:
KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__));
/*
* If tp is non-NULL, we call tcp_dropwithreset() holding both inpcb
* and global locks. However, if NULL, we must hold neither as
* firewalls may acquire the global lock in order to look for a
* matching inpcb.
*/
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
INP_WUNLOCK(tp->t_inpcb);
}
INP_INFO_WUNLOCK(&V_tcbinfo);
if (tp == NULL)
tcp_dropwithreset(m, th, NULL, tlen, rstreason);
return;
drop:
/*
* Drop space held by incoming segment and return.
*/
#ifdef TCPDEBUG
if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
&tcp_savetcp, 0);
#endif
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
if (headlocked)
INP_INFO_WUNLOCK(&V_tcbinfo);
m_freem(m);
return;
}
/*
* Issue RST and make ACK acceptable to originator of segment.
* The mbuf must still include the original packet header.
* tp may be NULL.
*/
static void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
int tlen, int rstreason)
{
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
if (tp != NULL) {
INP_WLOCK_ASSERT(tp->t_inpcb);
}
/* Don't bother if destination was broadcast/multicast. */
if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
goto drop;
#ifdef INET6
if (mtod(m, struct ip *)->ip_v == 6) {
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
goto drop;
/* IPv6 anycast check is done at tcp6_input() */
} else
#endif
{
ip = mtod(m, struct ip *);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
goto drop;
}
/* Perform bandwidth limiting. */
if (badport_bandlim(rstreason) < 0)
goto drop;
/* tcp_respond consumes the mbuf chain. */
if (th->th_flags & TH_ACK) {
tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
th->th_ack, TH_RST);
} else {
if (th->th_flags & TH_SYN)
tlen++;
tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
(tcp_seq)0, TH_RST|TH_ACK);
}
return;
drop:
m_freem(m);
return;
}
/*
* Parse TCP options and place in tcpopt.
*/
static void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
+ INIT_VNET_INET(curvnet);
int opt, optlen;
to->to_flags = 0;
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == TCPOPT_EOL)
break;
if (opt == TCPOPT_NOP)
optlen = 1;
else {
if (cnt < 2)
break;
optlen = cp[1];
if (optlen < 2 || optlen > cnt)
break;
}
switch (opt) {
case TCPOPT_MAXSEG:
if (optlen != TCPOLEN_MAXSEG)
continue;
if (!(flags & TO_SYN))
continue;
to->to_flags |= TOF_MSS;
bcopy((char *)cp + 2,
(char *)&to->to_mss, sizeof(to->to_mss));
to->to_mss = ntohs(to->to_mss);
break;
case TCPOPT_WINDOW:
if (optlen != TCPOLEN_WINDOW)
continue;
if (!(flags & TO_SYN))
continue;
to->to_flags |= TOF_SCALE;
to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
break;
case TCPOPT_TIMESTAMP:
if (optlen != TCPOLEN_TIMESTAMP)
continue;
to->to_flags |= TOF_TS;
bcopy((char *)cp + 2,
(char *)&to->to_tsval, sizeof(to->to_tsval));
to->to_tsval = ntohl(to->to_tsval);
bcopy((char *)cp + 6,
(char *)&to->to_tsecr, sizeof(to->to_tsecr));
to->to_tsecr = ntohl(to->to_tsecr);
break;
#ifdef TCP_SIGNATURE
/*
* XXX In order to reply to a host which has set the
* TCP_SIGNATURE option in its initial SYN, we have to
* record the fact that the option was observed here
* for the syncache code to perform the correct response.
*/
case TCPOPT_SIGNATURE:
if (optlen != TCPOLEN_SIGNATURE)
continue;
to->to_flags |= TOF_SIGNATURE;
to->to_signature = cp + 2;
break;
#endif
case TCPOPT_SACK_PERMITTED:
if (optlen != TCPOLEN_SACK_PERMITTED)
continue;
if (!(flags & TO_SYN))
continue;
if (!V_tcp_do_sack)
continue;
to->to_flags |= TOF_SACKPERM;
break;
case TCPOPT_SACK:
if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
continue;
if (flags & TO_SYN)
continue;
to->to_flags |= TOF_SACK;
to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
to->to_sacks = cp + 2;
V_tcpstat.tcps_sack_rcv_blocks++;
break;
default:
continue;
}
}
}
/*
* Pull the out-of-band byte out of a segment so
* it doesn't appear in the user's data queue.
* It is still reflected in the segment length for
* sequencing purposes.
*/
static void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
int off)
{
int cnt = off + th->th_urp - 1;
while (cnt >= 0) {
if (m->m_len > cnt) {
char *cp = mtod(m, caddr_t) + cnt;
struct tcpcb *tp = sototcpcb(so);
INP_WLOCK_ASSERT(tp->t_inpcb);
tp->t_iobc = *cp;
tp->t_oobflags |= TCPOOB_HAVEDATA;
bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
m->m_len--;
if (m->m_flags & M_PKTHDR)
m->m_pkthdr.len--;
return;
}
cnt -= m->m_len;
m = m->m_next;
if (m == NULL)
break;
}
panic("tcp_pulloutofband");
}
/*
* Collect new round-trip time estimate
* and update averages and current timeout.
*/
static void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
int delta;
INP_WLOCK_ASSERT(tp->t_inpcb);
V_tcpstat.tcps_rttupdated++;
tp->t_rttupdated++;
if (tp->t_srtt != 0) {
/*
* srtt is stored as fixed point with 5 bits after the
* binary point (i.e., scaled by 32). The following magic
* is equivalent to the smoothing algorithm in rfc793 with
* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
* point). Adjust rtt to origin 0.
*/
delta = ((rtt - 1) << TCP_DELTA_SHIFT)
- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
if ((tp->t_srtt += delta) <= 0)
tp->t_srtt = 1;
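/*
 * Worked example (illustrative numbers, with TCP_RTT_SHIFT = 5 and
 * TCP_DELTA_SHIFT = 2): with a smoothed rtt of 8 ticks, t_srtt =
 * 8 << 5 = 256. A new measurement rtt = 16 gives delta =
 * ((16 - 1) << 2) - (256 >> 3) = 60 - 32 = 28, so t_srtt becomes
 * 284, i.e. 284 / 32 = 8.875 ticks = 7/8 * 8 + 1/8 * 15, matching
 * the alpha-of-.875 filter on the origin-0 rtt.
 */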
/*
* We accumulate a smoothed rtt variance (actually, a
* smoothed mean difference), then set the retransmit
* timer to smoothed rtt + 4 times the smoothed variance.
* rttvar is stored as fixed point with 4 bits after the
* binary point (scaled by 16). The following is
* equivalent to rfc793 smoothing with an alpha of .75
* (rttvar = rttvar*3/4 + |delta| / 4). This replaces
* rfc793's wired-in beta.
*/
if (delta < 0)
delta = -delta;
delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
if ((tp->t_rttvar += delta) <= 0)
tp->t_rttvar = 1;
if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
} else {
/*
* No rtt measurement yet - use the unsmoothed rtt.
* Set the variance to half the rtt (so our first
* retransmit happens at 3*rtt).
*/
tp->t_srtt = rtt << TCP_RTT_SHIFT;
tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
}
tp->t_rtttime = 0;
tp->t_rxtshift = 0;
/*
* The retransmit should happen at rtt + 4 * rttvar.
* Because of the way we do the smoothing, srtt and rttvar
* will each average +1/2 tick of bias. When we compute
* the retransmit timer, we want 1/2 tick of rounding and
* 1 extra tick because of +-1/2 tick uncertainty in the
* firing of the timer. The bias will give us exactly the
* 1.5 tick we need. But, because the bias is
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
*/
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
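/*
 * Continuing the example above: with srtt ~8.875 ticks and rttvar
 * ~3 ticks, the computed timeout comes out to roughly
 * 8.875 + 4 * 3 ~= 21 ticks, clamped between max(t_rttmin, rtt + 2)
 * and TCPTV_REXMTMAX.
 */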
/*
* We received an ack for a packet that wasn't retransmitted;
* it is probably safe to discard any error indications we've
* received recently. This isn't quite right, but close enough
* for now (a route might have failed after we sent a segment,
* and the return path might not be symmetrical).
*/
tp->t_softerror = 0;
}
/*
* Determine a reasonable value for maxseg size.
* If the route is known, check route for mtu.
* If none, use an mss that can be handled on the outgoing
* interface without forcing IP to fragment; if bigger than
* an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
* to utilize large mbufs. If no route is found, route has no mtu,
* or the destination isn't local, use a default, hopefully conservative
* size (usually 512 or the default IP max size, but no more than the mtu
* of the interface), as we can't discover anything about intervening
* gateways or networks. We also initialize the congestion/slow start
* window to be a single segment if the destination isn't local.
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
*
* Also take into account the space needed for options that we
* send regularly. Make maxseg shorter by that amount to assure
* that we can send maxseg amount of data even when the options
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
*
* NOTE that this routine is only called when we process an incoming
* segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss_update(struct tcpcb *tp, int offer, struct hc_metrics_lite *metricptr)
{
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
int mss;
u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct hc_metrics_lite metrics;
int origoffer = offer;
int mtuflags = 0;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
size_t min_protoh = isipv6 ?
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
/* Initialize. */
#ifdef INET6
if (isipv6) {
maxmtu = tcp_maxmtu6(&inp->inp_inc, &mtuflags);
tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
} else
#endif
{
maxmtu = tcp_maxmtu(&inp->inp_inc, &mtuflags);
tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
}
/*
* No route to sender, stay with default mss and return.
*/
if (maxmtu == 0)
return;
/* Check the interface for TSO capabilities. */
if (mtuflags & CSUM_TSO)
tp->t_flags |= TF_TSO;
/* What have we got? */
switch (offer) {
case 0:
/*
* Offer == 0 means that there was no MSS on the SYN
* segment, in this case we use tcp_mssdflt as
* already assigned to t_maxopd above.
*/
offer = tp->t_maxopd;
break;
case -1:
/*
* Offer == -1 means that we didn't receive SYN yet.
*/
/* FALLTHROUGH */
default:
/*
* Prevent DoS attack with too small MSS. Round up
* to at least minmss.
*/
offer = max(offer, V_tcp_minmss);
}
/*
* rmx information is now retrieved from tcp_hostcache.
*/
tcp_hc_get(&inp->inp_inc, &metrics);
if (metricptr != NULL)
bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
/*
* If there's a discovered MTU in the tcp hostcache, use it;
* otherwise, use the link MTU.
*/
if (metrics.rmx_mtu)
mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
else {
#ifdef INET6
if (isipv6) {
mss = maxmtu - min_protoh;
if (!V_path_mtu_discovery &&
!in6_localaddr(&inp->in6p_faddr))
mss = min(mss, V_tcp_v6mssdflt);
} else
#endif
{
mss = maxmtu - min_protoh;
if (!V_path_mtu_discovery &&
!in_localaddr(inp->inp_faddr))
mss = min(mss, V_tcp_mssdflt);
}
/*
* XXX - The above conditional (mss = maxmtu - min_protoh)
* probably violates the TCP spec.
* The problem is that, since we don't know the
* other end's MSS, we are supposed to use a conservative
* default. But, if we do that, then MTU discovery will
* never actually take place, because the conservative
* default is much less than the MTUs typically seen
* on the Internet today. For the moment, we'll sweep
* this under the carpet.
*
* The conservative default might not actually be a problem
* if the only case this occurs is when sending an initial
* SYN with options and data to a host we've never talked
* to before. Then, they will reply with an MSS value which
* will get recorded and the new parameters should get
* recomputed. For Further Study.
*/
}
mss = min(mss, offer);
/*
* Sanity check: make sure that maxopd will be large
* enough to allow some data on segments even if all
* the option space is used (40 bytes). Otherwise
* funny things may happen in tcp_output.
*/
mss = max(mss, 64);
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
* segment. We need to store this value (maxopd) apart
* from maxseg, because now every segment carries options
* and thus we normally have somewhat less data in segments.
*/
tp->t_maxopd = mss;
/*
* origoffer==-1 indicates that no segments were received yet.
* In this case we just guess.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
mss -= TCPOLEN_TSTAMP_APPA;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
mss &= ~(MCLBYTES-1);
#else
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
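/*
 * Rounding example (MCLBYTES is typically 2048): an mss of 8960
 * (e.g. from a 9000-byte jumbo MTU) becomes 8960 & ~2047 = 8192,
 * exactly four mbuf clusters per segment.
 */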
tp->t_maxseg = mss;
}
void
tcp_mss(struct tcpcb *tp, int offer)
{
int rtt, mss;
u_long bufsize;
struct inpcb *inp;
struct socket *so;
struct hc_metrics_lite metrics;
#ifdef INET6
int isipv6;
#endif
KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
tcp_mss_update(tp, offer, &metrics);
mss = tp->t_maxseg;
inp = tp->t_inpcb;
#ifdef INET6
isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
#endif
/*
* If there's a pipesize, change the socket buffer to that size;
* don't change it if sb_hiwat differs from the default (then it
* has been changed on purpose with setsockopt).
* Make the socket buffers an integral number of mss units;
* if the mss is larger than the socket buffer, decrease the mss.
*/
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_snd);
if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
bufsize = metrics.rmx_sendpipe;
else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
else {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
if (bufsize > so->so_snd.sb_hiwat)
(void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
}
SOCKBUF_UNLOCK(&so->so_snd);
tp->t_maxseg = mss;
SOCKBUF_LOCK(&so->so_rcv);
if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
bufsize = metrics.rmx_recvpipe;
else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
}
SOCKBUF_UNLOCK(&so->so_rcv);
/*
* While we're here, check the others too.
*/
if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
tp->t_srtt = rtt;
tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
V_tcpstat.tcps_usedrtt++;
if (metrics.rmx_rttvar) {
tp->t_rttvar = metrics.rmx_rttvar;
V_tcpstat.tcps_usedrttvar++;
} else {
/* default variation is +- 1 rtt */
tp->t_rttvar =
tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
tp->t_rttmin, TCPTV_REXMTMAX);
}
if (metrics.rmx_ssthresh) {
/*
* There's some sort of gateway or interface
* buffer limit on the path. Use this to set
* the slow start threshold, but set the
* threshold to no less than 2*mss.
*/
tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
V_tcpstat.tcps_usedssthresh++;
}
if (metrics.rmx_bandwidth)
tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
*
* Extend this so we cache the cwnd too and retrieve it here.
* Make cwnd even bigger than RFC3390 suggests but only if we
* have previous experience with the remote host. Be careful
* not to make cwnd bigger than the remote receive window or our own
* send socket buffer. Maybe put some additional upper bound
* on the retrieved cwnd. Should do incremental updates to
* hostcache when cwnd collapses so the next connection doesn't
* overload the path again.
*
* RFC3390 says only do this if the SYN or SYN/ACK didn't get lost.
* We currently check only in syncache_socket for that.
*/
#define TCP_METRICS_CWND
#ifdef TCP_METRICS_CWND
if (metrics.rmx_cwnd)
tp->snd_cwnd = max(mss,
min(metrics.rmx_cwnd / 2,
min(tp->snd_wnd, so->so_snd.sb_hiwat)));
else
#endif
if (V_tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
#else
else if (in_localaddr(inp->inp_faddr))
#endif
tp->snd_cwnd = mss * V_ss_fltsz_local;
else
tp->snd_cwnd = mss * V_ss_fltsz;
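/*
 * Initial-window examples (illustrative numbers): a cached
 * rmx_cwnd of 20000 with mss = 1460, snd_wnd = 65535 and a 32k
 * send buffer yields cwnd = max(1460, min(10000, 32768)) = 10000.
 * With RFC3390, mss = 1460 gives min(5840, max(2920, 4380)) = 4380
 * (three segments), while mss = 536 gives min(2144, 4380) = 2144
 * (four segments). Otherwise cwnd starts at ss_fltsz_local (4)
 * segments for local destinations or ss_fltsz (1) elsewhere.
 */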
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
tcp_mssopt(struct in_conninfo *inc)
{
+ INIT_VNET_INET(curvnet);
int mss = 0;
u_long maxmtu = 0;
u_long thcmtu = 0;
size_t min_protoh;
#ifdef INET6
int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
if (isipv6) {
mss = V_tcp_v6mssdflt;
maxmtu = tcp_maxmtu6(inc, NULL);
thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
} else
#endif
{
mss = V_tcp_mssdflt;
maxmtu = tcp_maxmtu(inc, NULL);
thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
min_protoh = sizeof(struct tcpiphdr);
}
if (maxmtu && thcmtu)
mss = min(maxmtu, thcmtu) - min_protoh;
else if (maxmtu || thcmtu)
mss = max(maxmtu, thcmtu) - min_protoh;
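/*
 * Example (illustrative numbers, IPv4 min_protoh = 40): with an
 * interface maxmtu of 1500 and a cached hostcache mtu of 1400, we
 * offer min(1500, 1400) - 40 = 1360; with no hostcache entry,
 * 1500 - 40 = 1460.
 */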
return (mss);
}
/*
* When a partial ack arrives, force the retransmission of the
* next unacknowledged segment. Do not clear tp->t_dupacks.
* By setting snd_nxt to th_ack, this forces the retransmission
* timer to be started again.
*/
static void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
INP_WLOCK_ASSERT(tp->t_inpcb);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond acknowledged offset.
* (tp->snd_una has not yet been updated when this function is called.)
*/
tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
/*
* Partial window deflation. Relies on fact that tp->snd_una
* not updated yet.
*/
if (tp->snd_cwnd > th->th_ack - tp->snd_una)
tp->snd_cwnd -= th->th_ack - tp->snd_una;
else
tp->snd_cwnd = 0;
tp->snd_cwnd += tp->t_maxseg;
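/*
 * Deflation example (illustrative numbers, t_maxseg = 1460): with
 * ocwnd = 14600, snd_una = 1000 and th_ack = 3920, the one-shot
 * cwnd of 1460 + 2920 retransmits a single segment at th_ack;
 * afterwards cwnd = 14600 - 2920 + 1460 = 13140, the partial
 * window deflation of RFC 2582.
 */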
}
Index: head/sys/netinet/tcp_offload.c
===================================================================
--- head/sys/netinet/tcp_offload.c (revision 183549)
+++ head/sys/netinet/tcp_offload.c (revision 183550)
@@ -1,144 +1,147 @@
/*-
* Copyright (c) 2007, Chelsio Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Neither the name of the Chelsio Corporation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_offload.h>
#include <netinet/toedev.h>
uint32_t toedev_registration_count;
int
tcp_offload_connect(struct socket *so, struct sockaddr *nam)
{
struct ifnet *ifp;
struct toedev *tdev;
struct rtentry *rt;
int error;
if (toedev_registration_count == 0)
return (EINVAL);
/*
* Look up the route used for the connection to
* determine if it uses an interface capable of
* offloading the connection.
*/
rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/);
if (rt)
RT_UNLOCK(rt);
else
return (EHOSTUNREACH);
ifp = rt->rt_ifp;
if ((ifp->if_capenable & IFCAP_TOE) == 0) {
error = EINVAL;
goto fail;
}
tdev = TOEDEV(ifp);
if (tdev == NULL) {
error = EPERM;
goto fail;
}
if (tdev->tod_can_offload(tdev, so) == 0) {
error = EPERM;
goto fail;
}
return (tdev->tod_connect(tdev, so, rt, nam));
fail:
RTFREE(rt);
return (error);
}
/*
* This file contains code as a short-term staging area before it is moved in
* to sys/netinet/tcp_offload.c
*/
void
tcp_offload_twstart(struct tcpcb *tp)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(tp->t_inpcb);
tcp_twstart(tp);
INP_INFO_WUNLOCK(&V_tcbinfo);
}
struct tcpcb *
tcp_offload_close(struct tcpcb *tp)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(tp->t_inpcb);
tp = tcp_close(tp);
INP_INFO_WUNLOCK(&V_tcbinfo);
if (tp)
INP_WUNLOCK(tp->t_inpcb);
return (tp);
}
struct tcpcb *
tcp_offload_drop(struct tcpcb *tp, int error)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(tp->t_inpcb);
tp = tcp_drop(tp, error);
INP_INFO_WUNLOCK(&V_tcbinfo);
if (tp)
INP_WUNLOCK(tp->t_inpcb);
return (tp);
}
Index: head/sys/netinet/tcp_output.c
===================================================================
--- head/sys/netinet/tcp_output.c (revision 183549)
+++ head/sys/netinet/tcp_output.c (revision 183550)
@@ -1,1455 +1,1462 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#endif /*IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
#ifdef notyet
extern struct mbuf *m_copypack();
#endif
int path_mtu_discovery = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
- &path_mtu_discovery, 1, "Enable Path MTU Discovery");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, path_mtu_discovery,
+ CTLFLAG_RW, path_mtu_discovery, 1, "Enable Path MTU Discovery");
int ss_fltsz = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
- &ss_fltsz, 1, "Slow start flight size");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO,
+ slowstart_flightsize, CTLFLAG_RW,
+ ss_fltsz, 1, "Slow start flight size");
int ss_fltsz_local = 4;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
- &ss_fltsz_local, 1, "Slow start flight size for local networks");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO,
+ local_slowstart_flightsize, CTLFLAG_RW,
+ ss_fltsz_local, 1, "Slow start flight size for local networks");
int tcp_do_newreno = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
- &tcp_do_newreno, 0, "Enable NewReno Algorithms");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
+ tcp_do_newreno, 0, "Enable NewReno Algorithms");
int tcp_do_tso = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
- &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
+ tcp_do_tso, 0, "Enable TCP Segmentation Offload");
int tcp_do_autosndbuf = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
- &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_auto,
+ CTLFLAG_RW,
+ tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
int tcp_autosndbuf_inc = 8*1024;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
- &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_inc,
+ CTLFLAG_RW, tcp_autosndbuf_inc, 0,
+ "Incrementor step size of automatic send buffer");
int tcp_autosndbuf_max = 256*1024;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
- &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, sendbuf_max,
+ CTLFLAG_RW, tcp_autosndbuf_max, 0,
+ "Max size of automatic send buffer");
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
struct socket *so = tp->t_inpcb->inp_socket;
long len, recwin, sendwin;
int off, flags, error;
struct mbuf *m;
struct ip *ip = NULL;
struct ipovly *ipov = NULL;
struct tcphdr *th;
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
#ifdef IPSEC
unsigned ipsec_optlen = 0;
#endif
int idle, sendalot;
int sack_rxmit, sack_bytes_rxmt;
struct sackhole *p;
int tso = 0;
struct tcpopt to;
#if 0
int maxburst = TCP_MAXBURST;
#endif
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* Determine length of data that should be transmitted,
* and flags that will be used.
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
/*
* We have been idle for "a while" and no acks are
* expected to clock out any data we send --
* slow start to get ack "clock" running again.
*
* Set the slow-start flight size depending on whether
* this is a local network or not.
*/
int ss = V_ss_fltsz;
#ifdef INET6
if (isipv6) {
if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
ss = V_ss_fltsz_local;
} else
#endif /* INET6 */
if (in_localaddr(tp->t_inpcb->inp_faddr))
ss = V_ss_fltsz_local;
tp->snd_cwnd = tp->t_maxseg * ss;
}
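/*
 * E.g. (defaults from the sysctls above): after an idle period of
 * at least one RTO, a non-local connection restarts with cwnd =
 * 1 * t_maxseg (ss_fltsz), a local one with 4 * t_maxseg
 * (ss_fltsz_local), forcing slow start to re-probe the path.
 */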
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
if (tp->t_flags & TF_MORETOCOME) {
tp->t_flags |= TF_LASTIDLE;
idle = 0;
}
}
again:
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
* resending already delivered data. Adjust snd_nxt accordingly.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
sendwin = min(sendwin, tp->snd_bwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in SACK recovery; reset the rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
long cwin;
cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
* (At least) part of sack hole extends beyond
* snd_recover. Check to see if we can rexmit data
* for this hole.
*/
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
/*
* Can't rexmit any more data for this hole.
* That data will be rexmitted in the next
* sack recovery episode, when snd_recover
* moves past p->rxmit.
*/
p = NULL;
goto after_sack_rexmit;
} else
/* Can rexmit part of the current hole */
len = ((long)ulmin(cwin,
tp->snd_recover - p->rxmit));
} else
len = ((long)ulmin(cwin, p->end - p->rxmit));
off = p->rxmit - tp->snd_una;
KASSERT(off >= 0,("%s: sack block to the left of una : %d",
__func__, off));
if (len > 0) {
sack_rxmit = 1;
sendalot = 1;
V_tcpstat.tcps_sack_rexmits++;
V_tcpstat.tcps_sack_rexmit_bytes +=
min(len, tp->t_maxseg);
}
}
after_sack_rexmit:
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
*/
if (tp->t_flags & TF_NEEDFIN)
flags |= TH_FIN;
if (tp->t_flags & TF_NEEDSYN)
flags |= TH_SYN;
SOCKBUF_LOCK(&so->so_snd);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_flags & TF_FORCEDATA) {
if (sendwin == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < so->so_snd.sb_cc)
flags &= ~TH_FIN;
sendwin = 1;
} else {
tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
}
/*
* If snd_nxt == snd_max and we have transmitted a FIN, the
* offset will be > 0 even if so_snd.sb_cc is 0, resulting in
* a negative length. This can also occur when TCP opens up
* its congestion window while receiving additional duplicate
* acks after fast-retransmit because TCP will reset snd_nxt
* to snd_max after the fast-retransmit.
*
* In the normal retransmit-FIN-only case, however, snd_nxt will
* be set to snd_una, the offset will be 0, and the length may
* wind up 0.
*
* If sack_rxmit is true we are retransmitting from the scoreboard
* in which case len is already set.
*/
if (sack_rxmit == 0) {
if (sack_bytes_rxmt == 0)
len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
else {
long cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd)
- off);
/*
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
len = lmin(len, cwin);
}
}
}
/*
* Lop off SYN bit if it has already been sent. However, if this
* is SYN-SENT state and if segment contains data and if we don't
* know that foreign host supports TAO, suppress sending segment.
*/
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
if (tp->t_state != TCPS_SYN_RECEIVED)
flags &= ~TH_SYN;
off--, len++;
}
/*
* Be careful not to send data and/or FIN on SYN segments.
* This measure is needed to prevent interoperability problems
* with not fully conformant TCP implementations.
*/
if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
len = 0;
flags &= ~TH_FIN;
}
if (len < 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be < 0. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*/
len = 0;
if (sendwin == 0) {
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (!tcp_timer_active(tp, TT_PERSIST))
tcp_setpersist(tp);
}
}
/* len will be >= 0 after this point. */
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
/*
* Automatic sizing of send socket buffer. Often the send buffer
* size is not optimally adjusted to the actual network conditions
* at hand (delay-bandwidth product). Setting the buffer size too
* small limits throughput on links with high bandwidth and high
* delay (e.g., trans-continental/oceanic links). Setting the
* buffer size too big consumes too much real kernel memory,
* especially with many connections on busy servers.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However, testing has shown this not
* to be much of a problem. At worst we are trading a waste of
* available bandwidth (the non-use of it) for a waste of some
* socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer. Has to
* wait for upcoming tcp timer rewrite.
*/
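/*
 * Worked example (illustrative numbers only): with sb_hiwat = 32K, a
 * peer window of 40K passes criterion 1 (40K * 5/4 = 50K >= 32K), 28K
 * of queued data passes criterion 2 (28K >= 32K * 7/8), and the buffer
 * then grows by V_tcp_autosndbuf_inc, capped at V_tcp_autosndbuf_max.
 */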
if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
so->so_snd.sb_cc < V_tcp_autosndbuf_max &&
sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
if (!sbreserve_locked(&so->so_snd,
min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
V_tcp_autosndbuf_max), so, curthread))
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}
}
/*
* Truncate to the maximum segment length or enable TCP Segmentation
* Offloading (if supported by hardware) and ensure that FIN is removed
* if the send range no longer includes the last data byte.
*
* TSO may only be used if we are in a pure bulk sending state. The
* presence of TCP-MD5, SACK retransmits, SACK advertisements and
* IP options prevent using TSO. With TSO the TCP header is the same
* (except for the sequence number) for all generated packets. This
* makes it impossible to transmit any options which vary per generated
* segment or packet.
*
* The length of TSO bursts is limited to TCP_MAXWIN. That limit and
* removal of FIN (if not already caught here) are handled later after
* the exact length of the TCP options is known.
*/
#ifdef IPSEC
/*
* Pre-calculate here as we save another lookup into the darknesses
* of IPsec that way and can actually decide if TSO is ok.
*/
ipsec_optlen = ipsec_hdrsiz_tcp(tp);
#endif
if (len > tp->t_maxseg) {
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
tp->t_inpcb->inp_options == NULL &&
tp->t_inpcb->in6p_options == NULL
#ifdef IPSEC
&& ipsec_optlen == 0
#endif
) {
tso = 1;
} else {
len = tp->t_maxseg;
sendalot = 1;
tso = 0;
}
}
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
} else {
if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
}
recwin = sbspace(&so->so_rcv);
/*
* Sender silly window avoidance. We transmit under the following
* conditions when len is non-zero:
*
* - We have a full segment (or more with TSO)
* - This is the last buffer in a write()/send() and we are
* either idle or running NODELAY
* - we've timed out (e.g. persist timer)
* - we have more than 1/2 the maximum send window's worth of
* data (the receiver may be limiting the window size)
* - we need to retransmit
*/
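/*
 * Illustrative example (assumed sizes): a lone 100-byte write on an
 * idle connection is sent at once (second condition above), while the
 * same 100 bytes queued behind unacknowledged data are held back until
 * an ACK arrives or a full segment accumulates (Nagle behavior).
 */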
if (len) {
if (len >= tp->t_maxseg)
goto send;
/*
* NOTE! on localhost connections an 'ack' from the remote
* end may occur synchronously with the output and cause
* us to flush a buffer queued with moretocome. XXX
*
* note: the len + off check is almost certainly unnecessary.
*/
if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
(idle || (tp->t_flags & TF_NODELAY)) &&
len + off >= so->so_snd.sb_cc &&
(tp->t_flags & TF_NOPUSH) == 0) {
goto send;
}
if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
goto send;
if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
goto send;
if (sack_rxmit)
goto send;
}
/*
* Compare available window to amount of window
* known to peer (as advertised window less
* next expected input). If the difference is at least two
* max size segments, or at least 50% of the maximum possible
* window, then want to send a window update to peer.
* Skip this if the connection is in T/TCP half-open state.
* Don't send pure window updates when the peer has closed
* the connection and won't ever send more data.
*/
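/*
 * Illustrative example (assumed values): with t_maxseg = 1460 and 8K
 * of newly drained receive space, adv = 8192 >= 2 * 1460, so a window
 * update is sent even though no data or ACK is otherwise due.
 */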
if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
!TCPS_HAVERCVDFIN(tp->t_state)) {
/*
* "adv" is the amount we can increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) -
(tp->rcv_adv - tp->rcv_nxt);
if (adv >= (long) (2 * tp->t_maxseg))
goto send;
if (2 * adv >= (long) so->so_rcv.sb_hiwat)
goto send;
}
/*
* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
* is also a catch-all for the retransmit timer timeout case.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if ((flags & TH_RST) ||
((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* If our state indicates that FIN should be sent
* and we have not yet done so, then we need to send.
*/
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
SEQ_GT(tp->snd_max, tp->snd_una) &&
!tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tcp_timer_active(tp, TT_PERSIST)
* is true when we are in persist state.
* (tp->t_flags & TF_FORCEDATA)
* is set when we are called to send a persist packet.
* tcp_timer_active(tp, TT_REXMT)
* is set when we are retransmitting.
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST)) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
/*
* No reason to send a segment, just return.
*/
just_return:
SOCKBUF_UNLOCK(&so->so_snd);
return (0);
send:
SOCKBUF_LOCK_ASSERT(&so->so_snd);
/*
* Before ESTABLISHED, force sending of initial options
* unless TCP set not to do any options.
* NOTE: we assume that the IP/TCP header plus TCP options
* always fit in a single mbuf, leaving room for a maximum
* link header, i.e.
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
*/
optlen = 0;
#ifdef INET6
if (isipv6)
hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
else
#endif
hdrlen = sizeof (struct tcpiphdr);
/*
* Compute options for segment.
* We only have to care about SYN and established connection
* segments. Options for SYN-ACK segments are handled in TCP
* syncache.
*/
if ((tp->t_flags & TF_NOOPT) == 0) {
to.to_flags = 0;
/* Maximum segment size. */
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
to.to_flags |= TOF_MSS;
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
to.to_wscale = tp->request_r_scale;
to.to_flags |= TOF_SCALE;
}
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
to.to_tsval = ticks + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = ticks;
}
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
if (flags & TH_SYN)
to.to_flags |= TOF_SACKPERM;
else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
(tp->t_flags & TF_SACK_PERMIT) &&
tp->rcv_numsacks > 0) {
to.to_flags |= TOF_SACK;
to.to_nsacks = tp->rcv_numsacks;
to.to_sacks = (u_char *)tp->sackblks;
}
}
#ifdef TCP_SIGNATURE
/* TCP-MD5 (RFC2385). */
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif /* TCP_SIGNATURE */
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
}
#ifdef INET6
if (isipv6)
ipoptlen = ip6_optlen(tp->t_inpcb);
else
#endif
if (tp->t_inpcb->inp_options)
ipoptlen = tp->t_inpcb->inp_options->m_len -
offsetof(struct ipoption, ipopt_list);
else
ipoptlen = 0;
#ifdef IPSEC
ipoptlen += ipsec_optlen;
#endif
/*
* Adjust data length if insertion of options will
* bump the packet length beyond the t_maxopd length.
* Clear the FIN bit because we cut off the tail of
* the segment.
*
* When doing TSO limit a burst to TCP_MAXWIN minus the
* IP, TCP and Options length to keep ip->ip_len from
* overflowing. Prevent the last segment from being
* fractional, thus making them all equal-sized, and set
* the flag to continue sending. TSO is disabled when
* IP options or IPSEC are present.
*/
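/*
 * Illustrative example (assumed values): with TCP_MAXWIN = 65535,
 * hdrlen + optlen = 72 and t_maxopd - optlen = 1448, a TSO burst is
 * clamped to 65463 and then rounded down to 45 * 1448 = 65160 bytes,
 * so no fractional trailing segment is generated.
 */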
if (len + optlen + ipoptlen > tp->t_maxopd) {
flags &= ~TH_FIN;
if (tso) {
if (len > TCP_MAXWIN - hdrlen - optlen) {
len = TCP_MAXWIN - hdrlen - optlen;
len = len - (len % (tp->t_maxopd - optlen));
sendalot = 1;
} else if (tp->t_flags & TF_NEEDFIN)
sendalot = 1;
} else {
len = tp->t_maxopd - optlen - ipoptlen;
sendalot = 1;
}
}
/*#ifdef DIAGNOSTIC*/
#ifdef INET6
if (max_linkhdr + hdrlen > MCLBYTES)
#else
if (max_linkhdr + hdrlen > MHLEN)
#endif
panic("tcphdr too big");
/*#endif*/
/*
* This KASSERT is here to catch edge cases at a well-defined place.
* Before, those had triggered (random) panic conditions further down.
*/
KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
struct mbuf *mb;
u_int moff;
if ((tp->t_flags & TF_FORCEDATA) && len == 1)
V_tcpstat.tcps_sndprobe++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
V_tcpstat.tcps_sndrexmitpack++;
V_tcpstat.tcps_sndrexmitbyte += len;
} else {
V_tcpstat.tcps_sndpack++;
V_tcpstat.tcps_sndbyte += len;
}
#ifdef notyet
if ((m = m_copypack(so->so_snd.sb_mb, off,
(int)len, max_linkhdr + hdrlen)) == 0) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOBUFS;
goto out;
}
/*
* m_copypack left space for our hdr; use it.
*/
m->m_len += hdrlen;
m->m_data -= hdrlen;
#else
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
error = ENOBUFS;
goto out;
}
#ifdef INET6
if (MHLEN < hdrlen + max_linkhdr) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
SOCKBUF_UNLOCK(&so->so_snd);
m_freem(m);
error = ENOBUFS;
goto out;
}
}
#endif
m->m_data += max_linkhdr;
m->m_len = hdrlen;
/*
* Start the m_copy functions from the closest mbuf
* to the offset in the socket buffer chain.
*/
mb = sbsndptr(&so->so_snd, off, len, &moff);
if (len <= MHLEN - hdrlen - max_linkhdr) {
m_copydata(mb, moff, (int)len,
mtod(m, caddr_t) + hdrlen);
m->m_len += len;
} else {
m->m_next = m_copy(mb, moff, (int)len);
if (m->m_next == NULL) {
SOCKBUF_UNLOCK(&so->so_snd);
(void) m_free(m);
error = ENOBUFS;
goto out;
}
}
#endif
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (off + len == so->so_snd.sb_cc)
flags |= TH_PUSH;
SOCKBUF_UNLOCK(&so->so_snd);
} else {
SOCKBUF_UNLOCK(&so->so_snd);
if (tp->t_flags & TF_ACKNOW)
V_tcpstat.tcps_sndacks++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
V_tcpstat.tcps_sndctrl++;
else if (SEQ_GT(tp->snd_up, tp->snd_una))
V_tcpstat.tcps_sndurg++;
else
V_tcpstat.tcps_sndwinup++;
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == NULL) {
error = ENOBUFS;
goto out;
}
#ifdef INET6
if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
MHLEN >= hdrlen) {
MH_ALIGN(m, hdrlen);
} else
#endif
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
mac_inpcb_create_mbuf(tp->t_inpcb, m);
#endif
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tcpip_fillheaders(tp->t_inpcb, ip6, th);
} else
#endif /* INET6 */
{
ip = mtod(m, struct ip *);
ipov = (struct ipovly *)ip;
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(tp->t_inpcb, ip, th);
}
/*
* Fill in fields, remembering maximum advertised
* window for use in delaying messages about window sizes.
* If resending a FIN, be sure not to use a new sequence number.
*/
if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
tp->snd_nxt == tp->snd_max)
tp->snd_nxt--;
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
if (tp->t_rxtshift >= 1) {
if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
flags |= TH_ECE|TH_CWR;
} else
flags |= TH_ECE|TH_CWR;
}
if (tp->t_state == TCPS_ESTABLISHED &&
(tp->t_flags & TF_ECN_PERMIT)) {
/*
* If the peer has ECN, mark data packets with
* ECN capable transmission (ECT).
* Ignore pure ack packets, retransmissions and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!((tp->t_flags & TF_FORCEDATA) && len == 1)) {
#ifdef INET6
if (isipv6)
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
else
#endif
ip->ip_tos |= IPTOS_ECN_ECT0;
V_tcpstat.tcps_ecn_ect0++;
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE)
flags |= TH_ECE;
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
if (sack_rxmit == 0) {
if (len || (flags & (TH_SYN|TH_FIN)) ||
tcp_timer_active(tp, TT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
} else {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
tp->sackhint.sack_bytes_rexmit += len;
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
recwin < (long)tp->t_maxseg)
recwin = 0;
if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
recwin = (long)TCP_MAXWIN << tp->rcv_scale;
/*
* According to RFC1323 the window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
* case is handled in syncache.
*/
if (flags & TH_SYN)
th->th_win = htons((u_short)
(min(sbspace(&so->so_rcv), TCP_MAXWIN)));
else
th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
/*
* Adjust the RXWIN0SENT flag - indicate that we have advertised
* a 0 window. This may cause the remote transmitter to stall. This
* flag tells soreceive() to disable delayed acknowledgements when
* draining the buffer. This can occur if the receiver is attempting
* to read more data than can be buffered prior to transmitting on
* the connection.
*/
if (recwin == 0)
tp->t_flags |= TF_RXWIN0SENT;
else
tp->t_flags &= ~TF_RXWIN0SENT;
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
int sigoff = to.to_signature - opt;
tcp_signature_compute(m, 0, len, optlen,
(u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND);
}
#endif
/*
* Put TCP length in extended header, and then
* checksum extended header and data.
*/
m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
#ifdef INET6
if (isipv6)
/*
* ip6_plen does not need to be filled in now; it will be
* filled in by ip6_output.
*/
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen + len);
else
#endif /* INET6 */
{
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
}
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
* XXX: Fixme: This is currently not the case for IPv6.
*/
if (tso) {
m->m_pkthdr.csum_flags = CSUM_TSO;
m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
}
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
*/
if (flags & (TH_SYN|TH_FIN)) {
if (flags & TH_SYN)
tp->snd_nxt++;
if (flags & TH_FIN) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTFIN;
}
}
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = ticks;
tp->t_rtseq = startseq;
V_tcpstat.tcps_segstimed++;
}
}
/*
* Set retransmit timer if not currently set,
* and not doing a pure ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
timer:
if (!tcp_timer_active(tp, TT_REXMT) &&
((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
(tp->snd_nxt != tp->snd_una))) {
if (tcp_timer_active(tp, TT_PERSIST)) {
tcp_timer_activate(tp, TT_PERSIST, 0);
tp->t_rxtshift = 0;
}
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
}
} else {
/*
* Persist case: update snd_max, but since we are in
* persist mode (no window) we do not update snd_nxt.
*/
int xlen = len;
if (flags & TH_SYN)
++xlen;
if (flags & TH_FIN) {
++xlen;
tp->t_flags |= TF_SENTFIN;
}
if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
}
#ifdef TCPDEBUG
/*
* Trace.
*/
if (so->so_options & SO_DEBUG) {
u_short save = 0;
#ifdef INET6
if (!isipv6)
#endif
{
save = ipov->ih_len;
ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */);
}
tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
#ifdef INET6
if (!isipv6)
#endif
ipov->ih_len = save;
}
#endif
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
/*
* m->m_pkthdr.len should have been set before checksum
* calculation, because in6_cksum() needs it.
*/
#ifdef INET6
if (isipv6) {
/*
* we separately set hoplimit for every segment, since the
* user might want to change the value via setsockopt.
* Also, desired default hop limit might be changed via
* Neighbor Discovery.
*/
ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/* TODO: IPv6 IP6TOS_ECT bit on */
error = ip6_output(m,
tp->t_inpcb->in6p_outputopts, NULL,
((so->so_options & SO_DONTROUTE) ?
IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb);
} else
#endif /* INET6 */
{
ip->ip_len = m->m_pkthdr.len;
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6))
ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
#endif /* INET6 */
/*
* If we do path MTU discovery, then we set DF on every packet.
* This might not be the best thing to do according to RFC3390
* Section 2. However, the TCP hostcache mitigates the problem
* so it affects only the first TCP connection with a host.
*/
if (V_path_mtu_discovery)
ip->ip_off |= IP_DF;
error = ip_output(m, tp->t_inpcb->inp_options, NULL,
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
}
if (error) {
/*
* We know that the packet was lost, so back out the
* sequence number advance, if any.
*
* If the error is EPERM the packet got blocked by the
* local firewall. Normally we should terminate the
* connection but the blocking may have been spurious
* due to a firewall reconfiguration cycle. So we treat
* it like a packet loss and let the retransmit timer and
* timeouts do their work over time.
* XXX: It is a POLA question whether calling tcp_drop right
* away would be the really correct behavior instead.
*/
if (((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) &&
((flags & TH_SYN) == 0) &&
(error != EPERM)) {
if (sack_rxmit) {
p->rxmit -= len;
tp->sackhint.sack_bytes_rexmit -= len;
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
("sackhint bytes rtx >= 0"));
} else
tp->snd_nxt -= len;
}
out:
SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
switch (error) {
case EPERM:
tp->t_softerror = error;
return (error);
case ENOBUFS:
if (!tcp_timer_active(tp, TT_REXMT) &&
!tcp_timer_active(tp, TT_PERSIST))
tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
tp->snd_cwnd = tp->t_maxseg;
return (0);
case EMSGSIZE:
/*
* For some reason the interface we used initially
* to send segments changed to another or lowered
* its MTU.
*
* tcp_mtudisc() will find out the new MTU and, as
* its last action, initiate retransmission, so it
* is important not to do so here.
*
* If TSO was active we either got an interface
* without TSO capabilities or TSO was turned off.
* Disable it for this connection too, and
* immediately retry with MSS-sized segments generated
* by this function.
*/
if (tso)
tp->t_flags &= ~TF_TSO;
tcp_mtudisc(tp->t_inpcb, 0);
return (0);
case EHOSTDOWN:
case EHOSTUNREACH:
case ENETDOWN:
case ENETUNREACH:
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
return (0);
}
/* FALLTHROUGH */
default:
return (error);
}
}
V_tcpstat.tcps_sndtotal++;
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + recwin;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
if (tcp_timer_active(tp, TT_DELACK))
tcp_timer_activate(tp, TT_DELACK, 0);
#if 0
/*
* This completely breaks TCP if newreno is turned on. What happens
* is that if delayed-acks are turned on on the receiver, this code
* on the transmitter effectively destroys the TCP window, forcing
* it to four packets (1.5Kx4 = 6K window).
*/
if (sendalot && (!V_tcp_do_newreno || --maxburst))
goto again;
#endif
if (sendalot)
goto again;
return (0);
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
int tt;
if (tcp_timer_active(tp, TT_REXMT))
panic("tcp_setpersist: retransmit pending");
/*
* Start/restart persistence timer.
*/
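/*
 * The base interval t above is derived from the smoothed RTT and the
 * RTT variance (both kept in scaled fixed-point), scaled by the
 * standard backoff table indexed by t_rxtshift, then clamped to the
 * range [TCPTV_PERSMIN, TCPTV_PERSMAX].
 */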
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX);
tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
/*
* Insert TCP options according to the supplied parameters at the
* location optp in a consistent way. Can handle unaligned destinations.
*
* The order of the option processing is crucial for optimal packing and
* alignment for the scarce option space.
*
* The optimal order for a SYN/SYN-ACK segment is:
* MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
* Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
*
* The SACK options should be last. SACK blocks consume 8*n+2 bytes.
* So a full-size SACK option is 34 bytes (with 4 SACK blocks).
* At minimum we need 10 bytes (to generate 1 SACK block). If both
* TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
* we only have 10 bytes for SACK options (40 - (12 + 18)).
*/
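/*
 * Worked layout example (illustrative) for a SYN carrying MSS, window
 * scale, SACK permitted and timestamps: MSS(4) at offset 0, NOP(1) at
 * 4, WScale(3) at 5, SACKPERM(2) at 8, TS(10) at 10 -- 20 bytes total,
 * already a multiple of 4, so no trailing EOL/PAD bytes are emitted.
 */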
int
tcp_addoptions(struct tcpopt *to, u_char *optp)
{
+ INIT_VNET_INET(curvnet);
u_int mask, optlen = 0;
for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
if ((to->to_flags & mask) != mask)
continue;
if (optlen == TCP_MAXOLEN)
break;
switch (to->to_flags & mask) {
case TOF_MSS:
while (optlen % 4) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
continue;
optlen += TCPOLEN_MAXSEG;
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
to->to_mss = htons(to->to_mss);
bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
optp += sizeof(to->to_mss);
break;
case TOF_SCALE:
while (!optlen || optlen % 2 != 1) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
continue;
optlen += TCPOLEN_WINDOW;
*optp++ = TCPOPT_WINDOW;
*optp++ = TCPOLEN_WINDOW;
*optp++ = to->to_wscale;
break;
case TOF_SACKPERM:
while (optlen % 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
continue;
optlen += TCPOLEN_SACK_PERMITTED;
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
break;
case TOF_TS:
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
continue;
optlen += TCPOLEN_TIMESTAMP;
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
to->to_tsval = htonl(to->to_tsval);
to->to_tsecr = htonl(to->to_tsecr);
bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
optp += sizeof(to->to_tsval);
bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
optp += sizeof(to->to_tsecr);
break;
case TOF_SIGNATURE:
{
int siglen = TCPOLEN_SIGNATURE - 2;
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE)
continue;
optlen += TCPOLEN_SIGNATURE;
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
to->to_signature = optp;
while (siglen--)
*optp++ = 0;
break;
}
case TOF_SACK:
{
int sackblks = 0;
struct sackblk *sack = (struct sackblk *)to->to_sacks;
tcp_seq sack_seq;
while (!optlen || optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
continue;
optlen += TCPOLEN_SACKHDR;
*optp++ = TCPOPT_SACK;
sackblks = min(to->to_nsacks,
(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
while (sackblks--) {
sack_seq = htonl(sack->start);
bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
optp += sizeof(sack_seq);
sack_seq = htonl(sack->end);
bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
optp += sizeof(sack_seq);
optlen += TCPOLEN_SACK;
sack++;
}
V_tcpstat.tcps_sack_send_blocks++;
break;
}
default:
panic("%s: unknown TCP option type", __func__);
break;
}
}
/* Terminate and pad TCP options to a 4 byte boundary. */
if (optlen % 4) {
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
/*
* According to RFC 793 (STD0007):
* "The content of the header beyond the End-of-Option option
* must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros."
*/
while (optlen % 4) {
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__));
return (optlen);
}
Index: head/sys/netinet/tcp_reass.c
===================================================================
--- head/sys/netinet/tcp_reass.c (revision 183549)
+++ head/sys/netinet/tcp_reass.c (revision 183550)
@@ -1,286 +1,289 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
static int tcp_reass_maxseg = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
- &tcp_reass_maxseg, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments,
+ CTLFLAG_RDTUN, tcp_reass_maxseg, 0,
"Global maximum number of TCP Segments in Reassembly Queue");
int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
- &tcp_reass_qsize, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments,
+ CTLFLAG_RD, tcp_reass_qsize, 0,
"Global number of TCP Segments currently in Reassembly Queue");
static int tcp_reass_maxqlen = 48;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
- &tcp_reass_maxqlen, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen,
+ CTLFLAG_RW, tcp_reass_maxqlen, 0,
"Maximum number of TCP Segments per individual Reassembly Queue");
static int tcp_reass_overflows = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
- &tcp_reass_overflows, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows,
+ CTLFLAG_RD, tcp_reass_overflows, 0,
"Global number of TCP Segment Reassembly Queue Overflows");
/* Initialize TCP reassembly queue */
static void
tcp_reass_zone_change(void *tag)
{
+ INIT_VNET_INET(curvnet);
V_tcp_reass_maxseg = nmbclusters / 16;
uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg);
}
uma_zone_t tcp_reass_zone;
void
tcp_reass_init(void)
{
+ INIT_VNET_INET(curvnet);
V_tcp_reass_maxseg = nmbclusters / 16;
TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
&V_tcp_reass_maxseg);
tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg);
EVENTHANDLER_REGISTER(nmbclusters_change,
tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct tseg_qent *q;
struct tseg_qent *p = NULL;
struct tseg_qent *nq;
struct tseg_qent *te = NULL;
struct socket *so = tp->t_inpcb->inp_socket;
int flags;
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* XXX: tcp_reass() is rather inefficient with its data structures
* and should be rewritten (see NetBSD for optimizations). While
* doing that it should move to its own file tcp_reass.c.
*/
/*
* Call with th==NULL after becoming established to
* force pre-ESTABLISHED data up to the user socket.
*/
if (th == NULL)
goto present;
/*
* Limit the number of segments in the reassembly queue to prevent
* holding on to too many segments (and thus running out of mbufs).
* Make sure to let through the missing segment that caused this
* queueing. Always keep one global queue entry spare to be able to
* process the missing segment.
*/
if (th->th_seq != tp->rcv_nxt &&
(V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg ||
tp->t_segqlen >= V_tcp_reass_maxqlen)) {
V_tcp_reass_overflows++;
V_tcpstat.tcps_rcvmemdrop++;
m_freem(m);
*tlenp = 0;
return (0);
}
/*
* Allocate a new queue entry. If we can't, or we hit the zone
* limit, just drop the packet.
*/
te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
if (te == NULL) {
V_tcpstat.tcps_rcvmemdrop++;
m_freem(m);
*tlenp = 0;
return (0);
}
tp->t_segqlen++;
V_tcp_reass_qsize++;
/*
* Find a segment which begins after this one does.
*/
LIST_FOREACH(q, &tp->t_segq, tqe_q) {
if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
break;
p = q;
}
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
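/*
 * Illustrative example (assumed sequence numbers): if the preceding
 * segment covers [100, 150) and the new segment starts at 130, then
 * i = 100 + 50 - 130 = 20, so 20 duplicate bytes are trimmed from the
 * front of the new segment below.
 */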
if (p != NULL) {
int i;
/* conversion to int (in i) handles seq wraparound */
i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
if (i > 0) {
if (i >= *tlenp) {
V_tcpstat.tcps_rcvduppack++;
V_tcpstat.tcps_rcvdupbyte += *tlenp;
m_freem(m);
uma_zfree(tcp_reass_zone, te);
tp->t_segqlen--;
V_tcp_reass_qsize--;
/*
* Try to present any queued data
* at the left window edge to the user.
* This is needed after the 3-WHS
* completes.
*/
goto present; /* ??? */
}
m_adj(m, i);
*tlenp -= i;
th->th_seq += i;
}
}
V_tcpstat.tcps_rcvoopack++;
V_tcpstat.tcps_rcvoobyte += *tlenp;
/*
* While we overlap succeeding segments, trim them or,
* if they are completely covered, dequeue them.
*/
while (q) {
int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
if (i <= 0)
break;
if (i < q->tqe_len) {
q->tqe_th->th_seq += i;
q->tqe_len -= i;
m_adj(q->tqe_m, i);
break;
}
nq = LIST_NEXT(q, tqe_q);
LIST_REMOVE(q, tqe_q);
m_freem(q->tqe_m);
uma_zfree(tcp_reass_zone, q);
tp->t_segqlen--;
V_tcp_reass_qsize--;
q = nq;
}
/* Insert the new segment queue entry into place. */
te->tqe_m = m;
te->tqe_th = th;
te->tqe_len = *tlenp;
if (p == NULL) {
LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
} else {
LIST_INSERT_AFTER(p, te, tqe_q);
}
present:
/*
* Present data to user, advancing rcv_nxt through
* completed sequence space.
*/
if (!TCPS_HAVEESTABLISHED(tp->t_state))
return (0);
q = LIST_FIRST(&tp->t_segq);
if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
return (0);
SOCKBUF_LOCK(&so->so_rcv);
do {
tp->rcv_nxt += q->tqe_len;
flags = q->tqe_th->th_flags & TH_FIN;
nq = LIST_NEXT(q, tqe_q);
LIST_REMOVE(q, tqe_q);
if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
m_freem(q->tqe_m);
else
sbappendstream_locked(&so->so_rcv, q->tqe_m);
uma_zfree(tcp_reass_zone, q);
tp->t_segqlen--;
V_tcp_reass_qsize--;
q = nq;
} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
ND6_HINT(tp);
sorwakeup_locked(so);
return (flags);
}
Index: head/sys/netinet/tcp_sack.c
===================================================================
--- head/sys/netinet/tcp_sack.c (revision 183549)
+++ head/sys/netinet/tcp_sack.c (revision 183550)
@@ -1,682 +1,684 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
*/
/*-
* @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#include <machine/in_cksum.h>
extern struct uma_zone *sack_hole_zone;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK");
int tcp_do_sack = 1;
-SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW,
- &tcp_do_sack, 0, "Enable/Disable TCP SACK support");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, enable,
+ CTLFLAG_RW, tcp_do_sack, 0, "Enable/Disable TCP SACK support");
TUNABLE_INT("net.inet.tcp.sack.enable", &tcp_do_sack);
static int tcp_sack_maxholes = 128;
-SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW,
- &tcp_sack_maxholes, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, maxholes,
+ CTLFLAG_RW, tcp_sack_maxholes, 0,
"Maximum number of TCP SACK holes allowed per connection");
static int tcp_sack_globalmaxholes = 65536;
-SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW,
- &tcp_sack_globalmaxholes, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalmaxholes,
+ CTLFLAG_RW, tcp_sack_globalmaxholes, 0,
"Global maximum number of TCP SACK holes");
static int tcp_sack_globalholes = 0;
-SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD,
- &tcp_sack_globalholes, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_sack, OID_AUTO, globalholes,
+ CTLFLAG_RD, tcp_sack_globalholes, 0,
"Global number of TCP SACK holes currently allocated");
/*
* This function is called upon receipt of new valid data (while not in
* header prediction mode), and it updates the ordered list of sacks.
*/
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
/*
* First reported block MUST be the most recent one. Subsequent
* blocks SHOULD be in the order in which they arrived at the
* receiver. These two conditions make the implementation fully
* compliant with RFC 2018.
*/
struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
int num_head, num_saved, i;
INP_WLOCK_ASSERT(tp->t_inpcb);
/* Check arguments. */
KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
/* SACK block for the received segment. */
head_blk.start = rcv_start;
head_blk.end = rcv_end;
/*
* Merge updated SACK blocks into head_blk, and save unchanged SACK
* blocks into saved_blks[]. num_saved will have the number of the
* saved SACK blocks.
*/
num_saved = 0;
for (i = 0; i < tp->rcv_numsacks; i++) {
tcp_seq start = tp->sackblks[i].start;
tcp_seq end = tp->sackblks[i].end;
if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
/*
* Discard this SACK block.
*/
} else if (SEQ_LEQ(head_blk.start, end) &&
SEQ_GEQ(head_blk.end, start)) {
/*
* Merge this SACK block into head_blk. This SACK
* block itself will be discarded.
*/
if (SEQ_GT(head_blk.start, start))
head_blk.start = start;
if (SEQ_LT(head_blk.end, end))
head_blk.end = end;
} else {
/*
* Save this SACK block.
*/
saved_blks[num_saved].start = start;
saved_blks[num_saved].end = end;
num_saved++;
}
}
/*
* Update SACK list in tp->sackblks[].
*/
num_head = 0;
if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
/*
* The received data segment is an out-of-order segment. Put
* head_blk at the top of SACK list.
*/
tp->sackblks[0] = head_blk;
num_head = 1;
/*
* If the number of saved SACK blocks exceeds its limit,
* discard the last SACK block.
*/
if (num_saved >= MAX_SACK_BLKS)
num_saved--;
}
if (num_saved > 0) {
/*
* Copy the saved SACK blocks back.
*/
bcopy(saved_blks, &tp->sackblks[num_head],
sizeof(struct sackblk) * num_saved);
}
/* Save the number of SACK blocks. */
tp->rcv_numsacks = num_head + num_saved;
}
/*
* Delete all receiver-side SACK information.
*/
void
tcp_clean_sackreport(struct tcpcb *tp)
{
int i;
INP_WLOCK_ASSERT(tp->t_inpcb);
tp->rcv_numsacks = 0;
for (i = 0; i < MAX_SACK_BLKS; i++)
tp->sackblks[i].start = tp->sackblks[i].end=0;
}
/*
* Allocate struct sackhole.
*/
static struct sackhole *
tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
{
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
struct sackhole *hole;
if (tp->snd_numholes >= V_tcp_sack_maxholes ||
V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
V_tcpstat.tcps_sack_sboverflow++;
return NULL;
}
hole = (struct sackhole *)uma_zalloc(sack_hole_zone, M_NOWAIT);
if (hole == NULL)
return NULL;
hole->start = start;
hole->end = end;
hole->rxmit = start;
tp->snd_numholes++;
V_tcp_sack_globalholes++;
return hole;
}
/*
* Free struct sackhole.
*/
static void
tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
{
+ INIT_VNET_INET(tp->t_vnet);
uma_zfree(sack_hole_zone, hole);
tp->snd_numholes--;
V_tcp_sack_globalholes--;
KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
}
/*
* Insert new SACK hole into scoreboard.
*/
static struct sackhole *
tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
struct sackhole *after)
{
struct sackhole *hole;
/* Allocate a new SACK hole. */
hole = tcp_sackhole_alloc(tp, start, end);
if (hole == NULL)
return NULL;
/* Insert the new SACK hole into scoreboard. */
if (after != NULL)
TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
else
TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
/* Update SACK hint. */
if (tp->sackhint.nexthole == NULL)
tp->sackhint.nexthole = hole;
return hole;
}
/*
* Remove SACK hole from scoreboard.
*/
static void
tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
{
/* Update SACK hint. */
if (tp->sackhint.nexthole == hole)
tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
/* Remove this SACK hole. */
TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
/* Free this SACK hole. */
tcp_sackhole_free(tp, hole);
}
/*
* Process cumulative ACK and the TCP SACK option to update the scoreboard.
* tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
* the sequence space).
*/
void
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
struct sackhole *cur, *temp;
struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
int i, j, num_sack_blks;
INP_WLOCK_ASSERT(tp->t_inpcb);
num_sack_blks = 0;
/*
* If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
* treat [SND.UNA, SEG.ACK) as if it is a SACK block.
*/
if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
sack_blocks[num_sack_blks].start = tp->snd_una;
sack_blocks[num_sack_blks++].end = th_ack;
}
/*
* Append received valid SACK blocks to sack_blocks[], but only if we
* received new blocks from the other side.
*/
if (to->to_flags & TOF_SACK) {
for (i = 0; i < to->to_nsacks; i++) {
bcopy((to->to_sacks + i * TCPOLEN_SACK),
&sack, sizeof(sack));
sack.start = ntohl(sack.start);
sack.end = ntohl(sack.end);
if (SEQ_GT(sack.end, sack.start) &&
SEQ_GT(sack.start, tp->snd_una) &&
SEQ_GT(sack.start, th_ack) &&
SEQ_LT(sack.start, tp->snd_max) &&
SEQ_GT(sack.end, tp->snd_una) &&
SEQ_LEQ(sack.end, tp->snd_max))
sack_blocks[num_sack_blks++] = sack;
}
}
/*
* Return if SND.UNA is not advanced and no valid SACK block is
* received.
*/
if (num_sack_blks == 0)
return;
/*
* Sort the SACK blocks so we can update the scoreboard with just one
* pass. The overhead of sorting up to 4+1 elements is less than
* making up to 4+1 passes over the scoreboard.
*/
for (i = 0; i < num_sack_blks; i++) {
for (j = i + 1; j < num_sack_blks; j++) {
if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
sack = sack_blocks[i];
sack_blocks[i] = sack_blocks[j];
sack_blocks[j] = sack;
}
}
}
if (TAILQ_EMPTY(&tp->snd_holes))
/*
* Empty scoreboard. Need to initialize snd_fack (it may be
* uninitialized or have a bogus value). Scoreboard holes
* (from the sack blocks received) are created later below
* (in the logic that adds holes to the tail of the
* scoreboard).
*/
tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
/*
* In the while-loop below, incoming SACK blocks (sack_blocks[]) and
* SACK holes (snd_holes) are traversed from their tails with just
* one pass in order to reduce the number of compares especially when
* the bandwidth-delay product is large.
*
* Note: Typically, in the first RTT of SACK recovery, the highest
* three or four SACK blocks with the same ack number are received.
* In the second RTT, if retransmitted data segments are not lost,
* the highest three or four SACK blocks with ack number advancing
* are received.
*/
sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */
if (SEQ_LT(tp->snd_fack, sblkp->start)) {
/*
* The highest SACK block is beyond fack. Append new SACK
* hole at the tail. If the second or later highest SACK
* blocks are also beyond the current fack, they will be
* inserted by way of hole splitting in the while-loop below.
*/
temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
if (temp != NULL) {
tp->snd_fack = sblkp->end;
/* Go to the previous sack block. */
sblkp--;
} else {
/*
* We failed to add a new hole based on the current
* sack block. Skip over all the sack blocks that
* fall completely to the right of snd_fack and
* proceed to trim the scoreboard based on the
* remaining sack blocks. This also trims the
* scoreboard for th_ack (which is sack_blocks[0]).
*/
while (sblkp >= sack_blocks &&
SEQ_LT(tp->snd_fack, sblkp->start))
sblkp--;
if (sblkp >= sack_blocks &&
SEQ_LT(tp->snd_fack, sblkp->end))
tp->snd_fack = sblkp->end;
}
} else if (SEQ_LT(tp->snd_fack, sblkp->end))
/* fack is advanced. */
tp->snd_fack = sblkp->end;
/* We must have at least one SACK hole in scoreboard. */
KASSERT(!TAILQ_EMPTY(&tp->snd_holes),
("SACK scoreboard must not be empty"));
cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
/*
* Since the incoming sack blocks are sorted, we can process them
* making one sweep of the scoreboard.
*/
while (sblkp >= sack_blocks && cur != NULL) {
if (SEQ_GEQ(sblkp->start, cur->end)) {
/*
* SACKs data beyond the current hole. Go to the
* previous sack block.
*/
sblkp--;
continue;
}
if (SEQ_LEQ(sblkp->end, cur->start)) {
/*
* SACKs data before the current hole. Go to the
* previous hole.
*/
cur = TAILQ_PREV(cur, sackhole_head, scblink);
continue;
}
tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
("sackhint bytes rtx >= 0"));
if (SEQ_LEQ(sblkp->start, cur->start)) {
/* Data acks at least the beginning of hole. */
if (SEQ_GEQ(sblkp->end, cur->end)) {
/* Acks entire hole, so delete hole. */
temp = cur;
cur = TAILQ_PREV(cur, sackhole_head, scblink);
tcp_sackhole_remove(tp, temp);
/*
* The sack block may ack all or part of the
* next hole too, so continue onto the next
* hole.
*/
continue;
} else {
/* Move start of hole forward. */
cur->start = sblkp->end;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
}
} else {
/* Data acks at least the end of hole. */
if (SEQ_GEQ(sblkp->end, cur->end)) {
/* Move end of hole backward. */
cur->end = sblkp->start;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
} else {
/*
* ACKs some data in middle of a hole; need
* to split current hole
*/
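/*
 * Illustrative example (assumed sequence numbers): a hole
 * covering [100, 200) hit by a SACK block [130, 160) is
 * split into [100, 130) (the current hole, trimmed
 * below) and [160, 200) (the newly inserted hole).
 */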
temp = tcp_sackhole_insert(tp, sblkp->end,
cur->end, cur);
if (temp != NULL) {
if (SEQ_GT(cur->rxmit, temp->rxmit)) {
temp->rxmit = cur->rxmit;
tp->sackhint.sack_bytes_rexmit
+= (temp->rxmit
- temp->start);
}
cur->end = sblkp->start;
cur->rxmit = SEQ_MIN(cur->rxmit,
cur->end);
}
}
}
tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
/*
* Testing sblkp->start against cur->start tells us whether
* we're done with the sack block or the sack hole.
* Accordingly, we advance one or the other.
*/
if (SEQ_LEQ(sblkp->start, cur->start))
cur = TAILQ_PREV(cur, sackhole_head, scblink);
else
sblkp--;
}
}
/*
* Free all SACK holes to clear the scoreboard.
*/
void
tcp_free_sackholes(struct tcpcb *tp)
{
struct sackhole *q;
INP_WLOCK_ASSERT(tp->t_inpcb);
while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
tcp_sackhole_remove(tp, q);
tp->sackhint.sack_bytes_rexmit = 0;
KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
KASSERT(tp->sackhint.nexthole == NULL,
("tp->sackhint.nexthole == NULL"));
}
/*
* Partial ack handling within a sack recovery episode. Keeping this very
* simple for now. When a partial ack is received, force snd_cwnd to a value
* that will allow the sender to transmit no more than 2 segments. If
* necessary, a better scheme can be adopted at a later point, but for now,
* the goal is to prevent the sender from bursting a large amount of data in
* the midst of sack recovery.
*/
void
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
int num_segs = 1;
INP_WLOCK_ASSERT(tp->t_inpcb);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
/* Send one or 2 segments based on how much new data was acked. */
if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2)
num_segs = 2;
tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
(tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
}
#if 0
/*
* Debug version of tcp_sack_output() that walks the scoreboard. Used for
* now to sanity check the hint.
*/
static struct sackhole *
tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *p;
INP_WLOCK_ASSERT(tp->t_inpcb);
*sack_bytes_rexmt = 0;
TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
if (SEQ_LT(p->rxmit, p->end)) {
if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
continue;
}
*sack_bytes_rexmt += (p->rxmit - p->start);
break;
}
*sack_bytes_rexmt += (p->rxmit - p->start);
}
return (p);
}
#endif
/*
* Returns the next hole to retransmit and the number of retransmitted bytes
* from the scoreboard. We store both the next hole and the number of
* retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
* reception). This avoids scoreboard traversals completely.
*
* The loop here will traverse *at most* one link. Here's the argument. For
* the loop to traverse more than 1 link before finding the next hole to
* retransmit, we would need to have at least 1 node following the current
* hint with (rxmit == end). But, for all holes following the current hint,
* (start == rxmit), since we have not yet retransmitted from them.
* Therefore, in order to traverse more than 1 link in the loop below, we
* need at least one node following the current hint with (start ==
* rxmit == end). But that can't happen: (start == end) means that all
* the data in that hole has been sacked, in which case the hole would
* have been removed from the scoreboard.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *hole = NULL;
INP_WLOCK_ASSERT(tp->t_inpcb);
*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
hole = tp->sackhint.nexthole;
if (hole == NULL || SEQ_LT(hole->rxmit, hole->end))
goto out;
while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
if (SEQ_LT(hole->rxmit, hole->end)) {
tp->sackhint.nexthole = hole;
break;
}
}
out:
return (hole);
}
/*
* After a timeout, the SACK list may be rebuilt. This SACK information
* should be used to avoid retransmitting SACKed data. This function
* traverses the SACK list to see if snd_nxt should be moved forward.
*/
void
tcp_sack_adjust(struct tcpcb *tp)
{
struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (cur == NULL)
return; /* No holes */
if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack))
return; /* We're already beyond any SACKed blocks */
/*-
* Two cases for which we want to advance snd_nxt:
* i) snd_nxt lies between end of one hole and beginning of another
* ii) snd_nxt lies between end of last hole and snd_fack
*/
while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
if (SEQ_GEQ(tp->snd_nxt, p->start))
cur = p;
else {
tp->snd_nxt = p->start;
return;
}
}
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
tp->snd_nxt = tp->snd_fack;
return;
}
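/*
 * Editor's sketch (hypothetical, not from the diff): the two snd_nxt
 * advancement cases of tcp_sack_adjust() above, replayed on an array
 * stand-in for the scoreboard. Sequence-number wraparound is ignored
 * for brevity; all values are made up.
 */
#include <stdio.h>
#include <stdint.h>

struct hole { uint32_t start, end; };

int
main(void)
{
    struct hole holes[] = { {100, 200}, {400, 500} };
    uint32_t snd_nxt = 250, snd_fack = 700;
    int i, n = 2;

    for (i = 0; i + 1 < n; i++) {
        if (snd_nxt < holes[i].end)         /* still inside a hole */
            goto done;
        if (snd_nxt < holes[i + 1].start) { /* case i): between holes */
            snd_nxt = holes[i + 1].start;
            goto done;
        }
    }
    if (snd_nxt >= holes[n - 1].end)        /* case ii): past last hole */
        snd_nxt = snd_fack;
done:
    printf("snd_nxt %u\n", snd_nxt);        /* prints 400 */
    return (0);
}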
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c (revision 183549)
+++ head/sys/netinet/tcp_subr.c (revision 183550)
@@ -1,2212 +1,2257 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/route.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_offload.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#include <sys/syslog.h>
#endif /*IPSEC*/
#include <machine/in_cksum.h>
#include <sys/md5.h>
#include <security/mac/mac_framework.h>
int tcp_mssdflt = TCP_MSS;
#ifdef INET6
int tcp_v6mssdflt = TCP6_MSS;
#endif
static int
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(TD_TO_VNET(curthread));
int error, new;
new = V_tcp_mssdflt;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
if (new < TCP_MINMSS)
error = EINVAL;
else
V_tcp_mssdflt = new;
}
return (error);
}
SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLTYPE_INT|CTLFLAG_RW,
&tcp_mssdflt, 0, &sysctl_net_inet_tcp_mss_check, "I",
"Default TCP Maximum Segment Size");
#ifdef INET6
static int
sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET6(TD_TO_VNET(curthread));
int error, new;
new = V_tcp_v6mssdflt;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
if (new < TCP_MINMSS)
error = EINVAL;
else
V_tcp_v6mssdflt = new;
}
return (error);
}
SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLTYPE_INT|CTLFLAG_RW,
&tcp_v6mssdflt, 0, &sysctl_net_inet_tcp_mss_v6_check, "I",
"Default TCP Maximum Segment Size for IPv6");
#endif
/*
* Minimum MSS we accept and use. This prevents DoS attacks where
* we are forced to a ridiculously low MSS like 20 and send hundreds
* of packets instead of one. The effect scales with the available
* bandwidth and quickly saturates the CPU and network interface
* with packet generation and sending. Set to zero to disable MINMSS
* checking. This setting prevents us from sending overly small packets.
*/
int tcp_minmss = TCP_MINMSS;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW,
- &tcp_minmss , 0, "Minmum TCP Maximum Segment Size");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, minmss,
+ CTLFLAG_RW, tcp_minmss, 0, "Minimum TCP Maximum Segment Size");
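/*
 * Editor's illustration (hypothetical values): why the tiny MSS the
 * comment above warns about is a DoS vector. The packet count for a
 * fixed payload scales inversely with the MSS, so an attacker-forced
 * MSS of 20 costs 73x the packets of the Ethernet-typical 1460.
 */
#include <stdio.h>

int
main(void)
{
    unsigned payload = 1460 * 100;  /* 100 full-size segments of data */

    printf("mss 1460: %u packets\n", (payload + 1459) / 1460);  /* 100 */
    printf("mss   20: %u packets\n", (payload + 19) / 20);      /* 7300 */
    return (0);
}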
int tcp_do_rfc1323 = 1;
-SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
- &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323,
+ CTLFLAG_RW, tcp_do_rfc1323, 0,
+ "Enable rfc1323 (high performance TCP) extensions");
static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
&tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
static int tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
&tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
static int do_tcpdrain = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW,
- &do_tcpdrain, 0,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
"Enable tcp_drain routine for extra help when low on mbufs");
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
- &tcbinfo.ipi_count, 0, "Number of active PCBs");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, pcbcount,
+ CTLFLAG_RD, tcbinfo.ipi_count, 0, "Number of active PCBs");
static int icmp_may_rst = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
- &icmp_may_rst, 0,
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, icmp_may_rst,
+ CTLFLAG_RW, icmp_may_rst, 0,
"Certain ICMP unreachable messages may abort connections in SYN_SENT");
static int tcp_isn_reseed_interval = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
- &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp, OID_AUTO, isn_reseed_interval,
+ CTLFLAG_RW, tcp_isn_reseed_interval, 0,
+ "Seconds between reseeding of ISN secret");
/*
* TCP bandwidth limiting sysctls. Note that the default lower bound of
* 1024 exists only for debugging. A good production default would be
* something like 6100.
*/
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
"TCP inflight data limiting");
static int tcp_inflight_enable = 1;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
- &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, enable,
+ CTLFLAG_RW, tcp_inflight_enable, 0,
+ "Enable automatic TCP inflight data limiting");
static int tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
&tcp_inflight_debug, 0, "Debug TCP inflight calculations");
static int tcp_inflight_rttthresh;
SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
&tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
"RTT threshold below which inflight will deactivate itself");
static int tcp_inflight_min = 6144;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
- &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, min,
+ CTLFLAG_RW, tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
- &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, max,
+ CTLFLAG_RW, tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
static int tcp_inflight_stab = 20;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
- &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_inflight, OID_AUTO, stab,
+ CTLFLAG_RW, tcp_inflight_stab, 0,
+ "Inflight Algorithm Stabilization 20 = 2 packets");
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_isn_tick(void *);
/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
* Note that this can be overridden by the kernel environment
* variable net.inet.tcp.tcbhashsize
*/
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 512
#endif
/*
* XXX
* Callouts should be moved into struct tcp directly. They are currently
* separate because the tcpcb structure is exported to userland for sysctl
* parsing purposes; userland does not know about callouts.
*/
struct tcpcb_mem {
struct tcpcb tcb;
struct tcp_timer tt;
};
static uma_zone_t tcpcb_zone;
MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
struct callout isn_callout;
static struct mtx isn_mtx;
#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
#define ISN_LOCK() mtx_lock(&isn_mtx)
#define ISN_UNLOCK() mtx_unlock(&isn_mtx)
/*
* TCP initialization.
*/
static void
tcp_zone_change(void *tag)
{
uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
uma_zone_set_max(tcpcb_zone, maxsockets);
tcp_tw_zone_change();
}
static int
tcp_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp = mem;
INP_LOCK_INIT(inp, "inp", "tcpinp");
return (0);
}
void
tcp_init(void)
{
+ INIT_VNET_INET(curvnet);
int hashsize = TCBHASHSIZE;
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
tcp_keepidle = TCPTV_KEEP_IDLE;
tcp_keepintvl = TCPTV_KEEPINTVL;
tcp_maxpersistidle = TCPTV_KEEP_IDLE;
tcp_msl = TCPTV_MSL;
tcp_rexmit_min = TCPTV_MIN;
if (tcp_rexmit_min < 1)
tcp_rexmit_min = 1;
tcp_rexmit_slop = TCPTV_CPU_VAR;
V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp");
LIST_INIT(&V_tcb);
V_tcbinfo.ipi_listhead = &V_tcb;
TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
if (!powerof2(hashsize)) {
printf("WARNING: TCB hash size not a power of 2\n");
hashsize = 512; /* safe default */
}
tcp_tcbhashsize = hashsize;
V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB,
&V_tcbinfo.ipi_hashmask);
V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB,
&V_tcbinfo.ipi_porthashmask);
V_tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
if (max_protohdr < TCP_MINPROTOHDR)
max_protohdr = TCP_MINPROTOHDR;
if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
panic("tcp_init");
#undef TCP_MINPROTOHDR
/*
* These have to be type stable for the benefit of the timers.
*/
tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(tcpcb_zone, maxsockets);
tcp_tw_init();
syncache_init();
tcp_hc_init();
tcp_reass_init();
ISN_LOCK_INIT();
callout_init(&isn_callout, CALLOUT_MPSAFE);
tcp_isn_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
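/*
 * Editor's sketch: why tcp_init() above insists the hash size be a
 * power of 2. A hashinit()-style table indexes with (hash & mask),
 * which equals hash % size only when size is a power of 2. The hash
 * value here is made up.
 */
#include <stdio.h>

int
main(void)
{
    unsigned hashsize = 512;        /* power of 2, like TCBHASHSIZE */
    unsigned mask = hashsize - 1;
    unsigned h = 987654321;

    if ((hashsize & (hashsize - 1)) != 0)   /* !powerof2() stand-in */
        hashsize = 512;                     /* safe default, as above */
    printf("bucket %u (mod %u)\n", h & mask, h % hashsize);
    return (0);
}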
void
tcp_fini(void *xtp)
{
callout_stop(&isn_callout);
}
/*
* Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
* tcp_template used to store this data in mbufs, but we now recopy it out
* of the tcpcb each time to conserve mbufs.
*/
void
tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
{
struct tcphdr *th = (struct tcphdr *)tcp_ptr;
INP_WLOCK_ASSERT(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV6) != 0) {
struct ip6_hdr *ip6;
ip6 = (struct ip6_hdr *)ip_ptr;
ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
(IPV6_VERSION & IPV6_VERSION_MASK);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = inp->in6p_laddr;
ip6->ip6_dst = inp->in6p_faddr;
} else
#endif
{
struct ip *ip;
ip = (struct ip *)ip_ptr;
ip->ip_v = IPVERSION;
ip->ip_hl = 5;
ip->ip_tos = inp->inp_ip_tos;
ip->ip_len = 0;
ip->ip_id = 0;
ip->ip_off = 0;
ip->ip_ttl = inp->inp_ip_ttl;
ip->ip_sum = 0;
ip->ip_p = IPPROTO_TCP;
ip->ip_src = inp->inp_laddr;
ip->ip_dst = inp->inp_faddr;
}
th->th_sport = inp->inp_lport;
th->th_dport = inp->inp_fport;
th->th_seq = 0;
th->th_ack = 0;
th->th_x2 = 0;
th->th_off = 5;
th->th_flags = 0;
th->th_win = 0;
th->th_urp = 0;
th->th_sum = 0; /* in_pseudo() is called later for ipv4 */
}
/*
* Create template to be used to send tcp packets on a connection.
* Allocates an mbuf and fills in a skeletal tcp/ip header. The only
* use for this function is in keepalives, which use tcp_respond.
*/
struct tcptemp *
tcpip_maketemplate(struct inpcb *inp)
{
struct tcptemp *t;
t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
if (t == NULL)
return (NULL);
tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
return (t);
}
/*
* Send a single message to the TCP at address specified by
* the given TCP/IP header. If m == NULL, then we make a copy
* of the tcpiphdr at ti and send directly to the addressed host.
* This is used to force keep alive messages out using the TCP
* template for a connection. If flags are given then we send
* a message back to the TCP which originated the segment ti,
* and discard the mbuf containing it and any other attached mbufs.
*
* In any case the ack and sequence number of the transmitted
* segment are as specified by the parameters.
*
* NOTE: If m != NULL, then ti must point to *inside* the mbuf.
*/
void
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
tcp_seq ack, tcp_seq seq, int flags)
{
+ INIT_VNET_INET(curvnet);
int tlen;
int win = 0;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
struct inpcb *inp;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
#ifdef INET6
isipv6 = ((struct ip *)ipgen)->ip_v == 6;
ip6 = ipgen;
#endif /* INET6 */
ip = ipgen;
if (tp != NULL) {
inp = tp->t_inpcb;
KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
INP_WLOCK_ASSERT(inp);
} else
inp = NULL;
if (tp != NULL) {
if (!(flags & TH_RST)) {
win = sbspace(&inp->inp_socket->so_rcv);
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
}
if (m == NULL) {
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL)
return;
tlen = 0;
m->m_data += max_linkhdr;
#ifdef INET6
if (isipv6) {
bcopy((caddr_t)ip6, mtod(m, caddr_t),
sizeof(struct ip6_hdr));
ip6 = mtod(m, struct ip6_hdr *);
nth = (struct tcphdr *)(ip6 + 1);
} else
#endif /* INET6 */
{
bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
ip = mtod(m, struct ip *);
nth = (struct tcphdr *)(ip + 1);
}
bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
flags = TH_ACK;
} else {
/*
* Reuse the mbuf.
* XXX MRT We inherit the FIB, which is lucky.
*/
m_freem(m->m_next);
m->m_next = NULL;
m->m_data = (caddr_t)ipgen;
/* m_len is set later */
tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
if (isipv6) {
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
nth = (struct tcphdr *)(ip6 + 1);
} else
#endif /* INET6 */
{
xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
nth = (struct tcphdr *)(ip + 1);
}
if (th != nth) {
/*
* This is usually the case when an extension header
* exists between the IPv6 header and the
* TCP header.
*/
nth->th_sport = th->th_sport;
nth->th_dport = th->th_dport;
}
xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
}
#ifdef INET6
if (isipv6) {
ip6->ip6_flow = 0;
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
tlen));
tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
} else
#endif
{
tlen += sizeof (struct tcpiphdr);
ip->ip_len = tlen;
ip->ip_ttl = V_ip_defttl;
if (V_path_mtu_discovery)
ip->ip_off |= IP_DF;
}
m->m_len = tlen;
m->m_pkthdr.len = tlen;
m->m_pkthdr.rcvif = NULL;
#ifdef MAC
if (inp != NULL) {
/*
* Packet is associated with a socket, so allow the
* label of the response to reflect the socket label.
*/
INP_WLOCK_ASSERT(inp);
mac_inpcb_create_mbuf(inp, m);
} else {
/*
* Packet is not associated with a socket, so possibly
* update the label in place.
*/
mac_netinet_tcp_reply(m);
}
#endif
nth->th_seq = htonl(seq);
nth->th_ack = htonl(ack);
nth->th_x2 = 0;
nth->th_off = sizeof (struct tcphdr) >> 2;
nth->th_flags = flags;
if (tp != NULL)
nth->th_win = htons((u_short) (win >> tp->rcv_scale));
else
nth->th_win = htons((u_short)win);
nth->th_urp = 0;
#ifdef INET6
if (isipv6) {
nth->th_sum = 0;
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
NULL, NULL);
} else
#endif /* INET6 */
{
nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
}
#ifdef TCPDEBUG
if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
if (isipv6)
(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
else
#endif /* INET6 */
(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
* Create a new TCP control block, making an
* empty reassembly queue and hooking it to the argument
* protocol control block. The `inp' parameter must have
* come from the zone allocator set up in tcp_init().
*/
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct tcpcb_mem *tm;
struct tcpcb *tp;
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
if (tm == NULL)
return (NULL);
tp = &tm->tcb;
tp->t_timers = &tm->tt;
/* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
isipv6 ? V_tcp_v6mssdflt :
#endif /* INET6 */
V_tcp_mssdflt;
/* Set up our timeouts. */
callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
if (V_tcp_do_rfc1323)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (V_tcp_do_sack)
tp->t_flags |= TF_SACK_PERMIT;
TAILQ_INIT(&tp->snd_holes);
tp->t_inpcb = inp; /* XXX */
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
* reasonable initial retransmit time.
*/
tp->t_srtt = TCPTV_SRTTBASE;
tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
tp->t_rttmin = tcp_rexmit_min;
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
tp->t_bw_rtttime = ticks;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
* which may match an IPv4-mapped IPv6 address.
*/
inp->inp_ip_ttl = V_ip_defttl;
inp->inp_ppcb = tp;
return (tp); /* XXX */
}
/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
*/
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
struct socket *so = tp->t_inpcb->inp_socket;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
(void) tcp_output_reset(tp);
V_tcpstat.tcps_drops++;
} else
V_tcpstat.tcps_conndrops++;
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
return (tcp_close(tp));
}
void
tcp_discardcb(struct tcpcb *tp)
{
+ INIT_VNET_INET(tp->t_vnet);
struct tseg_qent *q;
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
INP_WLOCK_ASSERT(inp);
/*
* Make sure that all of our timers are stopped before we
* delete the PCB.
*/
callout_stop(&tp->t_timers->tt_rexmt);
callout_stop(&tp->t_timers->tt_persist);
callout_stop(&tp->t_timers->tt_keep);
callout_stop(&tp->t_timers->tt_2msl);
callout_stop(&tp->t_timers->tt_delack);
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
* 'Enough' is arbitrarily defined as 4 rtt samples.
* 4 samples is enough for the srtt filter to converge
* to within a few percent of the correct value; fewer samples
* and we could save a bogus rtt. The danger is not high
* as tcp quickly recovers from everything.
* XXX: Works very well but needs some more statistics!
*/
if (tp->t_rttupdated >= 4) {
struct hc_metrics_lite metrics;
u_long ssthresh;
bzero(&metrics, sizeof(metrics));
/*
* Always update the ssthresh when the conditions below
* are satisfied. This gives us a better starting value
* for congestion avoidance on new connections.
* ssthresh is only set if packet loss occurred on a session.
*
* XXXRW: 'so' may be NULL here, and/or socket buffer may be
* being torn down. Ideally this code would not use 'so'.
*/
ssthresh = tp->snd_ssthresh;
if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
if (ssthresh < 2)
ssthresh = 2;
ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
#endif
sizeof (struct tcpiphdr)
#ifdef INET6
)
#endif
);
} else
ssthresh = 0;
metrics.rmx_ssthresh = ssthresh;
metrics.rmx_rtt = tp->t_srtt;
metrics.rmx_rttvar = tp->t_rttvar;
/* XXX: This wraps if the pipe is more than 4 Gbit per second */
metrics.rmx_bandwidth = tp->snd_bandwidth;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
tcp_hc_update(&inp->inp_inc, &metrics);
}
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
m_freem(q->tqe_m);
uma_zfree(tcp_reass_zone, q);
tp->t_segqlen--;
V_tcp_reass_qsize--;
}
/* Disconnect offload device, if any. */
tcp_offload_detach(tp);
tcp_free_sackholes(tp);
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
}
/*
* Attempt to close a TCP control block, marking it as dropped, and freeing
* the socket if we hold the only reference.
*/
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/* Notify any offload devices of listener close */
if (tp->t_state == TCPS_LISTEN)
tcp_offload_listen_close(tp);
in_pcbdrop(inp);
V_tcpstat.tcps_closed++;
KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
so = inp->inp_socket;
soisdisconnected(so);
if (inp->inp_vflag & INP_SOCKREF) {
KASSERT(so->so_state & SS_PROTOREF,
("tcp_close: !SS_PROTOREF"));
inp->inp_vflag &= ~INP_SOCKREF;
INP_WUNLOCK(inp);
ACCEPT_LOCK();
SOCK_LOCK(so);
so->so_state &= ~SS_PROTOREF;
sofree(so);
return (NULL);
}
return (tp);
}
void
tcp_drain(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
- if (do_tcpdrain) {
+ if (!do_tcpdrain)
+ return;
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET(vnet_iter);
struct inpcb *inpb;
struct tcpcb *tcpb;
struct tseg_qent *te;
/*
* Walk the tcpbs, if existing, and flush the reassembly queue,
* if there is one...
* XXX: The "Net/3" implementation doesn't imply that the TCP
* reassembly queue should be flushed, but in a situation
* where we're really low on mbufs, this is potentially
* useful.
*/
INP_INFO_RLOCK(&V_tcbinfo);
LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) {
if (inpb->inp_vflag & INP_TIMEWAIT)
continue;
INP_WLOCK(inpb);
if ((tcpb = intotcpcb(inpb)) != NULL) {
while ((te = LIST_FIRST(&tcpb->t_segq))
!= NULL) {
LIST_REMOVE(te, tqe_q);
m_freem(te->tqe_m);
uma_zfree(tcp_reass_zone, te);
tcpb->t_segqlen--;
V_tcp_reass_qsize--;
}
tcp_clean_sackreport(tcpb);
}
INP_WUNLOCK(inpb);
}
INP_INFO_RUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
}
/*
* Notify a tcp user of an asynchronous error;
* store error as soft error, but wake up user
* (for now, won't do anything until can select for soft error).
*
* Do not wake up user since there currently is no mechanism for
* reporting soft errors (yet - a kqueue filter may be added).
*/
static struct inpcb *
tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_vflag & INP_TIMEWAIT) ||
(inp->inp_vflag & INP_DROPPED))
return (inp);
tp = intotcpcb(inp);
KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
/*
* Ignore some errors if we are hooked up.
* If connection hasn't completed, has retransmitted several times,
* and receives a second error, give up now. This is better
* than waiting a long time to establish a connection that
* can never complete.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
return (inp);
} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
tp->t_softerror) {
tp = tcp_drop(tp, error);
if (tp != NULL)
return (inp);
else
return (NULL);
} else {
tp->t_softerror = error;
return (inp);
}
#if 0
wakeup( &so->so_timeo);
sorwakeup(so);
sowwakeup(so);
#endif
}
static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
int error, i, m, n, pcb_count;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
/*
* The process of preparing the TCB list is too time-consuming and
* resource-intensive to repeat twice on every request.
*/
if (req->oldptr == NULL) {
m = syncache_pcbcount();
n = V_tcbinfo.ipi_count;
req->oldidx = 2 * (sizeof xig)
+ ((m + n) + n/8) * sizeof(struct xtcpcb);
return (0);
}
if (req->newptr != NULL)
return (EPERM);
/*
* OK, now we're committed to doing something.
*/
INP_INFO_RLOCK(&V_tcbinfo);
gencnt = V_tcbinfo.ipi_gencnt;
n = V_tcbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_tcbinfo);
m = syncache_pcbcount();
error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+ (n + m) * sizeof(struct xtcpcb));
if (error != 0)
return (error);
xig.xig_len = sizeof xig;
xig.xig_count = n + m;
xig.xig_gen = gencnt;
xig.xig_sogen = so_gencnt;
error = SYSCTL_OUT(req, &xig, sizeof xig);
if (error)
return (error);
error = syncache_pcblist(req, m, &pcb_count);
if (error)
return (error);
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
if (inp_list == NULL)
return (ENOMEM);
INP_INFO_RLOCK(&V_tcbinfo);
for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) {
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt) {
/*
* XXX: This use of cr_cansee(), introduced with
* TCP state changes, is not quite right, but for
* now, better than nothing.
*/
if (inp->inp_vflag & INP_TIMEWAIT) {
if (intotw(inp) != NULL)
error = cr_cansee(req->td->td_ucred,
intotw(inp)->tw_cred);
else
error = EINVAL; /* Skip this inp. */
} else
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
inp_list[i++] = inp;
}
INP_RUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_tcbinfo);
n = i;
error = 0;
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt) {
struct xtcpcb xt;
void *inp_ppcb;
bzero(&xt, sizeof(xt));
xt.xt_len = sizeof xt;
/* XXX should avoid extra copy */
bcopy(inp, &xt.xt_inp, sizeof *inp);
inp_ppcb = inp->inp_ppcb;
if (inp_ppcb == NULL)
bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
else if (inp->inp_vflag & INP_TIMEWAIT) {
bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
xt.xt_tp.t_state = TCPS_TIME_WAIT;
} else
bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
if (inp->inp_socket != NULL)
sotoxsocket(inp->inp_socket, &xt.xt_socket);
else {
bzero(&xt.xt_socket, sizeof xt.xt_socket);
xt.xt_socket.xso_protocol = IPPROTO_TCP;
}
xt.xt_inp.inp_gencnt = inp->inp_gencnt;
INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xt, sizeof xt);
} else
INP_RUNLOCK(inp);
}
if (!error) {
/*
* Give the user an updated idea of our state.
* If the generation differs from what we told
* her before, she knows that something happened
* while we were processing this request, and it
* might be necessary to retry.
*/
INP_INFO_RLOCK(&V_tcbinfo);
xig.xig_gen = V_tcbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_tcbinfo.ipi_count + pcb_count;
INP_INFO_RUNLOCK(&V_tcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
free(inp_list, M_TEMP);
return (error);
}
SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
struct xucred xuc;
struct sockaddr_in addrs[2];
struct inpcb *inp;
int error;
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
return (error);
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
INP_INFO_RLOCK(&V_tcbinfo);
inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr,
addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
if (inp != NULL) {
INP_RLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
cru2x(inp->inp_socket->so_cred, &xuc);
INP_RUNLOCK(inp);
} else {
INP_INFO_RUNLOCK(&V_tcbinfo);
error = ENOENT;
}
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
}
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
#ifdef INET6
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct xucred xuc;
struct sockaddr_in6 addrs[2];
struct inpcb *inp;
int error, mapped = 0;
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
return (error);
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
(error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
return (error);
}
if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
mapped = 1;
else
return (EINVAL);
}
INP_INFO_RLOCK(&V_tcbinfo);
if (mapped == 1)
inp = in_pcblookup_hash(&V_tcbinfo,
*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
addrs[1].sin6_port,
*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
addrs[0].sin6_port,
0, NULL);
else
inp = in6_pcblookup_hash(&V_tcbinfo,
&addrs[1].sin6_addr, addrs[1].sin6_port,
&addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL);
if (inp != NULL) {
INP_RLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
cru2x(inp->inp_socket->so_cred, &xuc);
INP_RUNLOCK(inp);
} else {
INP_INFO_RUNLOCK(&V_tcbinfo);
error = ENOENT;
}
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
}
SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
#endif
void
tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = vip;
struct tcphdr *th;
struct in_addr faddr;
struct inpcb *inp;
struct tcpcb *tp;
struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
struct icmp *icp;
struct in_conninfo inc;
tcp_seq icmp_tcp_seq;
int mtu;
faddr = ((struct sockaddr_in *)sa)->sin_addr;
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
notify = tcp_drop_syn_sent;
/*
* Redirects don't need to be handled up here.
*/
else if (PRC_IS_REDIRECT(cmd))
return;
/*
* Source quench is deprecated.
*/
else if (cmd == PRC_QUENCH)
return;
/*
* Hostdead is ugly because it goes linearly through all PCBs.
* XXX: We never get this from ICMP, otherwise it makes an
* excellent DoS attack on machines with many connections.
*/
else if (cmd == PRC_HOSTDEAD)
ip = NULL;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
if (ip != NULL) {
icp = (struct icmp *)((caddr_t)ip
- offsetof(struct icmp, icmp_ip));
th = (struct tcphdr *)((caddr_t)ip
+ (ip->ip_hl << 2));
INP_INFO_WLOCK(&V_tcbinfo);
inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport,
ip->ip_src, th->th_sport, 0, NULL);
if (inp != NULL) {
INP_WLOCK(inp);
if (!(inp->inp_vflag & INP_TIMEWAIT) &&
!(inp->inp_vflag & INP_DROPPED) &&
!(inp->inp_socket == NULL)) {
icmp_tcp_seq = htonl(th->th_seq);
tp = intotcpcb(inp);
if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
if (cmd == PRC_MSGSIZE) {
/*
* MTU discovery:
* If we got a needfrag set the MTU
* in the route to the suggested new
* value (if given) and then notify.
*/
bzero(&inc, sizeof(inc));
inc.inc_flags = 0; /* IPv4 */
inc.inc_faddr = faddr;
inc.inc_fibnum =
inp->inp_inc.inc_fibnum;
mtu = ntohs(icp->icmp_nextmtu);
/*
* If no alternative MTU was
* proposed, try the next smaller
* one. ip->ip_len has already
* been swapped in icmp_input().
*/
if (!mtu)
mtu = ip_next_mtu(ip->ip_len,
1);
if (mtu < max(296, V_tcp_minmss
+ sizeof(struct tcpiphdr)))
mtu = 0;
if (!mtu)
mtu = V_tcp_mssdflt
+ sizeof(struct tcpiphdr);
/*
* Only cache the MTU if it
* is smaller than the interface
* or route MTU. tcp_mtudisc()
* will do the right thing by itself.
*/
if (mtu <= tcp_maxmtu(&inc, NULL))
tcp_hc_updatemtu(&inc, mtu);
}
inp = (*notify)(inp, inetctlerrmap[cmd]);
}
}
if (inp != NULL)
INP_WUNLOCK(inp);
} else {
inc.inc_fport = th->th_dport;
inc.inc_lport = th->th_sport;
inc.inc_faddr = faddr;
inc.inc_laddr = ip->ip_src;
#ifdef INET6
inc.inc_isipv6 = 0;
#endif
syncache_unreach(&inc, th);
}
INP_INFO_WUNLOCK(&V_tcbinfo);
} else
in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
#ifdef INET6
void
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
+ INIT_VNET_INET(curvnet);
struct tcphdr th;
struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
struct ip6_hdr *ip6;
struct mbuf *m;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
int off;
struct tcp_portonly {
u_int16_t th_sport;
u_int16_t th_dport;
} *thp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
else if (!PRC_IS_REDIRECT(cmd) &&
((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
return;
/* Source quench is deprecated. */
else if (cmd == PRC_QUENCH)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
off = 0; /* fool gcc */
sa6_src = &sa6_any;
}
if (ip6 != NULL) {
struct in_conninfo inc;
/*
* XXX: We assume that when IPV6 is non-NULL,
* M and OFF are valid.
*/
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(*thp))
return;
bzero(&th, sizeof(th));
m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
(struct sockaddr *)ip6cp->ip6c_src,
th.th_sport, cmd, NULL, notify);
inc.inc_fport = th.th_dport;
inc.inc_lport = th.th_sport;
inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
inc.inc_isipv6 = 1;
INP_INFO_WLOCK(&V_tcbinfo);
syncache_unreach(&inc, &th);
INP_INFO_WUNLOCK(&V_tcbinfo);
} else
in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
0, cmd, NULL, notify);
}
#endif /* INET6 */
/*
* Following is where TCP initial sequence number generation occurs.
*
* There are two places where we must use initial sequence numbers:
* 1. In SYN-ACK packets.
* 2. In SYN packets.
*
* All ISNs for SYN-ACK packets are generated by the syncache. See
* tcp_syncache.c for details.
*
* The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
* depends on this property. In addition, these ISNs should be
* unguessable so as to prevent connection hijacking. To satisfy
* the requirements of this situation, the algorithm outlined in
* RFC 1948 is used, with only small modifications.
*
* Implementation details:
*
* Time is based off the system timer, and is corrected so that it
* increases by one megabyte per second. This allows for proper
* recycling on high speed LANs while still leaving over an hour
* before rollover.
*
* As reading the *exact* system time is too expensive to be done
* whenever setting up a TCP connection, we increment the time
* offset in two ways. First, a small random positive increment
* is added to isn_offset for each connection that is set up.
* Second, the function tcp_isn_tick fires once per clock tick
* and increments isn_offset as necessary so that sequence numbers
* are incremented at approximately ISN_BYTES_PER_SECOND. The
* random positive increments serve only to ensure that the same
* exact sequence number is never sent out twice (as could otherwise
* happen when a port is recycled in less than the system tick
* interval).
*
* net.inet.tcp.isn_reseed_interval controls the number of seconds
* between seeding of isn_secret. This is normally set to zero,
* as reseeding should not be necessary.
*
* Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
* isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
* general, this means holding an exclusive (write) lock.
*/
#define ISN_BYTES_PER_SECOND 1048576
#define ISN_STATIC_INCREMENT 4096
#define ISN_RANDOM_INCREMENT (4096 - 1)
static u_char isn_secret[32];
static int isn_last_reseed;
static u_int32_t isn_offset, isn_offset_old;
static MD5_CTX isn_ctx;
tcp_seq
tcp_new_isn(struct tcpcb *tp)
{
+ INIT_VNET_INET(tp->t_vnet);
u_int32_t md5_buffer[4];
tcp_seq new_isn;
INP_WLOCK_ASSERT(tp->t_inpcb);
ISN_LOCK();
/* Seed if this is the first use, reseed if requested. */
if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
(((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
< (u_int)ticks))) {
read_random(&V_isn_secret, sizeof(V_isn_secret));
V_isn_last_reseed = ticks;
}
/* Compute the md5 hash and return the ISN. */
MD5Init(&V_isn_ctx);
MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
sizeof(struct in6_addr));
MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
sizeof(struct in6_addr));
} else
#endif
{
MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
sizeof(struct in_addr));
MD5Update(&V_isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
sizeof(struct in_addr));
}
MD5Update(&V_isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
MD5Final((u_char *) &md5_buffer, &V_isn_ctx);
new_isn = (tcp_seq) md5_buffer[0];
V_isn_offset += ISN_STATIC_INCREMENT +
(arc4random() & ISN_RANDOM_INCREMENT);
new_isn += V_isn_offset;
ISN_UNLOCK();
return (new_isn);
}
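/*
 * Editor's sketch of the RFC 1948 scheme used by tcp_new_isn() above,
 * in userland. Assumes OpenSSL's (deprecated but widely available)
 * MD5 API; link with -lcrypto. The connection 4-tuple and the secret
 * are made-up demo values; the kernel additionally adds a monotonic
 * per-tick offset on top of the hash.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <openssl/md5.h>

int
main(void)
{
    static uint8_t secret[32] = "not-actually-random-demo-secret";
    uint16_t lport = 12345, fport = 80;
    uint32_t laddr = 0x0a000001, faddr = 0xc0a80001;
    uint8_t digest[MD5_DIGEST_LENGTH];
    MD5_CTX ctx;
    uint32_t isn;

    MD5_Init(&ctx);
    MD5_Update(&ctx, &fport, sizeof(fport));    /* same order as above */
    MD5_Update(&ctx, &lport, sizeof(lport));
    MD5_Update(&ctx, &faddr, sizeof(faddr));
    MD5_Update(&ctx, &laddr, sizeof(laddr));
    MD5_Update(&ctx, secret, sizeof(secret));
    MD5_Final(digest, &ctx);
    memcpy(&isn, digest, sizeof(isn));
    /* The kernel then adds isn_offset (static + random increments). */
    printf("isn %u\n", isn);
    return (0);
}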
/*
* Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary
* to keep time flowing at a relatively constant rate. If the random
* increments have already pushed us past the projected offset, do nothing.
*/
static void
tcp_isn_tick(void *xtp)
{
+ VNET_ITERATOR_DECL(vnet_iter);
u_int32_t projected_offset;
ISN_LOCK();
- projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */
+ INIT_VNET_INET(curvnet);
+ projected_offset =
+ V_isn_offset_old + ISN_BYTES_PER_SECOND / 100;
- if (SEQ_GT(projected_offset, V_isn_offset))
- V_isn_offset = projected_offset;
+ if (SEQ_GT(projected_offset, V_isn_offset))
+ V_isn_offset = projected_offset;
- V_isn_offset_old = V_isn_offset;
+ V_isn_offset_old = V_isn_offset;
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL);
ISN_UNLOCK();
}
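/*
 * Editor's arithmetic check (made-up values): the tick handler above
 * advances the ISN offset toward ISN_BYTES_PER_SECOND (1 MB/s) in
 * hz/100 steps, but never moves it backwards past the random
 * per-connection increments.
 */
#include <stdio.h>
#include <stdint.h>

#define ISN_BYTES_PER_SECOND	1048576

int
main(void)
{
    uint32_t isn_offset = 2000000, isn_offset_old = 1995000;
    uint32_t projected = isn_offset_old + ISN_BYTES_PER_SECOND / 100;

    if ((int32_t)(projected - isn_offset) > 0)  /* SEQ_GT() stand-in */
        isn_offset = projected;
    isn_offset_old = isn_offset;
    printf("offset %u\n", isn_offset);          /* prints 2005485 */
    return (0);
}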
/*
* When a specific ICMP unreachable message is received and the
* connection state is SYN-SENT, drop the connection. This behavior
* is controlled by the icmp_may_rst sysctl.
*/
struct inpcb *
tcp_drop_syn_sent(struct inpcb *inp, int errno)
{
+#ifdef INVARIANTS
+ INIT_VNET_INET(inp->inp_vnet);
+#endif
struct tcpcb *tp;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if ((inp->inp_vflag & INP_TIMEWAIT) ||
(inp->inp_vflag & INP_DROPPED))
return (inp);
tp = intotcpcb(inp);
if (tp->t_state != TCPS_SYN_SENT)
return (inp);
tp = tcp_drop(tp, errno);
if (tp != NULL)
return (inp);
else
return (NULL);
}
/*
* When `need fragmentation' ICMP is received, update our idea of the MSS
* based on the new value in the route. Also nudge TCP to send something,
* since we know the packet we just sent was dropped.
* This duplicates some code in the tcp_mss() function in tcp_input.c.
*/
struct inpcb *
tcp_mtudisc(struct inpcb *inp, int errno)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct tcpcb *tp;
struct socket *so;
INP_WLOCK_ASSERT(inp);
if ((inp->inp_vflag & INP_TIMEWAIT) ||
(inp->inp_vflag & INP_DROPPED))
return (inp);
tp = intotcpcb(inp);
KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
tcp_mss_update(tp, -1, NULL);
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_snd);
/* If the mss is larger than the socket buffer, decrease the mss. */
if (so->so_snd.sb_hiwat < tp->t_maxseg)
tp->t_maxseg = so->so_snd.sb_hiwat;
SOCKBUF_UNLOCK(&so->so_snd);
V_tcpstat.tcps_mturesent++;
tp->t_rtttime = 0;
tp->snd_nxt = tp->snd_una;
tcp_free_sackholes(tp);
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_SACK_PERMIT)
EXIT_FASTRECOVERY(tp);
tcp_output_send(tp);
return (inp);
}
/*
* Look-up the routing entry to the peer of this inpcb. If no route
* is found and it cannot be allocated, then return NULL. This routine
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
u_long
tcp_maxmtu(struct in_conninfo *inc, int *flags)
{
struct route sro;
struct sockaddr_in *dst;
struct ifnet *ifp;
u_long maxmtu = 0;
KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
bzero(&sro, sizeof(sro));
if (inc->inc_faddr.s_addr != INADDR_ANY) {
dst = (struct sockaddr_in *)&sro.ro_dst;
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr = inc->inc_faddr;
in_rtalloc_ign(&sro, RTF_CLONING, inc->inc_fibnum);
}
if (sro.ro_rt != NULL) {
ifp = sro.ro_rt->rt_ifp;
if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
maxmtu = ifp->if_mtu;
else
maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
/* Report additional interface capabilities. */
if (flags != NULL) {
if (ifp->if_capenable & IFCAP_TSO4 &&
ifp->if_hwassist & CSUM_TSO)
*flags |= CSUM_TSO;
}
RTFREE(sro.ro_rt);
}
return (maxmtu);
}
#ifdef INET6
u_long
tcp_maxmtu6(struct in_conninfo *inc, int *flags)
{
struct route_in6 sro6;
struct ifnet *ifp;
u_long maxmtu = 0;
KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
bzero(&sro6, sizeof(sro6));
if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
sro6.ro_dst.sin6_family = AF_INET6;
sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
sro6.ro_dst.sin6_addr = inc->inc6_faddr;
rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
if (sro6.ro_rt != NULL) {
ifp = sro6.ro_rt->rt_ifp;
if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
else
maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
IN6_LINKMTU(sro6.ro_rt->rt_ifp));
/* Report additional interface capabilities. */
if (flags != NULL) {
if (ifp->if_capenable & IFCAP_TSO6 &&
ifp->if_hwassist & CSUM_TSO)
*flags |= CSUM_TSO;
}
RTFREE(sro6.ro_rt);
}
return (maxmtu);
}
#endif /* INET6 */
#ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
struct mbuf *m;
size_t hdrsiz;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
struct tcphdr *th;
if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
return (0);
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (!m)
return (0);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV6) != 0) {
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
m->m_pkthdr.len = m->m_len =
sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
tcpip_fillheaders(inp, ip6, th);
hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
} else
#endif /* INET6 */
{
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
tcpip_fillheaders(inp, ip, th);
hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
}
m_free(m);
return (hdrsiz);
}
#endif /* IPSEC */
/*
* TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
*
* This code attempts to calculate the bandwidth-delay product as a
* means of determining the optimal window size to maximize bandwidth,
* minimize RTT, and avoid the over-allocation of buffers on interfaces and
* routers. This code also does a fairly good job keeping RTTs in check
* across slow links like modems. We implement an algorithm which is very
* similar to (but not meant to be) TCP/Vegas. The code operates on the
* transmitter side of a TCP connection and so only affects the transmit
* side of the connection.
*
* BACKGROUND: TCP makes no provision for the management of buffer space
* at the end points or at the intermediate routers and switches. A TCP
* stream, whether using NewReno or not, will eventually buffer as
* many packets as it is able and the only reason this typically works is
* due to the fairly small default buffers made available for a connection
* (typically 16K or 32K). As machines use larger windows and/or window
* scaling it is now fairly easy for even a single TCP connection to blow out
* all available buffer space not only on the local interface, but on
* intermediate routers and switches as well. NewReno makes a misguided
* attempt to 'solve' this problem by waiting for an actual failure to occur,
* then backing off, then steadily increasing the window again until another
* failure occurs, ad infinitum. This results in terrible oscillation that
* is only made worse as network loads increase and the idea of intentionally
* blowing out network buffers is, frankly, a terrible way to manage network
* resources.
*
* It is far better to limit the transmit window prior to the failure
* condition being achieved. There are two general ways to do this: First
* you can 'scan' through different transmit window sizes and locate the
* point where the RTT stops increasing, indicating that you have filled the
* pipe, then scan backwards until you note that RTT stops decreasing, then
* repeat ad infinitum. This method works in principle but has severe
* implementation issues due to RTT variances, timer granularity, and
* instability in the algorithm which can lead to many false positives and
* create oscillations as well as interact badly with other TCP streams
* implementing the same algorithm.
*
* The second method is to limit the window to the bandwidth delay product
* of the link. This is the method we implement. RTT variances and our
* own manipulation of the congestion window, bwnd, can potentially
* destabilize the algorithm. For this reason we have to stabilize the
* elements used to calculate the window. We do this by using the minimum
* observed RTT, the long term average of the observed bandwidth, and
* by adding two segments worth of slop. It isn't perfect but it is able
* to react to changing conditions and gives us a very stable basis on
* which to extend the algorithm.
*/
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
+ INIT_VNET_INET(tp->t_vnet);
u_long bw;
u_long bwnd;
int save_ticks;
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
* If inflight_enable is disabled in the middle of a tcp connection,
* make sure snd_bwnd is effectively disabled.
*/
if (V_tcp_inflight_enable == 0 ||
tp->t_rttlow < V_tcp_inflight_rttthresh) {
tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_bandwidth = 0;
return;
}
/*
* Figure out the bandwidth. Due to the tick granularity this
* is a very rough number and it MUST be averaged over a fairly
* long period of time. XXX we need to take into account a link
* that is not using all available bandwidth, but for now our
* slop will ramp us up if this case occurs and the bandwidth later
* increases.
*
* Note: if ticks rollover 'bw' may wind up negative. We must
* effectively reset t_bw_rtttime for this case.
*/
save_ticks = ticks;
if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
return;
bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
(save_ticks - tp->t_bw_rtttime);
tp->t_bw_rtttime = save_ticks;
tp->t_bw_rtseq = ack_seq;
if (tp->t_bw_rtttime == 0 || (int)bw < 0)
return;
bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
tp->snd_bandwidth = bw;
/*
* Calculate the semi-static bandwidth delay product, plus two maximal
* segments. The additional slop puts us squarely in the sweet
* spot and also handles the bandwidth run-up case and stabilization.
* Without the slop we could be locking ourselves into a lower
* bandwidth.
*
* Situations Handled:
* (1) Prevents over-queueing of packets on LANs, especially on
* high speed LANs, allowing larger TCP buffers to be
* specified, and also does a good job preventing
* over-queueing of packets over choke points like modems
* (at least for the transmit side).
*
* (2) Is able to handle changing network loads (bandwidth
* drops so bwnd drops, bandwidth increases so bwnd
* increases).
*
* (3) Theoretically should stabilize in the face of multiple
* connections implementing the same algorithm (this may need
* a little work).
*
* (4) Stability value (defaults to 20 = 2 maximal packets) can
* be adjusted with a sysctl, but typically only needs changing
* on very slow connections. A value no smaller than 5
* should be used, but only reduce this default if you have
* no other choice.
*/
#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT
if (tcp_inflight_debug > 0) {
static int ltime;
if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
ltime = ticks;
printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
tp,
bw,
tp->t_rttbest,
tp->t_srtt,
bwnd
);
}
}
if ((long)bwnd < V_tcp_inflight_min)
bwnd = V_tcp_inflight_min;
if (bwnd > V_tcp_inflight_max)
bwnd = V_tcp_inflight_max;
if ((long)bwnd < tp->t_maxseg * 2)
bwnd = tp->t_maxseg * 2;
tp->snd_bwnd = bwnd;
}
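/*
 * Editor's sketch (hypothetical numbers): the two key computations in
 * tcp_xmit_bandwidth_limit() above, the 15/16 exponential average of
 * the bandwidth and the bandwidth-delay-product window plus two
 * segments of slop (stab 20 == 2.0 packets).
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    int64_t bw = 1000000;       /* long-term average, bytes/sec */
    int64_t sample = 1200000;   /* new bandwidth sample */
    int64_t rtt_num = 50, rtt_den = 1000;   /* 50 ms RTT */
    int64_t maxseg = 1460, stab = 20;
    int64_t bwnd;

    bw = (bw * 15 + sample) >> 4;   /* same EMA as the kernel code */
    bwnd = bw * rtt_num / rtt_den + stab * maxseg / 10;
    printf("bw %lld bwnd %lld\n", (long long)bw, (long long)bwnd);
    return (0);                     /* bw 1012500, bwnd 53545 */
}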
#ifdef TCP_SIGNATURE
/*
* Callback function invoked by m_apply() to digest TCP segment data
* contained within an mbuf chain.
*/
static int
tcp_signature_apply(void *fstate, void *data, u_int len)
{
MD5Update(fstate, (u_char *)data, len);
return (0);
}
/*
* Compute TCP-MD5 hash of a TCP segment. (RFC2385)
*
* Parameters:
* m pointer to head of mbuf chain
* _unused
* len length of TCP segment data, excluding options
* optlen length of TCP segment options
* buf pointer to storage for computed MD5 digest
* direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
*
* We do this over ip, tcphdr, segment data, and the key in the SADB.
* When called from tcp_input(), we can be sure that th_sum has been
* zeroed out and verified already.
*
* Return 0 if successful, otherwise return -1.
*
* XXX The key is retrieved from the system's PF_KEY SADB, by keying a
* search with the destination IP address, and a 'magic SPI' to be
* determined by the application. This is hardcoded elsewhere to 1179
* right now. Another branch of this code exists which uses the SPD to
* specify per-application flows but it is unstable.
*/
int
tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen,
u_char *buf, u_int direction)
{
union sockaddr_union dst;
struct ippseudo ippseudo;
MD5_CTX ctx;
int doff;
struct ip *ip;
struct ipovly *ipovly;
struct secasvar *sav;
struct tcphdr *th;
#ifdef INET6
struct ip6_hdr *ip6;
struct in6_addr in6;
char ip6buf[INET6_ADDRSTRLEN];
uint32_t plen;
uint16_t nhdr;
#endif
u_short savecsum;
KASSERT(m != NULL, ("NULL mbuf chain"));
KASSERT(buf != NULL, ("NULL signature pointer"));
/* Extract the destination from the IP header in the mbuf. */
bzero(&dst, sizeof(union sockaddr_union));
ip = mtod(m, struct ip *);
#ifdef INET6
ip6 = NULL; /* Make the compiler happy. */
#endif
switch (ip->ip_v) {
case IPVERSION:
dst.sa.sa_len = sizeof(struct sockaddr_in);
dst.sa.sa_family = AF_INET;
dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
ip->ip_src : ip->ip_dst;
break;
#ifdef INET6
case (IPV6_VERSION >> 4):
ip6 = mtod(m, struct ip6_hdr *);
dst.sa.sa_len = sizeof(struct sockaddr_in6);
dst.sa.sa_family = AF_INET6;
dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ?
ip6->ip6_src : ip6->ip6_dst;
break;
#endif
default:
return (EINVAL);
/* NOTREACHED */
break;
}
/* Look up an SADB entry which matches the address of the peer. */
sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
if (sav == NULL) {
ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__,
(ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) :
#ifdef INET6
(ip->ip_v == (IPV6_VERSION >> 4)) ?
ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) :
#endif
"(unsupported)"));
return (EINVAL);
}
MD5Init(&ctx);
/*
* Step 1: Update MD5 hash with IP(v6) pseudo-header.
*
* XXX The ippseudo header MUST be digested in network byte order,
* or else we'll fail the regression test. Assume all fields we've
* been doing arithmetic on have been in host byte order.
* XXX One cannot depend on ipovly->ih_len here. When called from
* tcp_output(), the underlying ip_len member has not yet been set.
*/
switch (ip->ip_v) {
case IPVERSION:
ipovly = (struct ipovly *)ip;
ippseudo.ippseudo_src = ipovly->ih_src;
ippseudo.ippseudo_dst = ipovly->ih_dst;
ippseudo.ippseudo_pad = 0;
ippseudo.ippseudo_p = IPPROTO_TCP;
ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) +
optlen);
MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));
th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip));
doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen;
break;
#ifdef INET6
/*
* RFC 2385, 2.0 Proposal
* For IPv6, the pseudo-header is as described in RFC 2460, namely the
* 128-bit source IPv6 address, 128-bit destination IPv6 address, zero-
* extended next header value (to form 32 bits), and 32-bit segment
* length.
* Note: Upper-Layer Packet Length comes before Next Header.
*/
case (IPV6_VERSION >> 4):
in6 = ip6->ip6_src;
in6_clearscope(&in6);
MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
in6 = ip6->ip6_dst;
in6_clearscope(&in6);
MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr));
plen = htonl(len + sizeof(struct tcphdr) + optlen);
MD5Update(&ctx, (char *)&plen, sizeof(uint32_t));
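/*
 * Zero-extend the next header value to 32 bits, per the RFC 2460
 * pseudo-header: digest three zero bytes, then a single byte
 * holding IPPROTO_TCP.
 */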
nhdr = 0;
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
nhdr = IPPROTO_TCP;
MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t));
th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr));
doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen;
break;
#endif
default:
return (EINVAL);
/* NOTREACHED */
break;
}
/*
* Step 2: Update MD5 hash with TCP header, excluding options.
* The TCP checksum must be set to zero.
*/
savecsum = th->th_sum;
th->th_sum = 0;
MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
th->th_sum = savecsum;
/*
* Step 3: Update MD5 hash with TCP segment data.
* Use m_apply() to avoid an early m_pullup().
*/
if (len > 0)
m_apply(m, doff, len, tcp_signature_apply, &ctx);
/*
* Step 4: Update MD5 hash with shared secret.
*/
MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
MD5Final(buf, &ctx);
key_sa_recordxfer(sav, m);
KEY_FREESAV(&sav);
return (0);
}
#endif /* TCP_SIGNATURE */
static int
sysctl_drop(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
+#ifdef INET6
+ INIT_VNET_INET6(curvnet);
+#endif
/* addrs[0] is a foreign socket, addrs[1] is a local one. */
struct sockaddr_storage addrs[2];
struct inpcb *inp;
struct tcpcb *tp;
struct tcptw *tw;
struct sockaddr_in *fin, *lin;
#ifdef INET6
struct sockaddr_in6 *fin6, *lin6;
#endif
int error;
inp = NULL;
fin = lin = NULL;
#ifdef INET6
fin6 = lin6 = NULL;
#endif
error = 0;
if (req->oldptr != NULL || req->oldlen != 0)
return (EINVAL);
if (req->newptr == NULL)
return (EPERM);
if (req->newlen < sizeof(addrs))
return (ENOMEM);
error = SYSCTL_IN(req, &addrs, sizeof(addrs));
if (error)
return (error);
switch (addrs[0].ss_family) {
#ifdef INET6
case AF_INET6:
fin6 = (struct sockaddr_in6 *)&addrs[0];
lin6 = (struct sockaddr_in6 *)&addrs[1];
if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
lin6->sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
return (EINVAL);
in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
fin = (struct sockaddr_in *)&addrs[0];
lin = (struct sockaddr_in *)&addrs[1];
break;
}
error = sa6_embedscope(fin6, V_ip6_use_defzone);
if (error)
return (error);
error = sa6_embedscope(lin6, V_ip6_use_defzone);
if (error)
return (error);
break;
#endif
case AF_INET:
fin = (struct sockaddr_in *)&addrs[0];
lin = (struct sockaddr_in *)&addrs[1];
if (fin->sin_len != sizeof(struct sockaddr_in) ||
lin->sin_len != sizeof(struct sockaddr_in))
return (EINVAL);
break;
default:
return (EINVAL);
}
INP_INFO_WLOCK(&V_tcbinfo);
switch (addrs[0].ss_family) {
#ifdef INET6
case AF_INET6:
inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr,
fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, NULL);
break;
#endif
case AF_INET:
inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr,
fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL);
break;
}
if (inp != NULL) {
INP_WLOCK(inp);
if (inp->inp_vflag & INP_TIMEWAIT) {
/*
* XXXRW: There currently exists a state where an
* inpcb is present, but its timewait state has been
* discarded. For now, don't allow dropping of this
* type of inpcb.
*/
tw = intotw(inp);
if (tw != NULL)
tcp_twclose(tw, 0);
else
INP_WUNLOCK(inp);
} else if (!(inp->inp_vflag & INP_DROPPED) &&
!(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
tp = intotcpcb(inp);
tp = tcp_drop(tp, ECONNABORTED);
if (tp != NULL)
INP_WUNLOCK(inp);
} else
INP_WUNLOCK(inp);
} else
error = ESRCH;
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
0, sysctl_drop, "", "Drop TCP connection");
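/*
 * Illustrative userland sketch (an assumption, not part of this file):
 * the handler above expects two sockaddrs written to net.inet.tcp.drop,
 * foreign endpoint first, local endpoint second, which is roughly what
 * tcpdrop(8) does:
 *
 *	struct sockaddr_storage addrs[2];
 *
 *	// fill addrs[0] (foreign) and addrs[1] (local), with the
 *	// sa_len/sa_family fields set as sysctl_drop() validates
 *	if (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
 *	    addrs, sizeof(addrs)) == -1)
 *		warn("sysctlbyname");
 */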
/*
* Generate a standardized TCP log line for use throughout the
* tcp subsystem. Memory allocation is done with M_NOWAIT to
* allow use in the interrupt context.
*
* NB: The caller MUST free(s, M_TCPLOG) the returned string.
* NB: The function may return NULL if memory allocation failed.
*
* Due to header inclusion and ordering limitations the struct ip
* and ip6_hdr pointers have to be passed as void pointers.
*/
char *
tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr,
const void *ip6hdr)
{
char *s, *sp;
size_t size;
struct ip *ip;
#ifdef INET6
const struct ip6_hdr *ip6;
ip6 = (const struct ip6_hdr *)ip6hdr;
#endif /* INET6 */
ip = (struct ip *)ip4hdr;
/*
* The log line looks like this:
* "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
*/
size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
sizeof(PRINT_TH_FLAGS) + 1 +
#ifdef INET6
2 * INET6_ADDRSTRLEN;
#else
2 * INET_ADDRSTRLEN;
#endif /* INET6 */
/* Is logging enabled? */
if (tcp_log_debug == 0 && tcp_log_in_vain == 0)
return (NULL);
s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
if (s == NULL)
return (NULL);
strcat(s, "TCP: [");
sp = s + strlen(s);
if (inc && inc->inc_isipv6 == 0) {
inet_ntoa_r(inc->inc_faddr, sp);
sp = s + strlen(s);
sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
sp = s + strlen(s);
inet_ntoa_r(inc->inc_laddr, sp);
sp = s + strlen(s);
sprintf(sp, "]:%i", ntohs(inc->inc_lport));
#ifdef INET6
} else if (inc) {
ip6_sprintf(sp, &inc->inc6_faddr);
sp = s + strlen(s);
sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
sp = s + strlen(s);
ip6_sprintf(sp, &inc->inc6_laddr);
sp = s + strlen(s);
sprintf(sp, "]:%i", ntohs(inc->inc_lport));
} else if (ip6 && th) {
ip6_sprintf(sp, &ip6->ip6_src);
sp = s + strlen(s);
sprintf(sp, "]:%i to [", ntohs(th->th_sport));
sp = s + strlen(s);
ip6_sprintf(sp, &ip6->ip6_dst);
sp = s + strlen(s);
sprintf(sp, "]:%i", ntohs(th->th_dport));
#endif /* INET6 */
} else if (ip && th) {
inet_ntoa_r(ip->ip_src, sp);
sp = s + strlen(s);
sprintf(sp, "]:%i to [", ntohs(th->th_sport));
sp = s + strlen(s);
inet_ntoa_r(ip->ip_dst, sp);
sp = s + strlen(s);
sprintf(sp, "]:%i", ntohs(th->th_dport));
} else {
free(s, M_TCPLOG);
return (NULL);
}
sp = s + strlen(s);
if (th)
sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS);
if (*(s + size - 1) != '\0')
panic("%s: string too long", __func__);
return (s);
}
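/*
 * Typical caller pattern (as used by the syncache code below); the
 * returned string must be freed with M_TCPLOG:
 *
 *	char *s;
 *	if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
 *		log(LOG_DEBUG, "%s; %s: ...\n", s, __func__);
 *		free(s, M_TCPLOG);
 *	}
 */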
Index: head/sys/netinet/tcp_syncache.c
===================================================================
--- head/sys/netinet/tcp_syncache.c (revision 183549)
+++ head/sys/netinet/tcp_syncache.c (revision 183550)
@@ -1,1736 +1,1761 @@
/*-
* Copyright (c) 2001 McAfee, Inc.
* Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jonathan Lemon
* and McAfee Research, the Security Research Division of McAfee, Inc. under
* DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
* DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/ucred.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_offload.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#endif /*IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
static int tcp_syncookies = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
&tcp_syncookies, 0,
"Use TCP SYN cookies if the syncache overflows");
static int tcp_syncookiesonly = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
&tcp_syncookiesonly, 0,
"Use only TCP SYN cookies");
#ifdef TCP_OFFLOAD_DISABLE
#define TOEPCB_ISSET(sc) (0)
#else
#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL)
#endif
static void syncache_drop(struct syncache *, struct syncache_head *);
static void syncache_free(struct syncache *);
static void syncache_insert(struct syncache *, struct syncache_head *);
struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
static int syncache_respond(struct syncache *);
static struct socket *syncache_socket(struct syncache *, struct socket *,
struct mbuf *m);
static void syncache_timeout(struct syncache *sc, struct syncache_head *sch,
int docallout);
static void syncache_timer(void *);
static void syncookie_generate(struct syncache_head *, struct syncache *,
u_int32_t *);
static struct syncache
*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
struct syncache *, struct tcpopt *, struct tcphdr *,
struct socket *);
/*
* Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
* Three retransmits correspond to a timeout of 3 * (1 + 2 + 4 + 8) == 45
* seconds (TCPTV_RTOBASE of 3 seconds, doubled on each retransmit); the
* odds are that the user has given up attempting to connect by then.
*/
#define SYNCACHE_MAXREXMTS 3
/* Arbitrary values */
#define TCP_SYNCACHE_HASHSIZE 512
#define TCP_SYNCACHE_BUCKETLIMIT 30
static struct tcp_syncache tcp_syncache;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
- &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
+ bucketlimit, CTLFLAG_RDTUN,
+ tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
- &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
+ cachelimit, CTLFLAG_RDTUN,
+ tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
- &tcp_syncache.cache_count, 0, "Current number of entries in syncache");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
+ count, CTLFLAG_RD,
+ tcp_syncache.cache_count, 0, "Current number of entries in syncache");
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
- &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
+ hashsize, CTLFLAG_RDTUN,
+ tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
- &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
+ rexmtlimit, CTLFLAG_RW,
+ tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
int tcp_sc_rst_sock_fail = 1;
-SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_RW,
- &tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure");
+SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_syncache, OID_AUTO,
+ rst_on_sock_fail, CTLFLAG_RW,
+ tcp_sc_rst_sock_fail, 0, "Send reset on socket allocation failure");
static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
#define SYNCACHE_HASH(inc, mask) \
((V_tcp_syncache.hash_secret ^ \
(inc)->inc_faddr.s_addr ^ \
((inc)->inc_faddr.s_addr >> 16) ^ \
(inc)->inc_fport ^ (inc)->inc_lport) & mask)
#define SYNCACHE_HASH6(inc, mask) \
((V_tcp_syncache.hash_secret ^ \
(inc)->inc6_faddr.s6_addr32[0] ^ \
(inc)->inc6_faddr.s6_addr32[3] ^ \
(inc)->inc_fport ^ (inc)->inc_lport) & mask)
#define ENDPTS_EQ(a, b) ( \
(a)->ie_fport == (b)->ie_fport && \
(a)->ie_lport == (b)->ie_lport && \
(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \
(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \
)
#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx)
#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx)
#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED)
/*
* Requires the syncache entry to be already removed from the bucket list.
*/
static void
syncache_free(struct syncache *sc)
{
+ INIT_VNET_INET(curvnet);
+
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
if (sc->sc_cred)
crfree(sc->sc_cred);
#ifdef MAC
mac_syncache_destroy(&sc->sc_label);
#endif
uma_zfree(V_tcp_syncache.zone, sc);
}
void
syncache_init(void)
{
+ INIT_VNET_INET(curvnet);
int i;
V_tcp_syncache.cache_count = 0;
V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
V_tcp_syncache.hash_secret = arc4random();
TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
&V_tcp_syncache.hashsize);
TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
&V_tcp_syncache.bucket_limit);
if (!powerof2(V_tcp_syncache.hashsize) ||
V_tcp_syncache.hashsize == 0) {
printf("WARNING: syncache hash size is not a power of 2.\n");
V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
}
V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1;
/* Set limits. */
V_tcp_syncache.cache_limit =
V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit;
TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
&V_tcp_syncache.cache_limit);
/* Allocate the hash table. */
MALLOC(V_tcp_syncache.hashbase, struct syncache_head *,
V_tcp_syncache.hashsize * sizeof(struct syncache_head),
M_SYNCACHE, M_WAITOK | M_ZERO);
/* Initialize the hash buckets. */
for (i = 0; i < V_tcp_syncache.hashsize; i++) {
TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
NULL, MTX_DEF);
callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
&V_tcp_syncache.hashbase[i].sch_mtx, 0);
V_tcp_syncache.hashbase[i].sch_length = 0;
}
/* Create the syncache entry zone. */
V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit);
}
/*
* Inserts a syncache entry into the specified bucket row.
* Locks and unlocks the syncache_head autonomously.
*/
static void
syncache_insert(struct syncache *sc, struct syncache_head *sch)
{
+ INIT_VNET_INET(sch->sch_vnet);
struct syncache *sc2;
SCH_LOCK(sch);
/*
* Make sure that we don't overflow the per-bucket limit.
* If the bucket is full, toss the oldest element.
*/
if (sch->sch_length >= V_tcp_syncache.bucket_limit) {
KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
("sch->sch_length incorrect"));
sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
syncache_drop(sc2, sch);
V_tcpstat.tcps_sc_bucketoverflow++;
}
/* Put it into the bucket. */
TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
sch->sch_length++;
/* Reinitialize the bucket row's timer. */
if (sch->sch_length == 1)
sch->sch_nextc = ticks + INT_MAX;
syncache_timeout(sc, sch, 1);
SCH_UNLOCK(sch);
V_tcp_syncache.cache_count++;
V_tcpstat.tcps_sc_added++;
}
/*
* Remove and free entry from syncache bucket row.
* Expects locked syncache head.
*/
static void
syncache_drop(struct syncache *sc, struct syncache_head *sch)
{
+ INIT_VNET_INET(sch->sch_vnet);
SCH_LOCK_ASSERT(sch);
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
#ifndef TCP_OFFLOAD_DISABLE
if (sc->sc_tu)
sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb);
#endif
syncache_free(sc);
V_tcp_syncache.cache_count--;
}
/*
* Engage/reengage timer on bucket row.
*/
static void
syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
{
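/*
 * Exponential backoff: the retransmit time is TCPTV_RTOBASE scaled
 * by tcp_backoff[] (1, 2, 4, 8, ...) for each SYN|ACK already sent.
 */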
sc->sc_rxttime = ticks +
TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]);
sc->sc_rxmits++;
if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) {
sch->sch_nextc = sc->sc_rxttime;
if (docallout)
callout_reset(&sch->sch_timer, sch->sch_nextc - ticks,
syncache_timer, (void *)sch);
}
}
/*
* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
* If we have retransmitted an entry the maximum number of times, expire it.
* One separate timer for each bucket row.
*/
static void
syncache_timer(void *xsch)
{
struct syncache_head *sch = (struct syncache_head *)xsch;
+ INIT_VNET_INET(sch->sch_vnet);
struct syncache *sc, *nsc;
int tick = ticks;
char *s;
/* NB: syncache_head has already been locked by the callout. */
SCH_LOCK_ASSERT(sch);
/*
* In the following cycle we may remove some entries and/or
* advance some timeouts, so re-initialize the bucket timer.
*/
sch->sch_nextc = tick + INT_MAX;
TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
/*
* We do not check whether the listen socket still exists,
* and accept the case where it may be gone by the time we
* resend the SYN|ACK. We do not expect this to happen often;
* if it does, a RST will be sent when the remote host answers
* the SYN|ACK with its ACK.
*/
if (TSTMP_GT(sc->sc_rxttime, tick)) {
if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
sch->sch_nextc = sc->sc_rxttime;
continue;
}
if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) {
if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: Retransmits exhausted, "
"giving up and removing syncache entry\n",
s, __func__);
free(s, M_TCPLOG);
}
syncache_drop(sc, sch);
V_tcpstat.tcps_sc_stale++;
continue;
}
if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: Response timeout, "
"retransmitting (%u) SYN|ACK\n",
s, __func__, sc->sc_rxmits);
free(s, M_TCPLOG);
}
(void) syncache_respond(sc);
V_tcpstat.tcps_sc_retransmitted++;
syncache_timeout(sc, sch, 0);
}
if (!TAILQ_EMPTY(&(sch)->sch_bucket))
callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
syncache_timer, (void *)(sch));
}
/*
* Find an entry in the syncache.
* Always returns with a locked syncache_head, plus a matching entry or NULL.
*/
struct syncache *
syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
{
+ INIT_VNET_INET(curvnet);
struct syncache *sc;
struct syncache_head *sch;
#ifdef INET6
if (inc->inc_isipv6) {
sch = &V_tcp_syncache.hashbase[
SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)];
*schp = sch;
SCH_LOCK(sch);
/* Circle through bucket row to find matching entry. */
TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
return (sc);
}
} else
#endif
{
sch = &V_tcp_syncache.hashbase[
SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)];
*schp = sch;
SCH_LOCK(sch);
/* Circle through bucket row to find matching entry. */
TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
#ifdef INET6
if (sc->sc_inc.inc_isipv6)
continue;
#endif
if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
return (sc);
}
}
SCH_LOCK_ASSERT(*schp);
return (NULL); /* always returns with locked sch */
}
/*
* This function is called when we get a RST for a
* non-existent connection, so that we can see if the
* connection is in the syn cache. If it is, zap it.
*/
void
syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th)
{
+ INIT_VNET_INET(curvnet);
struct syncache *sc;
struct syncache_head *sch;
char *s = NULL;
sc = syncache_lookup(inc, &sch); /* returns locked sch */
SCH_LOCK_ASSERT(sch);
/*
* Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags.
* See RFC 793 page 65, section SEGMENT ARRIVES.
*/
if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or "
"FIN flag set, segment ignored\n", s, __func__);
V_tcpstat.tcps_badrst++;
goto done;
}
/*
* No corresponding connection was found in syncache.
* If syncookies are enabled and possibly exclusively
* used, or we are under memory pressure, a valid RST
* may not find a syncache entry. In that case we're
* done and no SYN|ACK retransmissions will happen.
* Otherwise the RST was misdirected or spoofed.
*/
if (sc == NULL) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Spurious RST without matching "
"syncache entry (possibly syncookie only), "
"segment ignored\n", s, __func__);
V_tcpstat.tcps_badrst++;
goto done;
}
/*
* If the RST bit is set, check the sequence number to see
* if this is a valid reset segment.
* RFC 793 page 37:
* In all states except SYN-SENT, all reset (RST) segments
* are validated by checking their SEQ-fields. A reset is
* valid if its sequence number is in the window.
*
* The sequence number in the reset segment is normally an
* echo of our outgoing acknowledgement numbers, but some hosts
* send a reset with the sequence number at the rightmost edge
* of our receive window, and we have to handle this case.
*/
if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
syncache_drop(sc, sch);
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, "
"connection attempt aborted by remote endpoint\n",
s, __func__);
V_tcpstat.tcps_sc_reset++;
} else {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != "
"IRS %u (+WND %u), segment ignored\n",
s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd);
V_tcpstat.tcps_badrst++;
}
done:
if (s != NULL)
free(s, M_TCPLOG);
SCH_UNLOCK(sch);
}
void
syncache_badack(struct in_conninfo *inc)
{
+ INIT_VNET_INET(curvnet);
struct syncache *sc;
struct syncache_head *sch;
sc = syncache_lookup(inc, &sch); /* returns locked sch */
SCH_LOCK_ASSERT(sch);
if (sc != NULL) {
syncache_drop(sc, sch);
V_tcpstat.tcps_sc_badack++;
}
SCH_UNLOCK(sch);
}
void
syncache_unreach(struct in_conninfo *inc, struct tcphdr *th)
{
+ INIT_VNET_INET(curvnet);
struct syncache *sc;
struct syncache_head *sch;
sc = syncache_lookup(inc, &sch); /* returns locked sch */
SCH_LOCK_ASSERT(sch);
if (sc == NULL)
goto done;
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl(th->th_seq) != sc->sc_iss)
goto done;
/*
* If we've retransmitted 3 times and this is our second error,
* we remove the entry. Otherwise, we allow it to continue on.
* This prevents us from incorrectly nuking an entry during a
* spurious network outage.
*
* See tcp_notify().
*/
if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
sc->sc_flags |= SCF_UNREACH;
goto done;
}
syncache_drop(sc, sch);
V_tcpstat.tcps_sc_unreach++;
done:
SCH_UNLOCK(sch);
}
/*
* Build a new TCP socket structure from a syncache entry.
*/
static struct socket *
syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
{
+ INIT_VNET_INET(lso->so_vnet);
struct inpcb *inp = NULL;
struct socket *so;
struct tcpcb *tp;
char *s;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
/*
* Ok, create the full blown connection, and set things up
* as they would have been set up if we had created the
* connection when the SYN arrived. If we can't create
* the connection, abort it.
*/
so = sonewconn(lso, SS_ISCONNECTED);
if (so == NULL) {
/*
* Drop the connection; we will either send a RST or
* have the peer retransmit its SYN again after its
* RTO and try again.
*/
V_tcpstat.tcps_listendrop++;
if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: Socket create failed "
"due to limits or memory shortage\n",
s, __func__);
free(s, M_TCPLOG);
}
goto abort2;
}
#ifdef MAC
SOCK_LOCK(so);
mac_socketpeer_set_from_mbuf(m, so);
SOCK_UNLOCK(so);
#endif
inp = sotoinpcb(so);
inp->inp_inc.inc_fibnum = sc->sc_inc.inc_fibnum;
so->so_fibnum = sc->sc_inc.inc_fibnum;
INP_WLOCK(inp);
/* Insert new socket into PCB hash list. */
inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
inp->in6p_laddr = sc->sc_inc.inc6_laddr;
} else {
inp->inp_vflag &= ~INP_IPV6;
inp->inp_vflag |= INP_IPV4;
#endif
inp->inp_laddr = sc->sc_inc.inc_laddr;
#ifdef INET6
}
#endif
inp->inp_lport = sc->sc_inc.inc_lport;
if (in_pcbinshash(inp) != 0) {
/*
* Undo the assignments above if we failed to
* put the PCB on the hash lists.
*/
#ifdef INET6
if (sc->sc_inc.inc_isipv6)
inp->in6p_laddr = in6addr_any;
else
#endif
inp->inp_laddr.s_addr = INADDR_ANY;
inp->inp_lport = 0;
goto abort;
}
#ifdef IPSEC
/* Copy old policy into new socket's. */
if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
printf("syncache_socket: could not copy policy\n");
#endif
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
struct inpcb *oinp = sotoinpcb(lso);
struct in6_addr laddr6;
struct sockaddr_in6 sin6;
/*
* Inherit socket options from the listening socket.
* Note that in6p_inputopts are not (and should not be)
* copied, since it stores previously received options and is
* used to detect if each new option is different from the
* previous one and hence should be passed to a user.
* If we copied in6p_inputopts, a user would not be able to
* receive options just after calling the accept system call.
*/
inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
if (oinp->in6p_outputopts)
inp->in6p_outputopts =
ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = sc->sc_inc.inc6_faddr;
sin6.sin6_port = sc->sc_inc.inc_fport;
sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
laddr6 = inp->in6p_laddr;
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
inp->in6p_laddr = sc->sc_inc.inc6_laddr;
if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
thread0.td_ucred)) {
inp->in6p_laddr = laddr6;
goto abort;
}
/* Override flowlabel from in6_pcbconnect. */
inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
inp->in6p_flowinfo |= sc->sc_flowlabel;
} else
#endif
{
struct in_addr laddr;
struct sockaddr_in sin;
inp->inp_options = (m) ? ip_srcroute(m) : NULL;
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
sin.sin_family = AF_INET;
sin.sin_len = sizeof(sin);
sin.sin_addr = sc->sc_inc.inc_faddr;
sin.sin_port = sc->sc_inc.inc_fport;
bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
laddr = inp->inp_laddr;
if (inp->inp_laddr.s_addr == INADDR_ANY)
inp->inp_laddr = sc->sc_inc.inc_laddr;
if (in_pcbconnect(inp, (struct sockaddr *)&sin,
thread0.td_ucred)) {
inp->inp_laddr = laddr;
goto abort;
}
}
tp = intotcpcb(inp);
tp->t_state = TCPS_SYN_RECEIVED;
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
tcp_rcvseqinit(tp);
tcp_sendseqinit(tp);
tp->snd_wl1 = sc->sc_irs;
tp->snd_max = tp->iss + 1;
tp->snd_nxt = tp->iss + 1;
tp->rcv_up = sc->sc_irs + 1;
tp->rcv_wnd = sc->sc_wnd;
tp->rcv_adv += tp->rcv_wnd;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
if (sc->sc_flags & SCF_NOOPT)
tp->t_flags |= TF_NOOPT;
else {
if (sc->sc_flags & SCF_WINSCALE) {
tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
tp->snd_scale = sc->sc_requested_s_scale;
tp->request_r_scale = sc->sc_requested_r_scale;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->ts_recent = sc->sc_tsreflect;
tp->ts_recent_age = ticks;
tp->ts_offset = sc->sc_tsoff;
}
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
if (sc->sc_flags & SCF_SACK)
tp->t_flags |= TF_SACK_PERMIT;
}
if (sc->sc_flags & SCF_ECN)
tp->t_flags |= TF_ECN_PERMIT;
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
*/
tcp_mss(tp, sc->sc_peer_mss);
/*
* If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
*/
if (sc->sc_rxmits)
tp->snd_cwnd = tp->t_maxseg;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
INP_WUNLOCK(inp);
V_tcpstat.tcps_accepts++;
return (so);
abort:
INP_WUNLOCK(inp);
abort2:
if (so != NULL)
soabort(so);
return (NULL);
}
/*
* This function gets called when we receive an ACK for a
* socket in the LISTEN state. We look up the connection
* in the syncache, and if it's there, we pull it out of
* the cache and turn it into a full-blown connection in
* the SYN-RECEIVED state.
*/
int
syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
struct socket **lsop, struct mbuf *m)
{
+ INIT_VNET_INET(curvnet);
struct syncache *sc;
struct syncache_head *sch;
struct syncache scs;
char *s;
/*
* Global TCP locks are held because we manipulate the PCB lists
* and create a new socket.
*/
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
("%s: can handle only ACK", __func__));
sc = syncache_lookup(inc, &sch); /* returns locked sch */
SCH_LOCK_ASSERT(sch);
if (sc == NULL) {
/*
* There is no syncache entry, so see if this ACK is
* a returning syncookie. To do this, first:
* A. See if this socket has had a syncache entry dropped in
* the past. We don't want to accept a bogus syncookie
* if we've never received a SYN.
* B. Check that the syncookie is valid. If it is, then
* cobble up a fake syncache entry, and return.
*/
if (!tcp_syncookies) {
SCH_UNLOCK(sch);
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Spurious ACK, "
"segment rejected (syncookies disabled)\n",
s, __func__);
goto failed;
}
bzero(&scs, sizeof(scs));
sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop);
SCH_UNLOCK(sch);
if (sc == NULL) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Segment failed "
"SYNCOOKIE authentication, segment rejected "
"(probably spoofed)\n", s, __func__);
goto failed;
}
} else {
/* Pull out the entry to unlock the bucket row. */
TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
sch->sch_length--;
V_tcp_syncache.cache_count--;
SCH_UNLOCK(sch);
}
/*
* Segment validation:
* ACK must match our initial sequence number + 1 (the SYN|ACK).
*/
if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
"rejected\n", s, __func__, th->th_ack, sc->sc_iss);
goto failed;
}
/*
* The SEQ must fall in the window starting at the initial
* receive sequence number (IRS) + 1, i.e., just past the SYN.
*/
if ((SEQ_LEQ(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) &&
!TOEPCB_ISSET(sc)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
"rejected\n", s, __func__, th->th_seq, sc->sc_irs);
goto failed;
}
if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
"segment rejected\n", s, __func__);
goto failed;
}
/*
* If timestamps were negotiated the reflected timestamp
* must be equal to what we actually sent in the SYN|ACK.
*/
if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts &&
!TOEPCB_ISSET(sc)) {
if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, "
"segment rejected\n",
s, __func__, to->to_tsecr, sc->sc_ts);
goto failed;
}
*lsop = syncache_socket(sc, *lsop, m);
if (*lsop == NULL)
V_tcpstat.tcps_sc_aborted++;
else
V_tcpstat.tcps_sc_completed++;
/* how do we find the inp for the new socket? */
if (sc != &scs)
syncache_free(sc);
return (1);
failed:
if (sc != NULL && sc != &scs)
syncache_free(sc);
if (s != NULL)
free(s, M_TCPLOG);
*lsop = NULL;
return (0);
}
int
tcp_offload_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
struct tcphdr *th, struct socket **lsop, struct mbuf *m)
{
int rc;
INP_INFO_WLOCK(&V_tcbinfo);
rc = syncache_expand(inc, to, th, lsop, m);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (rc);
}
/*
* Given a LISTEN socket and an inbound SYN request, add
* this to the syn cache, and send back a segment:
* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
* to the source.
*
* IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
* Doing so would require that we hold onto the data and deliver it
* to the application. However, if we are the target of a SYN-flood
* DoS attack, an attacker could send data which would eventually
* consume all available buffer space if it were ACKed. By not ACKing
* the data, we avoid this DoS scenario.
*/
static void
_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
struct inpcb *inp, struct socket **lsop, struct mbuf *m,
struct toe_usrreqs *tu, void *toepcb)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct tcpcb *tp;
struct socket *so;
struct syncache *sc = NULL;
struct syncache_head *sch;
struct mbuf *ipopts = NULL;
u_int32_t flowtmp;
int win, sb_hiwat, ip_ttl, ip_tos, noopt;
char *s;
#ifdef INET6
int autoflowlabel = 0;
#endif
#ifdef MAC
struct label *maclabel;
#endif
struct syncache scs;
struct ucred *cred;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp); /* listen socket */
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
("%s: unexpected tcp flags", __func__));
/*
* Combine all so/tp operations very early to drop the INP lock as
* soon as possible.
*/
so = *lsop;
tp = sototcpcb(so);
cred = crhold(so->so_cred);
#ifdef INET6
if (inc->inc_isipv6 &&
(inp->in6p_flags & IN6P_AUTOFLOWLABEL))
autoflowlabel = 1;
#endif
ip_ttl = inp->inp_ip_ttl;
ip_tos = inp->inp_ip_tos;
win = sbspace(&so->so_rcv);
sb_hiwat = so->so_rcv.sb_hiwat;
noopt = (tp->t_flags & TF_NOOPT);
/* By the time we drop the lock these should no longer be used. */
so = NULL;
tp = NULL;
#ifdef MAC
if (mac_syncache_init(&maclabel) != 0) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
goto done;
} else
mac_syncache_create(maclabel, inp);
#endif
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
/*
* Remember the IP options, if any.
*/
#ifdef INET6
if (!inc->inc_isipv6)
#endif
ipopts = (m) ? ip_srcroute(m) : NULL;
/*
* See if we already have an entry for this connection.
* If we do, resend the SYN,ACK, and reset the retransmit timer.
*
* XXX: should the syncache be re-initialized with the contents
* of the new SYN here (which may have different options)?
*
* XXX: We do not check the sequence number to see if this is a
* real retransmit or a new connection attempt. The question is
* how to handle such a case; either ignore it as spoofed, or
* drop the current entry and create a new one?
*/
sc = syncache_lookup(inc, &sch); /* returns locked entry */
SCH_LOCK_ASSERT(sch);
if (sc != NULL) {
#ifndef TCP_OFFLOAD_DISABLE
if (sc->sc_tu)
sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT,
sc->sc_toepcb);
#endif
V_tcpstat.tcps_sc_dupsyn++;
if (ipopts) {
/*
* If we were remembering a previous source route,
* forget it and use the new one we've been given.
*/
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
sc->sc_ipopts = ipopts;
}
/*
* Update timestamp if present.
*/
if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS))
sc->sc_tsreflect = to->to_tsval;
else
sc->sc_flags &= ~SCF_TIMESTAMP;
#ifdef MAC
/*
* Since we have already unconditionally allocated label
* storage, free it up. The syncache entry will already
* have an initialized label we can use.
*/
mac_syncache_destroy(&maclabel);
KASSERT(sc->sc_label != NULL,
("%s: label not initialized", __func__));
#endif
/* Retransmit SYN|ACK and reset retransmit count. */
if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
log(LOG_DEBUG, "%s; %s: Received duplicate SYN, "
"resetting timer and retransmitting SYN|ACK\n",
s, __func__);
free(s, M_TCPLOG);
}
if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) {
sc->sc_rxmits = 0;
syncache_timeout(sc, sch, 1);
V_tcpstat.tcps_sndacks++;
V_tcpstat.tcps_sndtotal++;
}
SCH_UNLOCK(sch);
goto done;
}
sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
if (sc == NULL) {
/*
* The zone allocator couldn't provide more entries.
* Treat this as if the cache was full; drop the oldest
* entry and insert the new one.
*/
V_tcpstat.tcps_sc_zonefail++;
if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL)
syncache_drop(sc, sch);
sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
if (sc == NULL) {
if (tcp_syncookies) {
bzero(&scs, sizeof(scs));
sc = &scs;
} else {
SCH_UNLOCK(sch);
if (ipopts)
(void) m_free(ipopts);
goto done;
}
}
}
/*
* Fill in the syncache values.
*/
#ifdef MAC
sc->sc_label = maclabel;
#endif
sc->sc_cred = cred;
cred = NULL;
sc->sc_ipopts = ipopts;
sc->sc_inc.inc_fibnum = inp->inp_inc.inc_fibnum;
bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
#ifdef INET6
if (!inc->inc_isipv6)
#endif
{
sc->sc_ip_tos = ip_tos;
sc->sc_ip_ttl = ip_ttl;
}
#ifndef TCP_OFFLOAD_DISABLE
sc->sc_tu = tu;
sc->sc_toepcb = toepcb;
#endif
sc->sc_irs = th->th_seq;
sc->sc_iss = arc4random();
sc->sc_flags = 0;
sc->sc_flowlabel = 0;
/*
* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
* win was derived from socket earlier in the function.
*/
win = imax(win, 0);
win = imin(win, TCP_MAXWIN);
sc->sc_wnd = win;
if (V_tcp_do_rfc1323) {
/*
* A timestamp received in a SYN makes
* it ok to send timestamp requests and replies.
*/
if (to->to_flags & TOF_TS) {
sc->sc_tsreflect = to->to_tsval;
sc->sc_ts = ticks;
sc->sc_flags |= SCF_TIMESTAMP;
}
if (to->to_flags & TOF_SCALE) {
int wscale = 0;
/*
* Pick the smallest possible scaling factor that
* will still allow us to scale up to sb_max, aka
* kern.ipc.maxsockbuf.
*
* We do this because there are broken firewalls that
* will corrupt the window scale option, leading to
* the other endpoint believing that our advertised
* window is unscaled. At scale factors larger than
* 5 the unscaled window will drop below 1500 bytes,
* leading to serious problems when traversing these
* broken firewalls.
*
* With the default maxsockbuf of 256K, a scale factor
* of 3 will be chosen by this algorithm. Those who
* choose a larger maxsockbuf should watch out
* for the compatibility problems mentioned above.
*
* RFC1323: The Window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
*/
while (wscale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << wscale) < sb_max)
wscale++;
sc->sc_requested_r_scale = wscale;
sc->sc_requested_s_scale = to->to_wscale;
sc->sc_flags |= SCF_WINSCALE;
}
}
#ifdef TCP_SIGNATURE
/*
* If listening socket requested TCP digests, and received SYN
* contains the option, flag this in the syncache so that
* syncache_respond() will do the right thing with the SYN+ACK.
* XXX: Currently we always record the option by default and will
* attempt to use it in syncache_respond().
*/
if (to->to_flags & TOF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
#endif
if (to->to_flags & TOF_SACKPERM)
sc->sc_flags |= SCF_SACK;
if (to->to_flags & TOF_MSS)
sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */
if (noopt)
sc->sc_flags |= SCF_NOOPT;
if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
sc->sc_flags |= SCF_ECN;
if (tcp_syncookies) {
syncookie_generate(sch, sc, &flowtmp);
#ifdef INET6
if (autoflowlabel)
sc->sc_flowlabel = flowtmp;
#endif
} else {
#ifdef INET6
if (autoflowlabel)
sc->sc_flowlabel =
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
#endif
}
SCH_UNLOCK(sch);
/*
* Do a standard 3-way handshake.
*/
if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) {
if (tcp_syncookies && tcp_syncookiesonly && sc != &scs)
syncache_free(sc);
else if (sc != &scs)
syncache_insert(sc, sch); /* locks and unlocks sch */
V_tcpstat.tcps_sndacks++;
V_tcpstat.tcps_sndtotal++;
} else {
if (sc != &scs)
syncache_free(sc);
V_tcpstat.tcps_sc_dropped++;
}
done:
if (cred != NULL)
crfree(cred);
#ifdef MAC
if (sc == &scs)
mac_syncache_destroy(&maclabel);
#endif
if (m) {
*lsop = NULL;
m_freem(m);
}
return;
}
static int
syncache_respond(struct syncache *sc)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = NULL;
struct mbuf *m;
struct tcphdr *th;
int optlen, error;
u_int16_t hlen, tlen, mssopt;
struct tcpopt to;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
hlen =
#ifdef INET6
(sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
#endif
sizeof(struct ip);
tlen = hlen + sizeof(struct tcphdr);
/* Determine the MSS we advertise to the other end of the connection. */
mssopt = tcp_mssopt(&sc->sc_inc);
if (sc->sc_peer_mss)
mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss);
/* XXX: Assume that the entire packet will fit in a header mbuf. */
KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN,
("syncache: mbuf too small"));
/* Create the IP+TCP header from scratch. */
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
#ifdef MAC
mac_syncache_create_mbuf(sc->sc_label, m);
#endif
m->m_data += max_linkhdr;
m->m_len = tlen;
m->m_pkthdr.len = tlen;
m->m_pkthdr.rcvif = NULL;
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_src = sc->sc_inc.inc6_laddr;
ip6->ip6_dst = sc->sc_inc.inc6_faddr;
ip6->ip6_plen = htons(tlen - hlen);
/* ip6_hlim is set after checksum */
ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
ip6->ip6_flow |= sc->sc_flowlabel;
th = (struct tcphdr *)(ip6 + 1);
} else
#endif
{
ip = mtod(m, struct ip *);
ip->ip_v = IPVERSION;
ip->ip_hl = sizeof(struct ip) >> 2;
ip->ip_len = tlen;
ip->ip_id = 0;
ip->ip_off = 0;
ip->ip_sum = 0;
ip->ip_p = IPPROTO_TCP;
ip->ip_src = sc->sc_inc.inc_laddr;
ip->ip_dst = sc->sc_inc.inc_faddr;
ip->ip_ttl = sc->sc_ip_ttl;
ip->ip_tos = sc->sc_ip_tos;
/*
* See if we should do MTU discovery. Route lookups are
* expensive, so we will only unset the DF bit if:
*
* 1) path_mtu_discovery is disabled
* 2) the SCF_UNREACH flag has been set
*/
if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
ip->ip_off |= IP_DF;
th = (struct tcphdr *)(ip + 1);
}
th->th_sport = sc->sc_inc.inc_lport;
th->th_dport = sc->sc_inc.inc_fport;
th->th_seq = htonl(sc->sc_iss);
th->th_ack = htonl(sc->sc_irs + 1);
th->th_off = sizeof(struct tcphdr) >> 2;
th->th_x2 = 0;
th->th_flags = TH_SYN|TH_ACK;
th->th_win = htons(sc->sc_wnd);
th->th_urp = 0;
if (sc->sc_flags & SCF_ECN) {
th->th_flags |= TH_ECE;
V_tcpstat.tcps_ecn_shs++;
}
/* Tack on the TCP options. */
if ((sc->sc_flags & SCF_NOOPT) == 0) {
to.to_mss = mssopt;
to.to_flags = TOF_MSS;
if (sc->sc_flags & SCF_WINSCALE) {
to.to_wscale = sc->sc_requested_r_scale;
to.to_flags |= TOF_SCALE;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
/* Either a virgin timestamp or a syncookie-enhanced one. */
to.to_tsval = sc->sc_ts;
to.to_tsecr = sc->sc_tsreflect;
to.to_flags |= TOF_TS;
}
if (sc->sc_flags & SCF_SACK)
to.to_flags |= TOF_SACKPERM;
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
/* Adjust headers by option size. */
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
m->m_len += optlen;
m->m_pkthdr.len += optlen;
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tcp_signature_compute(m, 0, 0, optlen,
to.to_signature, IPSEC_DIR_OUTBOUND);
#endif
#ifdef INET6
if (sc->sc_inc.inc_isipv6)
ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
else
#endif
ip->ip_len += optlen;
} else
optlen = 0;
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen,
tlen + optlen - hlen);
ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
} else
#endif
{
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(tlen + optlen - hlen + IPPROTO_TCP));
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
}
return (error);
}
void
syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
struct inpcb *inp, struct socket **lsop, struct mbuf *m)
{
_syncache_add(inc, to, th, inp, lsop, m, NULL, NULL);
}
void
tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to,
struct tcphdr *th, struct inpcb *inp, struct socket **lsop,
struct toe_usrreqs *tu, void *toepcb)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(inp);
_syncache_add(inc, to, th, inp, lsop, NULL, tu, toepcb);
}
/*
* The purpose of SYN cookies is to avoid keeping track of all SYNs we
* receive and to be able to handle SYN floods from bogus source addresses
* (where we will never receive any reply). SYN floods try to exhaust all
* our memory and available slots in the SYN cache table to cause a denial
* of service to legitimate users of the local host.
*
* The idea of SYN cookies is to encode and include all necessary information
* about the connection setup state within the SYN-ACK we send back and thus
* to get along without keeping any local state until the ACK to the SYN-ACK
* arrives (if ever). Everything we need to know should be available from
* the information we encoded in the SYN-ACK.
*
* More information about the theory behind SYN cookies and its first
* discussion and specification can be found at:
* http://cr.yp.to/syncookies.html (overview)
* http://cr.yp.to/syncookies/archive (gory details)
*
* This implementation extends the original idea and FreeBSD's first
* implementation by using not only the initial sequence number field to store
* information but also the timestamp field if present. This way we can
* keep track of the entire state we need to know to recreate the session in
* its original form. Almost all TCP speakers implement RFC1323 timestamps
* these days. For those that do not, we still have to live with the known
* shortcomings of ISN-only SYN cookies.
*
* Cookie layers:
*
* Initial sequence number we send:
* 31|................................|0
* DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP
* D = MD5 Digest (first dword)
* M = MSS index
* R = Rotation of secret
* P = Odd or Even secret
*
* The MD5 digest is computed over the following parameters:
* a) randomly rotated secret
* b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6)
* c) the received initial sequence number from remote host
* d) the rotation offset and odd/even bit
*
* Timestamp we send:
* 31|................................|0
* DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5
* D = MD5 Digest (third dword) (only as filler)
* S = Requested send window scale
* R = Requested receive window scale
* A = SACK allowed
* 5 = TCP-MD5 enabled (not implemented yet)
* XORed with the MD5 digest (fourth dword)
*
* The timestamp isn't cryptographically secure and doesn't need to be.
* The double use of the MD5 digest dwords ties it to a specific remote/
* local host/port, remote initial sequence number and our local time
* limited secret. A received timestamp is reverted (XORed) and then
* the contained MD5 dword is compared to the computed one to ensure the
* timestamp belongs to the SYN-ACK we sent. The other parameters may
* have been tampered with but this isn't different from supplying bogus
* values in the SYN in the first place.
*
* Some problems with SYN cookies remain, however:
* Consider the problem of a recreated (and retransmitted) cookie. If the
* original SYN was accepted, the connection is established. The second
* SYN is inflight, and if it arrives with an ISN that falls within the
* receive window, the connection is killed.
*
* Notes:
* A heuristic to determine when to accept SYN cookies is not necessary.
* An ACK flood would cause the syncookie verification to be attempted,
* but a SYN flood causes syncookies to be generated. Both are of equal
* cost, so there's no point in trying to optimize the ACK flood case.
* Also, if you don't process certain ACKs for some reason, then all someone
* would have to do is launch a SYN and ACK flood at the same time, which
* would stop cookie verification and defeat the entire purpose of syncookies.
*/
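/*
 * Worked example (illustrative numbers, not from a trace): with the odd
 * secret in use (P=1), rotation offset 5 (RRR=101) and MSS index 3
 * (MMM=011), the low seven bits of the ISS are 0111011b (0x3b), and the
 * remaining 25 bits carry the digest:
 *
 *	data = 1 | (5 << 1) | (3 << 4) | (md5_buffer[0] << 7);
 *
 * syncookie_lookup() below reverses this by masking the same fields
 * back out of the ACK'ed ISS.
 */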
static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 };
static void
syncookie_generate(struct syncache_head *sch, struct syncache *sc,
u_int32_t *flowlabel)
{
+ INIT_VNET_INET(curvnet);
MD5_CTX ctx;
u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
u_int32_t data;
u_int32_t *secbits;
u_int off, pmss, mss;
int i;
SCH_LOCK_ASSERT(sch);
/* Which of the two secrets to use. */
secbits = sch->sch_oddeven ?
sch->sch_secbits_odd : sch->sch_secbits_even;
/* Reseed secret if too old. */
if (sch->sch_reseed < time_uptime) {
sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */
secbits = sch->sch_oddeven ?
sch->sch_secbits_odd : sch->sch_secbits_even;
for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++)
secbits[i] = arc4random();
sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME;
}
/* Secret rotation offset. */
off = sc->sc_iss & 0x7; /* iss was randomized before */
/* Maximum segment size calculation. */
pmss =
max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss);
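/* Pick the largest tcp_sc_msstab[] index whose value does not exceed pmss. */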
for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--)
if (tcp_sc_msstab[mss] <= pmss)
break;
/* Fold parameters and MD5 digest into the ISN we will send. */
data = sch->sch_oddeven;/* odd or even secret, 1 bit */
data |= off << 1; /* secret offset, derived from iss, 3 bits */
data |= mss << 4; /* mss, 3 bits */
MD5Init(&ctx);
MD5Update(&ctx, ((u_int8_t *)secbits) + off,
SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
MD5Update(&ctx, secbits, off);
MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc));
MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs));
MD5Update(&ctx, &data, sizeof(data));
MD5Final((u_int8_t *)&md5_buffer, &ctx);
data |= (md5_buffer[0] << 7);
sc->sc_iss = data;
#ifdef INET6
*flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
#endif
/* Additional parameters are stored in the timestamp if present. */
if (sc->sc_flags & SCF_TIMESTAMP) {
data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */
data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */
data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */
data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */
data |= md5_buffer[2] << 10; /* more digest bits */
data ^= md5_buffer[3];
sc->sc_ts = data;
sc->sc_tsoff = data - ticks; /* after XOR */
}
V_tcpstat.tcps_sc_sendcookie++;
return;
}
static struct syncache *
syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
struct syncache *sc, struct tcpopt *to, struct tcphdr *th,
struct socket *so)
{
+ INIT_VNET_INET(curvnet);
MD5_CTX ctx;
u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
u_int32_t data = 0;
u_int32_t *secbits;
tcp_seq ack, seq;
int off, mss, wnd, flags;
SCH_LOCK_ASSERT(sch);
/*
* Pull information out of SYN-ACK/ACK and
* revert sequence number advances.
*/
ack = th->th_ack - 1;
seq = th->th_seq - 1;
off = (ack >> 1) & 0x7;
mss = (ack >> 4) & 0x7;
flags = ack & 0x7f;
/* Which of the two secrets to use. */
secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even;
/*
* The secret wasn't updated for the lifetime of a syncookie,
* so this SYN-ACK/ACK is either too old (replay) or totally bogus.
*/
if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) {
return (NULL);
}
/* Recompute the digest so we can compare it. */
MD5Init(&ctx);
MD5Update(&ctx, ((u_int8_t *)secbits) + off,
SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
MD5Update(&ctx, secbits, off);
MD5Update(&ctx, inc, sizeof(*inc));
MD5Update(&ctx, &seq, sizeof(seq));
MD5Update(&ctx, &flags, sizeof(flags));
MD5Final((u_int8_t *)&md5_buffer, &ctx);
/* Does the digest part of our ACK'ed ISS match? */
if ((ack & (~0x7f)) != (md5_buffer[0] << 7))
return (NULL);
/* Does the digest part of our reflected timestamp match? */
if (to->to_flags & TOF_TS) {
data = md5_buffer[3] ^ to->to_tsecr;
if ((data & (~0x3ff)) != (md5_buffer[2] << 10))
return (NULL);
}
/* Fill in the syncache values. */
bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
sc->sc_ipopts = NULL;
sc->sc_irs = seq;
sc->sc_iss = ack;
#ifdef INET6
if (inc->inc_isipv6) {
if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL)
sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
} else
#endif
{
sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
}
/* Additional parameters that were encoded in the timestamp. */
if (data) {
sc->sc_flags |= SCF_TIMESTAMP;
sc->sc_tsreflect = to->to_tsval;
sc->sc_ts = to->to_tsecr;
sc->sc_tsoff = to->to_tsecr - ticks;
sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0;
sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0;
sc->sc_requested_s_scale = min((data >> 2) & 0xf,
TCP_MAX_WINSHIFT);
sc->sc_requested_r_scale = min((data >> 6) & 0xf,
TCP_MAX_WINSHIFT);
if (sc->sc_requested_s_scale || sc->sc_requested_r_scale)
sc->sc_flags |= SCF_WINSCALE;
} else
sc->sc_flags |= SCF_NOOPT;
wnd = sbspace(&so->so_rcv);
wnd = imax(wnd, 0);
wnd = imin(wnd, TCP_MAXWIN);
sc->sc_wnd = wnd;
sc->sc_rxmits = 0;
sc->sc_peer_mss = tcp_sc_msstab[mss];
V_tcpstat.tcps_sc_recvcookie++;
return (sc);
}
/*
* Returns the current number of syncache entries. This number
* will probably change before you get around to calling
* syncache_pcblist.
*/
int
syncache_pcbcount(void)
{
+ INIT_VNET_INET(curvnet);
struct syncache_head *sch;
int count, i;
for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
/* No need to lock for a read. */
sch = &V_tcp_syncache.hashbase[i];
count += sch->sch_length;
}
return (count);
}
/*
* Exports the syncache entries to userland so that netstat can display
* them alongside the other sockets. This function is intended to be
* called only from tcp_pcblist.
*
* Due to concurrency on an active system, the number of pcbs exported
* may have no relation to max_pcbs. max_pcbs merely indicates the
* amount of space the caller allocated for this function to use.
*/
int
syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported)
{
+ INIT_VNET_INET(curvnet);
struct xtcpcb xt;
struct syncache *sc;
struct syncache_head *sch;
int count, error, i;
for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) {
sch = &V_tcp_syncache.hashbase[i];
SCH_LOCK(sch);
TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
if (count >= max_pcbs) {
SCH_UNLOCK(sch);
goto exit;
}
if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0)
continue;
bzero(&xt, sizeof(xt));
xt.xt_len = sizeof(xt);
if (sc->sc_inc.inc_isipv6)
xt.xt_inp.inp_vflag = INP_IPV6;
else
xt.xt_inp.inp_vflag = INP_IPV4;
bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo));
xt.xt_tp.t_inpcb = &xt.xt_inp;
xt.xt_tp.t_state = TCPS_SYN_RECEIVED;
xt.xt_socket.xso_protocol = IPPROTO_TCP;
xt.xt_socket.xso_len = sizeof (struct xsocket);
xt.xt_socket.so_type = SOCK_STREAM;
xt.xt_socket.so_state = SS_ISCONNECTING;
error = SYSCTL_OUT(req, &xt, sizeof xt);
if (error) {
SCH_UNLOCK(sch);
goto exit;
}
count++;
}
SCH_UNLOCK(sch);
}
exit:
*pcbs_exported = count;
return (error);
}
Index: head/sys/netinet/tcp_timer.c
===================================================================
--- head/sys/netinet/tcp_timer.c (revision 183549)
+++ head/sys/netinet/tcp_timer.c (revision 183550)
@@ -1,631 +1,665 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
int tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
int tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
int tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
&tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
"Time before a delayed ACK is sent");
int tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
&tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
int tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
&tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
"Minimum Retransmission Timeout");
int tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
&tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
"Retransmission Timer Slop");
static int always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
&always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");
int tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
&tcp_fast_finwait2_recycle, 0,
"Recycle closed FIN_WAIT_2 connections faster");
int tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
&tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
static int tcp_keepcnt = TCPTV_KEEPCNT; /* max idle probes */
int tcp_maxpersistidle; /* max idle time in persist */
int tcp_maxidle;
/*
* Tcp protocol timeout routine called every 500 ms.
* Updates timestamps used for TCP and causes finite state machine
* actions if timers expire.
*/
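/*
 * Example with the stock values (tcp_keepcnt = TCPTV_KEEPCNT = 8,
 * tcp_keepintvl = 75 * hz): tcp_maxidle = 600 * hz, i.e. ten minutes
 * of unanswered keepalive probes before a connection is dropped.
 */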
void
tcp_slowtimo(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
- tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
- INP_INFO_WLOCK(&V_tcbinfo);
- (void) tcp_tw_2msl_scan(0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
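+ /*
+ * Walk all virtual network stacks: CURVNET_SET() makes each vnet
+ * current in turn, so the V_ macros below resolve to that vnet's
+ * instance of the TCP state.
+ */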
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET(vnet_iter);
+ tcp_maxidle = tcp_keepcnt * tcp_keepintvl;
+ INP_INFO_WLOCK(&V_tcbinfo);
+ (void) tcp_tw_2msl_scan(0);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
}
int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
{ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */
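/*
 * The precomputed total checks out: 1 + 2 + 4 + 8 + 16 + 32 + 64 +
 * 128 + 256 = 511, plus four 512 entries = 2048, for 2559 in all.
 */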
static int tcp_timer_race;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
0, "Count of t_inpcb races on tcp_discardcb");
/*
* TCP timer processing.
*/
void
tcp_timer_delack(void *xtp)
{
struct tcpcb *tp = xtp;
struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+ INIT_VNET_INET(tp->t_vnet);
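+ /*
+ * Timer callouts run without a socket or thread context, so the
+ * vnet is recovered from the tcpcb itself (tp->t_vnet) before any
+ * V_ access.
+ */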
INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
* tear-down mean we need it as a work-around for races between
* timers and tcp_discardcb().
*
* KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
*/
if (inp == NULL) {
tcp_timer_race++;
INP_INFO_RUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
INP_WLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_delack)
|| !callout_active(&tp->t_timers->tt_delack)) {
INP_WUNLOCK(inp);
+ CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_delack);
tp->t_flags |= TF_ACKNOW;
V_tcpstat.tcps_delack++;
(void) tcp_output(tp);
INP_WUNLOCK(inp);
+ CURVNET_RESTORE();
}
void
tcp_timer_2msl(void *xtp)
{
struct tcpcb *tp = xtp;
struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+ INIT_VNET_INET(tp->t_vnet);
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
/*
* XXXRW: Does this actually happen?
*/
INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
* tear-down mean we need it as a work-around for races between
* timers and tcp_discardcb().
*
* KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
*/
if (inp == NULL) {
tcp_timer_race++;
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
INP_WLOCK(inp);
tcp_free_sackholes(tp);
if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_2msl) ||
!callout_active(&tp->t_timers->tt_2msl)) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_2msl);
/*
* 2 MSL timeout in shutdown went off. If we're closed but
* still waiting for peer to close and connection has been idle
* too long, or if 2MSL time is up from TIME_WAIT, delete connection
* control block. Otherwise, check again in a bit.
*
* If fast recycling of FIN_WAIT_2 connections is enabled, we are in
* FIN_WAIT_2, and the receiver has closed, there's no point in hanging
* on to the FIN_WAIT_2 socket; just close it, ignoring the fact that
* there were recent incoming segments.
*/
if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
tp->t_inpcb && tp->t_inpcb->inp_socket &&
(tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
V_tcpstat.tcps_finwait2_drops++;
tp = tcp_close(tp);
} else {
if (tp->t_state != TCPS_TIME_WAIT &&
(ticks - tp->t_rcvtime) <= tcp_maxidle)
callout_reset(&tp->t_timers->tt_2msl, tcp_keepintvl,
tcp_timer_2msl, tp);
else
tp = tcp_close(tp);
}
#ifdef TCPDEBUG
if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
}
void
tcp_timer_keep(void *xtp)
{
struct tcpcb *tp = xtp;
struct tcptemp *t_template;
struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+ INIT_VNET_INET(tp->t_vnet);
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
* tear-down mean we need it as a work-around for races between
* timers and tcp_discardcb().
*
* KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
*/
if (inp == NULL) {
tcp_timer_race++;
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
INP_WLOCK(inp);
if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_keep)
|| !callout_active(&tp->t_timers->tt_keep)) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_keep);
/*
* Keep-alive timer went off; send something
* or drop connection if idle for too long.
*/
V_tcpstat.tcps_keeptimeo++;
if (tp->t_state < TCPS_ESTABLISHED)
goto dropit;
if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
tp->t_state <= TCPS_CLOSING) {
if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle)
goto dropit;
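/*
 * With the stock values (tcp_keepidle = 7200 * hz, tcp_maxidle =
 * 600 * hz) this drops the connection after two hours of idle time
 * plus ten minutes of unanswered probes.
 */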
/*
* Send a packet designed to force a response
* if the peer is up and reachable:
* either an ACK if the connection is still alive,
* or an RST if the peer has closed the connection
* due to timeout or reboot.
* Using sequence number tp->snd_una-1
* causes the transmitted zero-length segment
* to lie outside the receive window;
* by the protocol spec, this requires the
* correspondent TCP to respond.
*/
V_tcpstat.tcps_keepprobe++;
t_template = tcpip_maketemplate(inp);
if (t_template) {
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
tp->rcv_nxt, tp->snd_una - 1, 0);
free(t_template, M_TEMP);
}
callout_reset(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp);
} else
callout_reset(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
#ifdef TCPDEBUG
if (inp->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
dropit:
V_tcpstat.tcps_keepdrops++;
tp = tcp_drop(tp, ETIMEDOUT);
#ifdef TCPDEBUG
if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
}
void
tcp_timer_persist(void *xtp)
{
struct tcpcb *tp = xtp;
struct inpcb *inp;
+ CURVNET_SET(tp->t_vnet);
+ INIT_VNET_INET(tp->t_vnet);
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
* tear-down mean we need it as a work-around for races between
* timers and tcp_discardcb().
*
* KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
*/
if (inp == NULL) {
tcp_timer_race++;
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
INP_WLOCK(inp);
if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist)
|| !callout_active(&tp->t_timers->tt_persist)) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_persist);
/*
* Persistence timer into zero window.
* Force a byte to be output, if possible.
*/
V_tcpstat.tcps_persisttimeo++;
/*
* Hack: if the peer is dead/unreachable, we do not
* time out if the window is closed. After a full
* backoff, drop the connection if the idle time
* (no responses to probes) reaches the maximum
* backoff that we would use if retransmitting.
*/
if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
((ticks - tp->t_rcvtime) >= tcp_maxpersistidle ||
(ticks - tp->t_rcvtime) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
V_tcpstat.tcps_persistdrop++;
tp = tcp_drop(tp, ETIMEDOUT);
goto out;
}
tcp_setpersist(tp);
tp->t_flags |= TF_FORCEDATA;
(void) tcp_output(tp);
tp->t_flags &= ~TF_FORCEDATA;
out:
#ifdef TCPDEBUG
if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
}
void
tcp_timer_rexmt(void *xtp)
{
struct tcpcb *tp = xtp;
+ CURVNET_SET(tp->t_vnet);
+ INIT_VNET_INET(tp->t_vnet);
int rexmt;
int headlocked;
struct inpcb *inp;
#ifdef TCPDEBUG
int ostate;
ostate = tp->t_state;
#endif
INP_INFO_WLOCK(&V_tcbinfo);
headlocked = 1;
inp = tp->t_inpcb;
/*
* XXXRW: While this assert is in fact correct, bugs in the tcpcb
* tear-down mean we need it as a work-around for races between
* timers and tcp_discardcb().
*
* KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
*/
if (inp == NULL) {
tcp_timer_race++;
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
INP_WLOCK(inp);
if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_rexmt)
|| !callout_active(&tp->t_timers->tt_rexmt)) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
return;
}
callout_deactivate(&tp->t_timers->tt_rexmt);
tcp_free_sackholes(tp);
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
* to a longer retransmit interval and retransmit one segment.
*/
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
V_tcpstat.tcps_timeoutdrop++;
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
goto out;
}
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
if (tp->t_rxtshift == 1) {
/*
* first retransmit; record ssthresh and cwnd so they can
* be recovered if this turns out to be a "bad" retransmit.
* A retransmit is considered "bad" if an ACK for this
* segment is received within RTT/2 interval; the assumption
* here is that the ACK was already in flight. See
* "On Estimating End-to-End Network Path Properties" by
* Allman and Paxson for more details.
*/
tp->snd_cwnd_prev = tp->snd_cwnd;
tp->snd_ssthresh_prev = tp->snd_ssthresh;
tp->snd_recover_prev = tp->snd_recover;
if (IN_FASTRECOVERY(tp))
tp->t_flags |= TF_WASFRECOVERY;
else
tp->t_flags &= ~TF_WASFRECOVERY;
tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
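/*
 * t_srtt is kept left-shifted by TCP_RTT_SHIFT, so shifting right by
 * TCP_RTT_SHIFT + 1 yields srtt / 2 in ticks: the window in which a
 * returning ACK marks this retransmit as "bad".
 */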
}
V_tcpstat.tcps_rexmttimeo++;
if (tp->t_state == TCPS_SYN_SENT)
rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
tp->t_rttmin, TCPTV_REXMTMAX);
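/*
 * Example: with TCP_REXMTVAL(tp) = 3 * hz and t_rxtshift = 4, the
 * backed-off timeout is 3 * tcp_backoff[4] = 3 * 16 = 48 seconds,
 * which TCPT_RANGESET() then clamps to [t_rttmin, TCPTV_REXMTMAX].
 */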
/*
* Disable rfc1323 if we haven't got any response to
* our third SYN to work around some broken terminal servers
* (most of which have hopefully been retired) that have bad VJ
* header compression code which trashes TCP segments containing
* unknown-to-them TCP options.
*/
if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP);
/*
* If we backed off this far, our srtt estimate is probably bogus.
* Clobber it so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current
* retransmit times until then.
*/
if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
in6_losing(tp->t_inpcb);
else
#endif
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}
tp->snd_nxt = tp->snd_una;
tp->snd_recover = tp->snd_max;
/*
* Force a segment to be sent.
*/
tp->t_flags |= TF_ACKNOW;
/*
* If timing a segment in this window, stop the timer.
*/
tp->t_rtttime = 0;
/*
* Close the congestion window down to one segment
* (we'll open it by one segment for each ack we get).
* Since we probably have a window's worth of unacked
* data accumulated, this "slow start" keeps us from
* dumping all that data as back-to-back packets (which
* might overwhelm an intermediate gateway).
*
* There are two phases to the opening: Initially we
* open by one mss on each ack. This makes the window
* size increase exponentially with time. If the
* window is larger than the path can handle, this
* exponential growth results in dropped packet(s)
* almost immediately. To get more time between
* drops but still "push" the network to take advantage
* of improving conditions, we switch from exponential
* to linear window opening at some threshold size.
* For a threshold, we use half the current window
* size, truncated to a multiple of the mss.
*
* (the minimum cwnd that will give us exponential
* growth is 2 mss. We don't allow the threshold
* to go below this.)
*/
{
u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
if (win < 2)
win = 2;
tp->snd_cwnd = tp->t_maxseg;
tp->snd_ssthresh = win * tp->t_maxseg;
tp->t_dupacks = 0;
}
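/*
 * Worked example: snd_wnd = snd_cwnd = 32768 and t_maxseg = 1460 give
 * win = 32768 / 2 / 1460 = 11, so snd_ssthresh becomes 16060 bytes
 * while snd_cwnd restarts at a single 1460-byte segment.
 */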
EXIT_FASTRECOVERY(tp);
(void) tcp_output(tp);
out:
#ifdef TCPDEBUG
if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
PRU_SLOWTIMO);
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
if (headlocked)
INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
}
void
tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
{
struct callout *t_callout;
void *f_callout;
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
f_callout = tcp_timer_delack;
break;
case TT_REXMT:
t_callout = &tp->t_timers->tt_rexmt;
f_callout = tcp_timer_rexmt;
break;
case TT_PERSIST:
t_callout = &tp->t_timers->tt_persist;
f_callout = tcp_timer_persist;
break;
case TT_KEEP:
t_callout = &tp->t_timers->tt_keep;
f_callout = tcp_timer_keep;
break;
case TT_2MSL:
t_callout = &tp->t_timers->tt_2msl;
f_callout = tcp_timer_2msl;
break;
default:
panic("bad timer_type");
}
if (delta == 0) {
callout_stop(t_callout);
} else {
callout_reset(t_callout, delta, f_callout, tp);
}
}
int
tcp_timer_active(struct tcpcb *tp, int timer_type)
{
struct callout *t_callout;
switch (timer_type) {
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
break;
case TT_REXMT:
t_callout = &tp->t_timers->tt_rexmt;
break;
case TT_PERSIST:
t_callout = &tp->t_timers->tt_persist;
break;
case TT_KEEP:
t_callout = &tp->t_timers->tt_keep;
break;
case TT_2MSL:
t_callout = &tp->t_timers->tt_2msl;
break;
default:
panic("bad timer_type");
}
return (callout_active(t_callout));
}
Index: head/sys/netinet/tcp_timewait.c
===================================================================
--- head/sys/netinet/tcp_timewait.c (revision 183549)
+++ head/sys/netinet/tcp_timewait.c (revision 183550)
@@ -1,650 +1,664 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/route.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif
#include <netinet/ip_icmp.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
static uma_zone_t tcptw_zone;
static int maxtcptw;
/*
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* tcbinfo lock, which must be held over queue iteration and modification.
*/
static TAILQ_HEAD(, tcptw) twq_2msl;
static void tcp_tw_2msl_reset(struct tcptw *, int);
static void tcp_tw_2msl_stop(struct tcptw *);
static int
tcptw_auto_size(void)
{
+ INIT_VNET_INET(curvnet);
int halfrange;
/*
* Max out at half the ephemeral port range so that TIME_WAIT
* sockets don't tie up too many ephemeral ports.
*/
if (V_ipport_lastauto > V_ipport_firstauto)
halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
else
halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
/* Protect against goofy port ranges smaller than 32. */
return (imin(imax(halfrange, 32), maxsockets / 5));
}
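/*
 * Example: an ephemeral range of 49152..65535 yields halfrange =
 * (65535 - 49152) / 2 = 8191, so the zone is capped at the smaller of
 * 8191 and maxsockets / 5.
 */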
static int
sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
{
int error, new;
if (maxtcptw == 0)
new = tcptw_auto_size();
else
new = maxtcptw;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr)
if (new >= 32) {
maxtcptw = new;
uma_zone_set_max(tcptw_zone, maxtcptw);
}
return (error);
}
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
&maxtcptw, 0, sysctl_maxtcptw, "IU",
"Maximum number of compressed TCP TIME_WAIT entries");
static int nolocaltimewait = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW,
&nolocaltimewait, 0,
"Do not create compressed TCP TIME_WAIT entries for local connections");
void
tcp_tw_zone_change(void)
{
if (maxtcptw == 0)
uma_zone_set_max(tcptw_zone, tcptw_auto_size());
}
void
tcp_tw_init(void)
{
+ INIT_VNET_INET(curvnet);
tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
if (maxtcptw == 0)
uma_zone_set_max(tcptw_zone, tcptw_auto_size());
else
uma_zone_set_max(tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
}
/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is locked.
* inp is locked, and is unlocked before returning.
*/
void
tcp_twstart(struct tcpcb *tp)
{
+#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
+ INIT_VNET_INET(tp->t_vnet);
+#endif
struct tcptw *tw;
struct inpcb *inp = tp->t_inpcb;
int acknow;
struct socket *so;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait && in_localip(inp->inp_faddr)) {
tp = tcp_close(tp);
if (tp != NULL)
INP_WUNLOCK(inp);
return;
}
tw = uma_zalloc(tcptw_zone, M_NOWAIT);
if (tw == NULL) {
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
if (tp != NULL)
INP_WUNLOCK(inp);
return;
}
}
tw->tw_inpcb = inp;
/*
* Recover last window size sent.
*/
tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
/*
* Set t_recent if timestamps are used on the connection.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
tw->t_recent = tp->ts_recent;
tw->ts_offset = tp->ts_offset;
} else {
tw->t_recent = 0;
tw->ts_offset = 0;
}
tw->snd_nxt = tp->snd_nxt;
tw->rcv_nxt = tp->rcv_nxt;
tw->iss = tp->iss;
tw->irs = tp->irs;
tw->t_starttime = tp->t_starttime;
tw->tw_time = 0;
/*
 * XXX: If this code is ever also used for the FIN_WAIT_2 state, we
 * may need a ts_recent from the last segment.
 */
acknow = tp->t_flags & TF_ACKNOW;
/*
* First, discard tcpcb state, which includes stopping its timers and
* freeing it. tcp_discardcb() used to also release the inpcb, but
* that work is now done in the caller.
*
* Note: soisdisconnected() call used to be made in tcp_discardcb(),
* and might not be needed here any longer.
*/
tcp_discardcb(tp);
so = inp->inp_socket;
soisdisconnected(so);
tw->tw_cred = crhold(so->so_cred);
SOCK_LOCK(so);
tw->tw_so_options = so->so_options;
SOCK_UNLOCK(so);
if (acknow)
tcp_twrespond(tw, TH_ACK);
inp->inp_ppcb = tw;
inp->inp_vflag |= INP_TIMEWAIT;
tcp_tw_2msl_reset(tw, 0);
/*
* If the inpcb owns the sole reference to the socket, then we can
* detach and free the socket as it is not needed in time wait.
*/
if (inp->inp_vflag & INP_SOCKREF) {
KASSERT(so->so_state & SS_PROTOREF,
("tcp_twstart: !SS_PROTOREF"));
inp->inp_vflag &= ~INP_SOCKREF;
INP_WUNLOCK(inp);
ACCEPT_LOCK();
SOCK_LOCK(so);
so->so_state &= ~SS_PROTOREF;
sofree(so);
} else
INP_WUNLOCK(inp);
}
#if 0
/*
* The approximate rate of ISN increase of Microsoft TCP stacks;
* the actual rate is slightly higher due to the addition of
* random positive increments.
*
* Most other new OSes use semi-randomized ISN values, so we
* do not need to worry about them.
*/
#define MS_ISN_BYTES_PER_SECOND 250000
/*
* Determine if the ISN we will generate has advanced beyond the last
* sequence number used by the previous connection. If so, indicate
* that it is safe to recycle this tw socket by returning 1.
*/
int
tcp_twrecycleable(struct tcptw *tw)
{
+ INIT_VNET_INET(curvnet);
tcp_seq new_iss = tw->iss;
tcp_seq new_irs = tw->irs;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
return (1);
else
return (0);
}
#endif
/*
* Returns 1 if the TIME_WAIT state was killed and we should start over,
* looking for a pcb in the listen state. Returns 0 otherwise.
*/
int
tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
struct mbuf *m, int tlen)
{
+#if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
+ INIT_VNET_INET(curvnet);
+#endif
struct tcptw *tw;
int thflags;
tcp_seq seq;
#ifdef INET6
int isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#else
const int isipv6 = 0;
#endif
/* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
* XXXRW: Time wait state for inpcb has been recycled, but inpcb is
* still present. This is undesirable, but temporarily necessary
* until we work out how to handle inpcbs whose timewait state has
* been removed.
*/
tw = intotw(inp);
if (tw == NULL)
goto drop;
thflags = th->th_flags;
/*
* NOTE: for FIN_WAIT_2 (to be added later),
* must validate sequence number before accepting RST
*/
/*
* If the segment contains RST:
* Drop the segment - see Stevens, vol. 2, p. 964 and
* RFC 1337.
*/
if (thflags & TH_RST)
goto drop;
#if 0
/* PAWS not needed at the moment */
/*
* RFC 1323 PAWS: If we have a timestamp reply on this segment
* and it's less than ts_recent, drop it.
*/
if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
TSTMP_LT(to.to_tsval, tp->ts_recent)) {
if ((thflags & TH_ACK) == 0)
goto drop;
goto ack;
}
/*
* ts_recent is never updated because we never accept new segments.
*/
#endif
/*
* If a new connection request is received
* while in TIME_WAIT, drop the old connection
* and start over if the sequence numbers
* are above the previous ones.
*/
if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) {
tcp_twclose(tw, 0);
return (1);
}
/*
* Drop the segment if it does not contain an ACK.
*/
if ((thflags & TH_ACK) == 0)
goto drop;
/*
* Reset the 2MSL timer if this is a duplicate FIN.
*/
if (thflags & TH_FIN) {
seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
if (seq + 1 == tw->rcv_nxt)
tcp_tw_2msl_reset(tw, 1);
}
/*
* Acknowledge the segment if it has data or is not a duplicate ACK.
*/
if (thflags != TH_ACK || tlen != 0 ||
th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt)
tcp_twrespond(tw, TH_ACK);
goto drop;
/*
* Generate a RST, dropping incoming segment.
* Make ACK acceptable to originator of segment.
* Don't bother to respond if destination was broadcast/multicast.
*/
if (m->m_flags & (M_BCAST|M_MCAST))
goto drop;
if (isipv6) {
#ifdef INET6
struct ip6_hdr *ip6;
/* IPv6 anycast check is done at tcp6_input() */
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
goto drop;
#endif
} else {
struct ip *ip;
ip = mtod(m, struct ip *);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
goto drop;
}
if (thflags & TH_ACK) {
tcp_respond(NULL,
mtod(m, void *), th, m, 0, th->th_ack, TH_RST);
} else {
seq = th->th_seq + (thflags & TH_SYN ? 1 : 0);
tcp_respond(NULL,
mtod(m, void *), th, m, seq, 0, TH_RST|TH_ACK);
}
INP_WUNLOCK(inp);
return (0);
drop:
INP_WUNLOCK(inp);
m_freem(m);
return (0);
}
void
tcp_twclose(struct tcptw *tw, int reuse)
{
+ INIT_VNET_INET(curvnet);
struct socket *so;
struct inpcb *inp;
/*
* At this point, we are in one of two situations:
*
* (1) We have no socket, just an inpcb<->twtcp pair. We can free
* all state.
*
* (2) We have a socket -- if we own a reference, release it and
* notify the socket layer.
*/
inp = tw->tw_inpcb;
KASSERT((inp->inp_vflag & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
INP_WLOCK_ASSERT(inp);
tw->tw_inpcb = NULL;
tcp_tw_2msl_stop(tw);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
so = inp->inp_socket;
if (so != NULL) {
/*
* If there's a socket, handle two cases: either we own a
* strong reference, which we will now release, or we don't,
* in which case another reference exists (XXXRW: think
* about this more), and we don't need to take action.
*/
if (inp->inp_vflag & INP_SOCKREF) {
inp->inp_vflag &= ~INP_SOCKREF;
INP_WUNLOCK(inp);
ACCEPT_LOCK();
SOCK_LOCK(so);
KASSERT(so->so_state & SS_PROTOREF,
("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
so->so_state &= ~SS_PROTOREF;
sofree(so);
} else {
/*
* If we don't own the only reference, the socket and
* inpcb need to be left around to be handled by
* tcp_usr_detach() later.
*/
INP_WUNLOCK(inp);
}
} else {
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO)
in6_pcbfree(inp);
else
#endif
in_pcbfree(inp);
}
V_tcpstat.tcps_closed++;
crfree(tw->tw_cred);
tw->tw_cred = NULL;
if (reuse)
return;
uma_zfree(tcptw_zone, tw);
}
int
tcp_twrespond(struct tcptw *tw, int flags)
{
+ INIT_VNET_INET(curvnet);
struct inpcb *inp = tw->tw_inpcb;
struct tcphdr *th;
struct mbuf *m;
struct ip *ip = NULL;
u_int hdrlen, optlen;
int error;
struct tcpopt to;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6 = inp->inp_inc.inc_isipv6;
#endif
INP_WLOCK_ASSERT(inp);
m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
m->m_data += max_linkhdr;
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
#ifdef INET6
if (isipv6) {
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
tcpip_fillheaders(inp, ip6, th);
} else
#endif
{
hdrlen = sizeof(struct tcpiphdr);
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(inp, ip, th);
}
to.to_flags = 0;
/*
* Send a timestamp and echo-reply if both our side and our peer
* have sent timestamps in our SYN's and this is not a RST.
*/
if (tw->t_recent && flags == TH_ACK) {
to.to_flags |= TOF_TS;
to.to_tsval = ticks + tw->ts_offset;
to.to_tsecr = tw->t_recent;
}
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
m->m_len = hdrlen + optlen;
m->m_pkthdr.len = m->m_len;
KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
th->th_seq = htonl(tw->snd_nxt);
th->th_ack = htonl(tw->rcv_nxt);
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
th->th_flags = flags;
th->th_win = htons(tw->last_win);
#ifdef INET6
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
{
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
if (V_path_mtu_discovery)
ip->ip_off |= IP_DF;
error = ip_output(m, inp->inp_options, NULL,
((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, inp);
}
if (flags & TH_ACK)
V_tcpstat.tcps_sndacks++;
else
V_tcpstat.tcps_sndctrl++;
V_tcpstat.tcps_sndtotal++;
return (error);
}
static void
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
}
static void
tcp_tw_2msl_stop(struct tcptw *tw)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
}
struct tcptw *
tcp_tw_2msl_scan(int reuse)
{
+ INIT_VNET_INET(curvnet);
struct tcptw *tw;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
for (;;) {
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL || (!reuse && tw->tw_time > ticks))
break;
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, reuse);
if (reuse)
return (tw);
}
return (NULL);
}
Index: head/sys/netinet/tcp_usrreq.c
===================================================================
--- head/sys/netinet/tcp_usrreq.c (revision 183549)
+++ head/sys/netinet/tcp_usrreq.c (revision 183550)
@@ -1,1907 +1,1933 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* Copyright (c) 2006-2007 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif /* INET6 */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/vimage.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet/tcp_offload.h>
/*
* TCP protocol interface to socket abstraction.
*/
static int tcp_attach(struct socket *);
static int tcp_connect(struct tcpcb *, struct sockaddr *,
struct thread *td);
#ifdef INET6
static int tcp6_connect(struct tcpcb *, struct sockaddr *,
struct thread *td);
#endif /* INET6 */
static void tcp_disconnect(struct tcpcb *);
static void tcp_usrclosed(struct tcpcb *);
static void tcp_fill_info(struct tcpcb *, struct tcp_info *);
#ifdef TCPDEBUG
#define TCPDEBUG0 int ostate = 0
#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
tcp_trace(TA_USER, ostate, tp, 0, 0, req)
#else
#define TCPDEBUG0
#define TCPDEBUG1()
#define TCPDEBUG2(req)
#endif
/*
* TCP attaches to socket via pru_attach(), reserving space,
* and an internet control block.
*/
static int
tcp_usr_attach(struct socket *so, int proto, struct thread *td)
{
struct inpcb *inp;
struct tcpcb *tp = NULL;
int error;
TCPDEBUG0;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
TCPDEBUG1();
error = tcp_attach(so);
if (error)
goto out;
if ((so->so_options & SO_LINGER) && so->so_linger == 0)
so->so_linger = TCP_LINGERTIME;
inp = sotoinpcb(so);
tp = intotcpcb(inp);
out:
TCPDEBUG2(PRU_ATTACH);
return (error);
}
/*
* tcp_detach is called when the socket layer loses its final reference
* to the socket, be it a file descriptor reference, a reference from TCP,
* etc. At this point, there is only one case in which we will keep around
* inpcb state: time wait.
*
* This function can probably be re-absorbed back into tcp_usr_detach() now
* that there is a single detach path.
*/
static void
tcp_detach(struct socket *so, struct inpcb *inp)
{
struct tcpcb *tp;
#ifdef INET6
int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
#endif
+#ifdef INVARIANTS
+ INIT_VNET_INET(so->so_vnet);
+#endif
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
tp = intotcpcb(inp);
if (inp->inp_vflag & INP_TIMEWAIT) {
/*
* There are two cases to handle: one in which the time wait
* state is being discarded (INP_DROPPED), and one in which
* this connection will remain in timewait. In the former,
* it is time to discard all state (except tcptw, which has
* already been discarded by the timewait close code, which
* should be further up the call stack somewhere). In the
* latter case, we detach from the socket, but leave the pcb
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
*/
if (inp->inp_vflag & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
"INP_DROPPED && tp != NULL"));
#ifdef INET6
if (isipv6) {
in6_pcbdetach(inp);
in6_pcbfree(inp);
} else {
#endif
in_pcbdetach(inp);
in_pcbfree(inp);
#ifdef INET6
}
#endif
} else {
#ifdef INET6
if (isipv6)
in6_pcbdetach(inp);
else
#endif
in_pcbdetach(inp);
INP_WUNLOCK(inp);
}
} else {
/*
* If the connection is not in timewait, we consider two
* conditions: one in which no further processing is
* necessary (dropped || embryonic), and one in which TCP is
* not yet done, but no longer requires the socket, so the
* pcb will persist for the time being.
*
* XXXRW: Does the second case still occur?
*/
if (inp->inp_vflag & INP_DROPPED ||
tp->t_state < TCPS_SYN_SENT) {
tcp_discardcb(tp);
#ifdef INET6
if (isipv6) {
in6_pcbdetach(inp);
in6_pcbfree(inp);
} else {
#endif
in_pcbdetach(inp);
in_pcbfree(inp);
#ifdef INET6
}
#endif
} else {
#ifdef INET6
if (isipv6)
in6_pcbdetach(inp);
else
#endif
in_pcbdetach(inp);
}
}
}
/*
* pru_detach() detaches the TCP protocol from the socket.
* If the protocol state is non-embryonic, we can't
* do this directly: we have to initiate a pru_disconnect(),
* which may finish later; embryonic TCBs can just
* be discarded here.
*/
static void
tcp_usr_detach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_detach: inp_socket == NULL"));
tcp_detach(so, inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
}
/*
* Give the socket an address.
*/
static int
tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
struct sockaddr_in *sinp;
sinp = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sinp))
return (EINVAL);
/*
* Must check for multicast addresses and disallow binding
* to them.
*/
if (sinp->sin_family == AF_INET &&
IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
return (EAFNOSUPPORT);
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = EINVAL;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
error = in_pcbbind(inp, nam, td->td_ucred);
out:
TCPDEBUG2(PRU_BIND);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#ifdef INET6
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
struct sockaddr_in6 *sin6p;
sin6p = (struct sockaddr_in6 *)nam;
if (nam->sa_len != sizeof (*sin6p))
return (EINVAL);
/*
* Must check for multicast addresses and disallow binding
* to them.
*/
if (sin6p->sin6_family == AF_INET6 &&
IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
return (EAFNOSUPPORT);
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = EINVAL;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
inp->inp_vflag |= INP_IPV4;
else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6p);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
error = in_pcbbind(inp, (struct sockaddr *)&sin,
td->td_ucred);
goto out;
}
}
error = in6_pcbbind(inp, nam, td->td_ucred);
out:
TCPDEBUG2(PRU_BIND);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#endif /* INET6 */
/*
* Prepare to accept connections.
*/
static int
tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = EINVAL;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
SOCK_LOCK(so);
error = solisten_proto_check(so);
if (error == 0 && inp->inp_lport == 0)
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error == 0) {
tp->t_state = TCPS_LISTEN;
solisten_proto(so, backlog);
tcp_offload_listen_open(tp);
}
SOCK_UNLOCK(so);
out:
TCPDEBUG2(PRU_LISTEN);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#ifdef INET6
static int
tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = EINVAL;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
SOCK_LOCK(so);
error = solisten_proto_check(so);
if (error == 0 && inp->inp_lport == 0) {
inp->inp_vflag &= ~INP_IPV4;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
}
if (error == 0) {
tp->t_state = TCPS_LISTEN;
solisten_proto(so, backlog);
}
SOCK_UNLOCK(so);
out:
TCPDEBUG2(PRU_LISTEN);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#endif /* INET6 */
/*
* Initiate connection to peer.
* Create a template for use in transmissions on this connection.
* Enter SYN_SENT state, and mark socket as connecting.
* Start keep-alive timer, and seed output sequence space.
* Send initial segment on connection.
*/
static int
tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
struct sockaddr_in *sinp;
sinp = (struct sockaddr_in *)nam;
if (nam->sa_len != sizeof (*sinp))
return (EINVAL);
/*
* Must disallow TCP ``connections'' to multicast addresses.
*/
if (sinp->sin_family == AF_INET
&& IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
return (EAFNOSUPPORT);
if (jailed(td->td_ucred))
prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = EINVAL;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
if ((error = tcp_connect(tp, nam, td)) != 0)
goto out;
error = tcp_output_connect(so, nam);
out:
TCPDEBUG2(PRU_CONNECT);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#ifdef INET6
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
struct sockaddr_in6 *sin6p;
TCPDEBUG0;
sin6p = (struct sockaddr_in6 *)nam;
if (nam->sa_len != sizeof (*sin6p))
return (EINVAL);
/*
* Must disallow TCP ``connections'' to multicast addresses.
*/
if (sin6p->sin6_family == AF_INET6
&& IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
return (EAFNOSUPPORT);
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = EINVAL;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
struct sockaddr_in sin;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
error = EINVAL;
goto out;
}
in6_sin6_2_sin(&sin, sin6p);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
goto out;
error = tcp_output_connect(so, nam);
goto out;
}
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
inp->inp_inc.inc_isipv6 = 1;
if ((error = tcp6_connect(tp, nam, td)) != 0)
goto out;
error = tcp_output_connect(so, nam);
out:
TCPDEBUG2(PRU_CONNECT);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
#endif /* INET6 */
/*
* Initiate disconnect from peer.
* If connection never passed embryonic stage, just drop;
* else if we don't need to let data drain, we can just drop anyway,
* else have to begin TCP shutdown process: mark socket disconnecting,
* drain unread data, state switch to reflect user close, and
* send segment (e.g. FIN) to peer. Socket will be really disconnected
* when peer sends FIN and acks ours.
*
* SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
*/
static int
tcp_usr_disconnect(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
struct tcpcb *tp = NULL;
int error = 0;
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNRESET;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
tcp_disconnect(tp);
out:
TCPDEBUG2(PRU_DISCONNECT);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
/*
* Accept a connection. Essentially all the work is
* done at higher levels; just return the address
* of the peer, storing through addr.
*/
static int
tcp_usr_accept(struct socket *so, struct sockaddr **nam)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp = NULL;
struct tcpcb *tp = NULL;
struct in_addr addr;
in_port_t port = 0;
TCPDEBUG0;
if (so->so_state & SS_ISDISCONNECTED)
return (ECONNABORTED);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
INP_INFO_RLOCK(&V_tcbinfo);
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNABORTED;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
/*
* We inline in_getpeeraddr and COMMON_END here, so that we can
* copy the data of interest and defer the malloc until after we
* release the lock.
*/
port = inp->inp_fport;
addr = inp->inp_faddr;
out:
TCPDEBUG2(PRU_ACCEPT);
INP_WUNLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
if (error == 0)
*nam = in_sockaddr(port, &addr);
return (error);
}
#ifdef INET6
static int
tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp = NULL;
int error = 0;
struct tcpcb *tp = NULL;
struct in_addr addr;
struct in6_addr addr6;
in_port_t port = 0;
int v4 = 0;
TCPDEBUG0;
if (so->so_state & SS_ISDISCONNECTED)
return (ECONNABORTED);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNABORTED;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
/*
* We inline in6_mapped_peeraddr and COMMON_END here, so that we can
* copy the data of interest and defer the malloc until after we
* release the lock.
*/
if (inp->inp_vflag & INP_IPV4) {
v4 = 1;
port = inp->inp_fport;
addr = inp->inp_faddr;
} else {
port = inp->inp_fport;
addr6 = inp->in6p_faddr;
}
out:
TCPDEBUG2(PRU_ACCEPT);
INP_WUNLOCK(inp);
if (error == 0) {
if (v4)
*nam = in6_v4mapsin6_sockaddr(port, &addr);
else
*nam = in6_sockaddr(port, &addr6);
}
return (error);
}
#endif /* INET6 */
/*
* Mark the connection as being incapable of further output.
*/
static int
tcp_usr_shutdown(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
TCPDEBUG0;
INP_INFO_WLOCK(&V_tcbinfo);
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNRESET;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
socantsendmore(so);
tcp_usrclosed(tp);
error = tcp_output_disconnect(tp);
out:
TCPDEBUG2(PRU_SHUTDOWN);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
/*
* After a receive, possibly send window update to peer.
*/
static int
tcp_usr_rcvd(struct socket *so, int flags)
{
struct inpcb *inp;
struct tcpcb *tp = NULL;
int error = 0;
TCPDEBUG0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNRESET;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
tcp_output_rcvd(tp);
out:
TCPDEBUG2(PRU_RCVD);
INP_WUNLOCK(inp);
return (error);
}
/*
* Do a send by putting data in output queue and updating urgent
* marker if URG set. Possibly send more data. Unlike the other
* pru_*() routines, the mbuf chains are our responsibility. We
* must either enqueue them or free them. The other pru_* routines
* generally are caller-frees.
*/
static int
tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
int headlocked = 0;
#ifdef INET6
int isipv6;
#endif
TCPDEBUG0;
/*
* We require the pcbinfo lock in two cases:
*
* (1) An implied connect is taking place, which can result in
* binding IPs and ports and hence modification of the pcb hash
* chains.
*
* (2) PRUS_EOF is set, resulting in explicit close on the send.
*/
if ((nam != NULL) || (flags & PRUS_EOF)) {
INP_INFO_WLOCK(&V_tcbinfo);
headlocked = 1;
}
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
if (control)
m_freem(control);
if (m)
m_freem(m);
error = ECONNRESET;
goto out;
}
#ifdef INET6
isipv6 = nam && nam->sa_family == AF_INET6;
#endif /* INET6 */
tp = intotcpcb(inp);
TCPDEBUG1();
if (control) {
/* TCP doesn't do control messages (rights, creds, etc) */
if (control->m_len) {
m_freem(control);
if (m)
m_freem(m);
error = EINVAL;
goto out;
}
m_freem(control); /* empty control, just free it */
}
if (!(flags & PRUS_OOB)) {
sbappendstream(&so->so_snd, m);
if (nam && tp->t_state < TCPS_SYN_SENT) {
/*
* Do implied connect if not yet connected,
* initialize window to default value, and
* initialize maxseg/maxopd using peer's cached
* MSS.
*/
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef INET6
if (isipv6)
error = tcp6_connect(tp, nam, td);
else
#endif /* INET6 */
error = tcp_connect(tp, nam, td);
if (error)
goto out;
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
}
if (flags & PRUS_EOF) {
/*
* Close the send side of the connection after
* the data is sent.
*/
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
socantsendmore(so);
tcp_usrclosed(tp);
}
if (headlocked) {
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
}
if (tp != NULL) {
if (flags & PRUS_MORETOCOME)
tp->t_flags |= TF_MORETOCOME;
error = tcp_output_send(tp);
if (flags & PRUS_MORETOCOME)
tp->t_flags &= ~TF_MORETOCOME;
}
} else {
/*
* XXXRW: PRUS_EOF not implemented with PRUS_OOB?
*/
SOCKBUF_LOCK(&so->so_snd);
if (sbspace(&so->so_snd) < -512) {
SOCKBUF_UNLOCK(&so->so_snd);
m_freem(m);
error = ENOBUFS;
goto out;
}
/*
* According to RFC961 (Assigned Protocols),
* the urgent pointer points to the last octet
* of urgent data. We continue, however,
* to consider it to indicate the first octet
* of data past the urgent section.
* Otherwise, snd_up should be one lower.
*/
sbappendstream_locked(&so->so_snd, m);
SOCKBUF_UNLOCK(&so->so_snd);
if (nam && tp->t_state < TCPS_SYN_SENT) {
/*
* Do implied connect if not yet connected,
* initialize window to default value, and
* initialize maxseg/maxopd using peer's cached
* MSS.
*/
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
#ifdef INET6
if (isipv6)
error = tcp6_connect(tp, nam, td);
else
#endif /* INET6 */
error = tcp_connect(tp, nam, td);
if (error)
goto out;
tp->snd_wnd = TTCP_CLIENT_SND_WND;
tcp_mss(tp, -1);
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
} else if (nam) {
INP_INFO_WUNLOCK(&V_tcbinfo);
headlocked = 0;
}
tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
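/*
 * snd_up now points one byte past the data just queued, matching the
 * interpretation above: everything currently in the send buffer is
 * treated as urgent.
 */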
tp->t_flags |= TF_FORCEDATA;
error = tcp_output_send(tp);
tp->t_flags &= ~TF_FORCEDATA;
}
out:
TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
INP_WUNLOCK(inp);
if (headlocked)
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
/*
* Abort the TCP. Drop the connection abruptly.
*/
static void
tcp_usr_abort(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
struct tcpcb *tp = NULL;
TCPDEBUG0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_abort: inp_socket == NULL"));
/*
* If we still have full TCP state, and we're not dropped, drop.
*/
if (!(inp->inp_vflag & INP_TIMEWAIT) &&
!(inp->inp_vflag & INP_DROPPED)) {
tp = intotcpcb(inp);
TCPDEBUG1();
tcp_drop(tp, ECONNABORTED);
TCPDEBUG2(PRU_ABORT);
}
if (!(inp->inp_vflag & INP_DROPPED)) {
SOCK_LOCK(so);
so->so_state |= SS_PROTOREF;
SOCK_UNLOCK(so);
inp->inp_vflag |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
}
/*
* TCP socket is closed. Start friendly disconnect.
*/
static void
tcp_usr_close(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
struct tcpcb *tp = NULL;
TCPDEBUG0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
INP_INFO_WLOCK(&V_tcbinfo);
INP_WLOCK(inp);
KASSERT(inp->inp_socket != NULL,
("tcp_usr_close: inp_socket == NULL"));
/*
* If we still have full TCP state, and we're not dropped, initiate
* a disconnect.
*/
if (!(inp->inp_vflag & INP_TIMEWAIT) &&
!(inp->inp_vflag & INP_DROPPED)) {
tp = intotcpcb(inp);
TCPDEBUG1();
tcp_disconnect(tp);
TCPDEBUG2(PRU_CLOSE);
}
if (!(inp->inp_vflag & INP_DROPPED)) {
SOCK_LOCK(so);
so->so_state |= SS_PROTOREF;
SOCK_UNLOCK(so);
inp->inp_vflag |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
}
/*
* Receive out-of-band data.
*/
static int
tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
int error = 0;
struct inpcb *inp;
struct tcpcb *tp = NULL;
TCPDEBUG0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
INP_WLOCK(inp);
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
error = ECONNRESET;
goto out;
}
tp = intotcpcb(inp);
TCPDEBUG1();
if ((so->so_oobmark == 0 &&
(so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
so->so_options & SO_OOBINLINE ||
tp->t_oobflags & TCPOOB_HADDATA) {
error = EINVAL;
goto out;
}
if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
error = EWOULDBLOCK;
goto out;
}
m->m_len = 1;
*mtod(m, caddr_t) = tp->t_iobc;
if ((flags & MSG_PEEK) == 0)
tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
out:
TCPDEBUG2(PRU_RCVOOB);
INP_WUNLOCK(inp);
return (error);
}
struct pr_usrreqs tcp_usrreqs = {
.pru_abort = tcp_usr_abort,
.pru_accept = tcp_usr_accept,
.pru_attach = tcp_usr_attach,
.pru_bind = tcp_usr_bind,
.pru_connect = tcp_usr_connect,
.pru_control = in_control,
.pru_detach = tcp_usr_detach,
.pru_disconnect = tcp_usr_disconnect,
.pru_listen = tcp_usr_listen,
.pru_peeraddr = in_getpeeraddr,
.pru_rcvd = tcp_usr_rcvd,
.pru_rcvoob = tcp_usr_rcvoob,
.pru_send = tcp_usr_send,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in_getsockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = tcp_usr_close,
};
#ifdef INET6
struct pr_usrreqs tcp6_usrreqs = {
.pru_abort = tcp_usr_abort,
.pru_accept = tcp6_usr_accept,
.pru_attach = tcp_usr_attach,
.pru_bind = tcp6_usr_bind,
.pru_connect = tcp6_usr_connect,
.pru_control = in6_control,
.pru_detach = tcp_usr_detach,
.pru_disconnect = tcp_usr_disconnect,
.pru_listen = tcp6_usr_listen,
.pru_peeraddr = in6_mapped_peeraddr,
.pru_rcvd = tcp_usr_rcvd,
.pru_rcvoob = tcp_usr_rcvoob,
.pru_send = tcp_usr_send,
.pru_shutdown = tcp_usr_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = tcp_usr_close,
};
#endif /* INET6 */
/*
* Common subroutine to open a TCP connection to the remote host specified
* by the struct sockaddr_in pointed to by nam. Call in_pcbbind to assign a local
* port number if needed. Call in_pcbconnect_setup to do the routing and
* to choose a local host address (interface). If there is an existing
* incarnation of the same connection in TIME-WAIT state and if the remote
* host was sending CC options and if the connection duration was < MSL, then
* truncate the previous TIME-WAIT state and proceed.
* Initialize connection parameters and enter SYN-SENT state.
*/
static int
tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp = tp->t_inpcb, *oinp;
struct socket *so = inp->inp_socket;
+ INIT_VNET_INET(so->so_vnet);
struct in_addr laddr;
u_short lport;
int error;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error)
return error;
}
/*
* Cannot simply call in_pcbconnect, because there might be an
* earlier incarnation of this same connection still in
* TIME_WAIT state, creating an ADDRINUSE error.
*/
laddr = inp->inp_laddr;
lport = inp->inp_lport;
error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
&inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
if (error && oinp == NULL)
return error;
if (oinp)
return EADDRINUSE;
inp->inp_laddr = laddr;
in_pcbrehash(inp);
/*
* Compute window scaling to request:
* Scale to fit into sweet spot. See tcp_syncache.c.
* XXX: This should move to tcp_output().
*/
while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << tp->request_r_scale) < sb_max)
tp->request_r_scale++;
soisconnecting(so);
V_tcpstat.tcps_connattempt++;
tp->t_state = TCPS_SYN_SENT;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tp->iss = tcp_new_isn(tp);
tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
return 0;
}
#ifdef INET6
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
{
struct inpcb *inp = tp->t_inpcb, *oinp;
struct socket *so = inp->inp_socket;
+ INIT_VNET_INET(so->so_vnet);
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
struct in6_addr *addr6;
int error;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
if (error)
return error;
}
/*
* Cannot simply call in_pcbconnect, because there might be an
* earlier incarnation of this same connection still in
* TIME_WAIT state, creating an ADDRINUSE error.
* in6_pcbladdr() also handles scope zone IDs.
*/
error = in6_pcbladdr(inp, nam, &addr6);
if (error)
return error;
oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
&sin6->sin6_addr, sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
? addr6
: &inp->in6p_laddr,
inp->inp_lport, 0, NULL);
if (oinp)
return EADDRINUSE;
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
inp->in6p_laddr = *addr6;
inp->in6p_faddr = sin6->sin6_addr;
inp->inp_fport = sin6->sin6_port;
/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
inp->in6p_flowinfo |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
in_pcbrehash(inp);
/* Compute window scaling to request. */
while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
tp->request_r_scale++;
soisconnecting(so);
V_tcpstat.tcps_connattempt++;
tp->t_state = TCPS_SYN_SENT;
tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
tp->iss = tcp_new_isn(tp);
tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
return 0;
}
#endif /* INET6 */
/*
* Export TCP internal state information via a struct tcp_info, based on the
* Linux 2.6 API. Not ABI compatible as our constants are mapped differently
* (TCP state machine, etc). We export all information using FreeBSD-native
* constants -- for example, the numeric values for tcpi_state will differ
* from Linux.
*/
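/*
 * Userland reads this structure with getsockopt(2) at level IPPROTO_TCP
 * (sketch):
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("cwnd: %u\n", (unsigned)ti.tcpi_snd_cwnd);
 */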
static void
tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
{
INP_WLOCK_ASSERT(tp->t_inpcb);
bzero(ti, sizeof(*ti));
ti->tcpi_state = tp->t_state;
if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
if (tp->t_flags & TF_SACK_PERMIT)
ti->tcpi_options |= TCPI_OPT_SACK;
if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
ti->tcpi_options |= TCPI_OPT_WSCALE;
ti->tcpi_snd_wscale = tp->snd_scale;
ti->tcpi_rcv_wscale = tp->rcv_scale;
}
ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
ti->tcpi_snd_cwnd = tp->snd_cwnd;
/*
* FreeBSD-specific extension fields for tcp_info.
*/
ti->tcpi_rcv_space = tp->rcv_wnd;
ti->tcpi_rcv_nxt = tp->rcv_nxt;
ti->tcpi_snd_wnd = tp->snd_wnd;
ti->tcpi_snd_bwnd = tp->snd_bwnd;
ti->tcpi_snd_nxt = tp->snd_nxt;
ti->__tcpi_snd_mss = tp->t_maxseg;
ti->__tcpi_rcv_mss = tp->t_maxseg;
if (tp->t_flags & TF_TOE)
ti->tcpi_options |= TCPI_OPT_TOE;
}
/*
* tcp_ctloutput() must drop the inpcb lock before performing copyin on
* socket option arguments. When it re-acquires the lock after the copy, it
* has to revalidate that the connection is still valid for the socket
* option.
*/
#define INP_WLOCK_RECHECK(inp) do { \
INP_WLOCK(inp); \
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { \
INP_WUNLOCK(inp); \
return (ECONNRESET); \
} \
tp = intotcpcb(inp); \
} while(0)
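/*
 * Typical caller pattern (sketch), mirroring the option handlers below:
 *
 *	INP_WUNLOCK(inp);
 *	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 *	if (error)
 *		return (error);
 *	INP_WLOCK_RECHECK(inp);
 *	... update tp ...
 *	INP_WUNLOCK(inp);
 */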
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
+ INIT_VNET_INET(so->so_vnet);
int error, opt, optval;
struct inpcb *inp;
struct tcpcb *tp;
struct tcp_info ti;
error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
INP_WLOCK(inp);
if (sopt->sopt_level != IPPROTO_TCP) {
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6)) {
INP_WUNLOCK(inp);
error = ip6_ctloutput(so, sopt);
} else {
#endif /* INET6 */
INP_WUNLOCK(inp);
error = ip_ctloutput(so, sopt);
#ifdef INET6
}
#endif
return (error);
}
if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
return (ECONNRESET);
}
switch (sopt->sopt_dir) {
case SOPT_SET:
switch (sopt->sopt_name) {
#ifdef TCP_SIGNATURE
case TCP_MD5SIG:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
return (error);
INP_WLOCK_RECHECK(inp);
if (optval > 0)
tp->t_flags |= TF_SIGNATURE;
else
tp->t_flags &= ~TF_SIGNATURE;
INP_WUNLOCK(inp);
break;
#endif /* TCP_SIGNATURE */
case TCP_NODELAY:
case TCP_NOOPT:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
return (error);
INP_WLOCK_RECHECK(inp);
switch (sopt->sopt_name) {
case TCP_NODELAY:
opt = TF_NODELAY;
break;
case TCP_NOOPT:
opt = TF_NOOPT;
break;
default:
opt = 0; /* dead code to fool gcc */
break;
}
if (optval)
tp->t_flags |= opt;
else
tp->t_flags &= ~opt;
INP_WUNLOCK(inp);
break;
case TCP_NOPUSH:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
return (error);
INP_WLOCK_RECHECK(inp);
if (optval)
tp->t_flags |= TF_NOPUSH;
else {
tp->t_flags &= ~TF_NOPUSH;
error = tcp_output(tp);
}
INP_WUNLOCK(inp);
break;
case TCP_MAXSEG:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &optval, sizeof optval,
sizeof optval);
if (error)
return (error);
INP_WLOCK_RECHECK(inp);
if (optval > 0 && optval <= tp->t_maxseg &&
optval + 40 >= V_tcp_minmss)
tp->t_maxseg = optval;
else
error = EINVAL;
INP_WUNLOCK(inp);
break;
case TCP_INFO:
INP_WUNLOCK(inp);
error = EINVAL;
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
break;
}
break;
case SOPT_GET:
tp = intotcpcb(inp);
switch (sopt->sopt_name) {
#ifdef TCP_SIGNATURE
case TCP_MD5SIG:
optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
#endif
case TCP_NODELAY:
optval = tp->t_flags & TF_NODELAY;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
case TCP_MAXSEG:
optval = tp->t_maxseg;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
case TCP_NOOPT:
optval = tp->t_flags & TF_NOOPT;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
case TCP_NOPUSH:
optval = tp->t_flags & TF_NOPUSH;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
case TCP_INFO:
tcp_fill_info(tp, &ti);
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &ti, sizeof ti);
break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
#undef INP_WLOCK_RECHECK
/*
* tcp_sendspace and tcp_recvspace are the default send and receive window
* sizes, respectively. These are obsolescent (this information should
* be set by the route).
*/
u_long tcp_sendspace = 1024*32;
SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
&tcp_sendspace, 0, "Maximum outgoing TCP datagram size");
u_long tcp_recvspace = 1024*64;
SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
&tcp_recvspace, 0, "Maximum incoming TCP datagram size");
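/*
 * Both defaults can be tuned at runtime with sysctl(8), e.g.
 * "sysctl net.inet.tcp.sendspace=65536" (illustrative value).
 */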
/*
* Attach TCP protocol to socket, allocating
* internet protocol control block, tcp control block,
* buffer space, and entering LISTEN state if connections are to be accepted.
*/
static int
tcp_attach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct tcpcb *tp;
struct inpcb *inp;
int error;
#ifdef INET6
int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
#endif
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, tcp_sendspace, tcp_recvspace);
if (error)
return (error);
}
so->so_rcv.sb_flags |= SB_AUTOSIZE;
so->so_snd.sb_flags |= SB_AUTOSIZE;
INP_INFO_WLOCK(&V_tcbinfo);
error = in_pcballoc(so, &V_tcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_tcbinfo);
return (error);
}
inp = sotoinpcb(so);
#ifdef INET6
if (isipv6) {
inp->inp_vflag |= INP_IPV6;
inp->in6p_hops = -1; /* use kernel default */
}
else
#endif
inp->inp_vflag |= INP_IPV4;
tp = tcp_newtcpcb(inp);
if (tp == NULL) {
#ifdef INET6
if (isipv6) {
in6_pcbdetach(inp);
in6_pcbfree(inp);
} else {
#endif
in_pcbdetach(inp);
in_pcbfree(inp);
#ifdef INET6
}
#endif
INP_INFO_WUNLOCK(&V_tcbinfo);
return (ENOBUFS);
}
tp->t_state = TCPS_CLOSED;
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_tcbinfo);
return (0);
}
/*
* Initiate (or continue) disconnect.
* If embryonic state, just send reset (once).
* If SO_LINGER is set with a zero linger time, just drop.
* Otherwise (hard), mark socket disconnecting and drop
* current input data; switch states based on user close, and
* send segment to peer (with FIN).
*/
static void
tcp_disconnect(struct tcpcb *tp)
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
+#ifdef INVARIANTS
+ INIT_VNET_INET(so->so_vnet);
+#endif
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
* Neither tcp_close() nor tcp_drop() should return NULL, as the
* socket is still open.
*/
if (tp->t_state < TCPS_ESTABLISHED) {
tp = tcp_close(tp);
KASSERT(tp != NULL,
("tcp_disconnect: tcp_close() returned NULL"));
} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
tp = tcp_drop(tp, 0);
KASSERT(tp != NULL,
("tcp_disconnect: tcp_drop() returned NULL"));
} else {
soisdisconnecting(so);
sbflush(&so->so_rcv);
tcp_usrclosed(tp);
if (!(inp->inp_vflag & INP_DROPPED))
tcp_output_disconnect(tp);
}
}
/*
* User issued close, and wishes to trail through shutdown states:
* if never received SYN, just forget it. If got a SYN from peer,
* but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
* If already got a FIN from peer, then almost done; go to LAST_ACK
* state. In all other cases, have already sent FIN to peer (e.g.
* after PRU_SHUTDOWN), and just have to play tedious game waiting
* for peer to send FIN or not respond to keep-alives, etc.
* We can let the user exit from the close as soon as the FIN is acked.
*/
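/*
 * Summary of the transitions below:
 *	LISTEN, CLOSED		-> pcb discarded via tcp_close()
 *	SYN_SENT, SYN_RECEIVED	-> TF_NEEDFIN set, FIN deferred
 *	ESTABLISHED		-> FIN_WAIT_1
 *	CLOSE_WAIT		-> LAST_ACK
 */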
static void
tcp_usrclosed(struct tcpcb *tp)
{
+#ifdef INVARIANTS
+ INIT_VNET_INET(tp->t_inpcb->inp_vnet);
+#endif
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
switch (tp->t_state) {
case TCPS_LISTEN:
tcp_offload_listen_close(tp);
/* FALLTHROUGH */
case TCPS_CLOSED:
tp->t_state = TCPS_CLOSED;
tp = tcp_close(tp);
/*
* tcp_close() should never return NULL here as the socket is
* still open.
*/
KASSERT(tp != NULL,
("tcp_usrclosed: tcp_close() returned NULL"));
break;
case TCPS_SYN_SENT:
case TCPS_SYN_RECEIVED:
tp->t_flags |= TF_NEEDFIN;
break;
case TCPS_ESTABLISHED:
tp->t_state = TCPS_FIN_WAIT_1;
break;
case TCPS_CLOSE_WAIT:
tp->t_state = TCPS_LAST_ACK;
break;
}
if (tp->t_state >= TCPS_FIN_WAIT_2) {
soisdisconnected(tp->t_inpcb->inp_socket);
/* Prevent the connection hanging in FIN_WAIT_2 forever. */
if (tp->t_state == TCPS_FIN_WAIT_2) {
int timeout;
timeout = (tcp_fast_finwait2_recycle) ?
tcp_finwait2_timeout : tcp_maxidle;
tcp_timer_activate(tp, TT_2MSL, timeout);
}
}
}
#ifdef DDB
static void
db_print_indent(int indent)
{
int i;
for (i = 0; i < indent; i++)
db_printf(" ");
}
static void
db_print_tstate(int t_state)
{
switch (t_state) {
case TCPS_CLOSED:
db_printf("TCPS_CLOSED");
return;
case TCPS_LISTEN:
db_printf("TCPS_LISTEN");
return;
case TCPS_SYN_SENT:
db_printf("TCPS_SYN_SENT");
return;
case TCPS_SYN_RECEIVED:
db_printf("TCPS_SYN_RECEIVED");
return;
case TCPS_ESTABLISHED:
db_printf("TCPS_ESTABLISHED");
return;
case TCPS_CLOSE_WAIT:
db_printf("TCPS_CLOSE_WAIT");
return;
case TCPS_FIN_WAIT_1:
db_printf("TCPS_FIN_WAIT_1");
return;
case TCPS_CLOSING:
db_printf("TCPS_CLOSING");
return;
case TCPS_LAST_ACK:
db_printf("TCPS_LAST_ACK");
return;
case TCPS_FIN_WAIT_2:
db_printf("TCPS_FIN_WAIT_2");
return;
case TCPS_TIME_WAIT:
db_printf("TCPS_TIME_WAIT");
return;
default:
db_printf("unknown");
return;
}
}
static void
db_print_tflags(u_int t_flags)
{
int comma;
comma = 0;
if (t_flags & TF_ACKNOW) {
db_printf("%sTF_ACKNOW", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_DELACK) {
db_printf("%sTF_DELACK", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_NODELAY) {
db_printf("%sTF_NODELAY", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_NOOPT) {
db_printf("%sTF_NOOPT", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_SENTFIN) {
db_printf("%sTF_SENTFIN", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_REQ_SCALE) {
db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_RCVD_SCALE) {
db_printf("%sTF_RECVD_SCALE", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_REQ_TSTMP) {
db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_RCVD_TSTMP) {
db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_SACK_PERMIT) {
db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_NEEDSYN) {
db_printf("%sTF_NEEDSYN", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_NEEDFIN) {
db_printf("%sTF_NEEDFIN", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_NOPUSH) {
db_printf("%sTF_NOPUSH", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_MORETOCOME) {
db_printf("%sTF_MORETOCOME", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_LQ_OVERFLOW) {
db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_LASTIDLE) {
db_printf("%sTF_LASTIDLE", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_RXWIN0SENT) {
db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_FASTRECOVERY) {
db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_WASFRECOVERY) {
db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_SIGNATURE) {
db_printf("%sTF_SIGNATURE", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_FORCEDATA) {
db_printf("%sTF_FORCEDATA", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_TSO) {
db_printf("%sTF_TSO", comma ? ", " : "");
comma = 1;
}
if (t_flags & TF_ECN_PERMIT) {
db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_toobflags(char t_oobflags)
{
int comma;
comma = 0;
if (t_oobflags & TCPOOB_HAVEDATA) {
db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
comma = 1;
}
if (t_oobflags & TCPOOB_HADDATA) {
db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
comma = 1;
}
}
static void
db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
{
db_print_indent(indent);
db_printf("%s at %p\n", name, tp);
indent += 2;
db_print_indent(indent);
db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
db_print_indent(indent);
db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n",
&tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
db_print_indent(indent);
db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl,
&tp->t_timers->tt_delack, tp->t_inpcb);
db_print_indent(indent);
db_printf("t_state: %d (", tp->t_state);
db_print_tstate(tp->t_state);
db_printf(")\n");
db_print_indent(indent);
db_printf("t_flags: 0x%x (", tp->t_flags);
db_print_tflags(tp->t_flags);
db_printf(")\n");
db_print_indent(indent);
db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n",
tp->snd_una, tp->snd_max, tp->snd_nxt);
db_print_indent(indent);
db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n",
tp->snd_up, tp->snd_wl1, tp->snd_wl2);
db_print_indent(indent);
db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n",
tp->iss, tp->irs, tp->rcv_nxt);
db_print_indent(indent);
db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n",
tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
db_print_indent(indent);
db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n",
tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
db_print_indent(indent);
db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: "
"0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
tp->snd_recover);
db_print_indent(indent);
db_printf("t_maxopd: %u t_rcvtime: %lu t_startime: %lu\n",
tp->t_maxopd, tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
db_printf("t_rttime: %d t_rtsq: 0x%08x t_bw_rtttime: %d\n",
tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime);
db_print_indent(indent);
db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u "
"t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg,
tp->t_srtt);
db_print_indent(indent);
db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u "
"t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
tp->t_rttbest);
db_print_indent(indent);
db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n",
tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
db_print_indent(indent);
db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
db_print_toobflags(tp->t_oobflags);
db_printf(") t_iobc: 0x%02x\n", tp->t_iobc);
db_print_indent(indent);
db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n",
tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
db_print_indent(indent);
db_printf("ts_recent: %u ts_recent_age: %lu\n",
tp->ts_recent, tp->ts_recent_age);
db_print_indent(indent);
db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: "
"%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
db_print_indent(indent);
db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x "
"t_badrxtwin: %lu\n", tp->snd_ssthresh_prev,
tp->snd_recover_prev, tp->t_badrxtwin);
db_print_indent(indent);
db_printf("snd_numholes: %d snd_holes first: %p\n",
tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
db_print_indent(indent);
db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: "
"0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata);
/* Skip sackblks, sackhint. */
db_print_indent(indent);
db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n",
tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
}
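/*
 * DDB usage: "show tcpcb <addr>", where <addr> is the kernel address of
 * a struct tcpcb.
 */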
DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
{
struct tcpcb *tp;
if (!have_addr) {
db_printf("usage: show tcpcb <addr>\n");
return;
}
tp = (struct tcpcb *)addr;
db_print_tcpcb(tp, "tcpcb", 0);
}
#endif
Index: head/sys/netinet/udp_usrreq.c
===================================================================
--- head/sys/netinet/udp_usrreq.c (revision 183549)
+++ head/sys/netinet/udp_usrreq.c (revision 183550)
@@ -1,1267 +1,1283 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2008 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipfw.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#endif
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
/*
* BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums
* removes the only data integrity mechanism for packets; malformed packets
* that would otherwise be discarded due to bad checksums are instead
* delivered, which may cause problems (especially for NFS data blocks).
*/
static int udp_cksum = 1;
SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum,
0, "compute udp checksum");
int udp_log_in_vain = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW,
&udp_log_in_vain, 0, "Log all incoming UDP packets");
int udp_blackhole = 0;
SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, &udp_blackhole, 0,
"Do not send port unreachables for refused connects");
u_long udp_sendspace = 9216; /* really max datagram size */
SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
&udp_sendspace, 0, "Maximum outgoing UDP datagram size");
u_long udp_recvspace = 40 * (1024 + /* 40 1K datagrams */
#ifdef INET6
sizeof(struct sockaddr_in6)
#else
sizeof(struct sockaddr_in)
#endif
);
SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
&udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
struct inpcbhead udb; /* from udp_var.h */
struct inpcbinfo udbinfo;
#ifndef UDBHASHSIZE
#define UDBHASHSIZE 128
#endif
struct udpstat udpstat; /* from udp_var.h */
-SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, &udpstat,
- udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
+SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_udp, UDPCTL_STATS, stats,
+ CTLFLAG_RW, udpstat, udpstat,
+ "UDP statistics (struct udpstat, netinet/udp_var.h)");
static void udp_detach(struct socket *so);
static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
struct mbuf *, struct thread *);
static void
udp_zone_change(void *tag)
{
uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
}
static int
udp_inpcb_init(void *mem, int size, int flags)
{
struct inpcb *inp;
inp = mem;
INP_LOCK_INIT(inp, "inp", "udpinp");
return (0);
}
void
udp_init(void)
{
+ INIT_VNET_INET(curvnet);
INP_INFO_LOCK_INIT(&V_udbinfo, "udp");
LIST_INIT(&V_udb);
V_udbinfo.ipi_listhead = &V_udb;
V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB,
&V_udbinfo.ipi_hashmask);
V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB,
&V_udbinfo.ipi_porthashmask);
V_udbinfo.ipi_zone = uma_zcreate("udpcb", sizeof(struct inpcb), NULL,
NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
EVENTHANDLER_PRI_ANY);
}
/*
* Subroutine of udp_input(), which appends the provided mbuf chain to the
* passed pcb/socket. The caller must provide a sockaddr_in via udp_in that
* contains the source address. If the socket ends up being an IPv6 socket,
* udp_append() will convert to a sockaddr_in6 before passing the address
* into the socket code.
*/
static void
udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
struct sockaddr_in *udp_in)
{
struct sockaddr *append_sa;
struct socket *so;
struct mbuf *opts = 0;
#ifdef INET6
struct sockaddr_in6 udp_in6;
#endif
INP_RLOCK_ASSERT(inp);
#ifdef IPSEC
/* Check AH/ESP integrity. */
if (ipsec4_in_reject(n, inp)) {
+ INIT_VNET_IPSEC(curvnet);
m_freem(n);
V_ipsec4stat.in_polvio++;
return;
}
#endif /* IPSEC */
#ifdef MAC
if (mac_inpcb_check_deliver(inp, n) != 0) {
m_freem(n);
return;
}
#endif
if (inp->inp_flags & INP_CONTROLOPTS ||
inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
#ifdef INET6
if (inp->inp_vflag & INP_IPV6)
(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
else
#endif
ip_savecontrol(inp, &opts, ip, n);
}
#ifdef INET6
if (inp->inp_vflag & INP_IPV6) {
bzero(&udp_in6, sizeof(udp_in6));
udp_in6.sin6_len = sizeof(udp_in6);
udp_in6.sin6_family = AF_INET6;
in6_sin_2_v4mapsin6(udp_in, &udp_in6);
append_sa = (struct sockaddr *)&udp_in6;
} else
#endif
append_sa = (struct sockaddr *)udp_in;
m_adj(n, off);
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_rcv);
if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
+ INIT_VNET_INET(so->so_vnet);
SOCKBUF_UNLOCK(&so->so_rcv);
m_freem(n);
if (opts)
m_freem(opts);
V_udpstat.udps_fullsock++;
} else
sorwakeup_locked(so);
}
void
udp_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET(curvnet);
int iphlen = off;
struct ip *ip;
struct udphdr *uh;
struct ifnet *ifp;
struct inpcb *inp;
int len;
struct ip save_ip;
struct sockaddr_in udp_in;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
#endif
ifp = m->m_pkthdr.rcvif;
V_udpstat.udps_ipackets++;
/*
* Strip IP options, if any; should skip this, make available to
* user, and use on returned packets, but we don't yet have a way to
* check the checksum with options still present.
*/
if (iphlen > sizeof (struct ip)) {
ip_stripoptions(m, (struct mbuf *)0);
iphlen = sizeof(struct ip);
}
/*
* Get IP and UDP header together in first mbuf.
*/
ip = mtod(m, struct ip *);
if (m->m_len < iphlen + sizeof(struct udphdr)) {
if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) {
V_udpstat.udps_hdrops++;
return;
}
ip = mtod(m, struct ip *);
}
uh = (struct udphdr *)((caddr_t)ip + iphlen);
/*
* Destination port of 0 is illegal, based on RFC768.
*/
if (uh->uh_dport == 0)
goto badunlocked;
/*
* Construct sockaddr format source address. Stuff source address
* and datagram in user buffer.
*/
bzero(&udp_in, sizeof(udp_in));
udp_in.sin_len = sizeof(udp_in);
udp_in.sin_family = AF_INET;
udp_in.sin_port = uh->uh_sport;
udp_in.sin_addr = ip->ip_src;
/*
* Make mbuf data length reflect UDP length. If not enough data to
* reflect UDP length, drop.
*/
len = ntohs((u_short)uh->uh_ulen);
if (ip->ip_len != len) {
if (len > ip->ip_len || len < sizeof(struct udphdr)) {
V_udpstat.udps_badlen++;
goto badunlocked;
}
m_adj(m, len - ip->ip_len);
/* ip->ip_len = len; */
}
/*
* Save a copy of the IP header in case we want to restore it for
* sending an ICMP error message in response.
*/
if (!V_udp_blackhole)
save_ip = *ip;
else
memset(&save_ip, 0, sizeof(save_ip));
/*
* Checksum extended UDP header and data.
*/
if (uh->uh_sum) {
u_short uh_sum;
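/*
 * Hardware may have verified the sum (CSUM_DATA_VALID), with or without
 * the pseudo-header folded in (CSUM_PSEUDO_HDR); otherwise checksum in
 * software using an ipovly pseudo-header. Either way, a nonzero uh_sum
 * below indicates a corrupt datagram.
 */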
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
uh_sum = m->m_pkthdr.csum_data;
else
uh_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htonl((u_short)len +
m->m_pkthdr.csum_data + IPPROTO_UDP));
uh_sum ^= 0xffff;
} else {
char b[9];
bcopy(((struct ipovly *)ip)->ih_x1, b, 9);
bzero(((struct ipovly *)ip)->ih_x1, 9);
((struct ipovly *)ip)->ih_len = uh->uh_ulen;
uh_sum = in_cksum(m, len + sizeof (struct ip));
bcopy(b, ((struct ipovly *)ip)->ih_x1, 9);
}
if (uh_sum) {
V_udpstat.udps_badsum++;
m_freem(m);
return;
}
} else
V_udpstat.udps_nosum++;
#ifdef IPFIREWALL_FORWARD
/*
* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
*/
fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
if (fwd_tag != NULL) {
struct sockaddr_in *next_hop;
/*
* Do the hack.
*/
next_hop = (struct sockaddr_in *)(fwd_tag + 1);
ip->ip_dst = next_hop->sin_addr;
uh->uh_dport = ntohs(next_hop->sin_port);
/*
* Remove the tag from the packet. We don't need it anymore.
*/
m_tag_delete(m, fwd_tag);
}
#endif
INP_INFO_RLOCK(&V_udbinfo);
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
in_broadcast(ip->ip_dst, ifp)) {
struct inpcb *last;
struct ip_moptions *imo;
last = NULL;
LIST_FOREACH(inp, &V_udb, inp_list) {
if (inp->inp_lport != uh->uh_dport)
continue;
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
if (inp->inp_laddr.s_addr != INADDR_ANY &&
inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
continue;
if (inp->inp_faddr.s_addr != INADDR_ANY &&
inp->inp_faddr.s_addr != ip->ip_src.s_addr)
continue;
/*
* XXX: Do not check source port of incoming datagram
* unless inp_connect() has been called to bind the
* fport part of the 4-tuple; the source could be
* trying to talk to us with an ephemeral port.
*/
if (inp->inp_fport != 0 &&
inp->inp_fport != uh->uh_sport)
continue;
INP_RLOCK(inp);
/*
* Handle socket delivery policy for any-source
* and source-specific multicast. [RFC3678]
*/
imo = inp->inp_moptions;
if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
imo != NULL) {
struct sockaddr_in sin;
struct in_msource *ims;
int blocked, mode;
size_t idx;
bzero(&sin, sizeof(struct sockaddr_in));
sin.sin_len = sizeof(struct sockaddr_in);
sin.sin_family = AF_INET;
sin.sin_addr = ip->ip_dst;
blocked = 0;
idx = imo_match_group(imo, ifp,
(struct sockaddr *)&sin);
if (idx == -1) {
/*
* No group membership for this socket.
* Do not bump udps_noportbcast, as
* this will happen further down.
*/
blocked++;
} else {
/*
* Check for a multicast source filter
* entry on this socket for this group.
* MCAST_EXCLUDE is the default
* behaviour. It means default accept;
* entries, if present, denote sources
* to be excluded from delivery.
*/
ims = imo_match_source(imo, idx,
(struct sockaddr *)&udp_in);
mode = imo->imo_mfilters[idx].imf_fmode;
if ((ims != NULL &&
mode == MCAST_EXCLUDE) ||
(ims == NULL &&
mode == MCAST_INCLUDE)) {
#ifdef DIAGNOSTIC
if (bootverbose) {
printf("%s: blocked by"
" source filter\n",
__func__);
}
#endif
V_udpstat.udps_filtermcast++;
blocked++;
}
}
if (blocked != 0) {
INP_RUNLOCK(inp);
continue;
}
}
if (last != NULL) {
struct mbuf *n;
n = m_copy(m, 0, M_COPYALL);
if (n != NULL)
udp_append(last, ip, n, iphlen +
sizeof(struct udphdr), &udp_in);
INP_RUNLOCK(last);
}
last = inp;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids
* searching through all pcbs in the common case of a
* non-shared port. It assumes that an application
* will never clear these options after setting them.
*/
if ((last->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
if (last == NULL) {
/*
* No matching pcb found; discard datagram. (No need
* to send an ICMP Port Unreachable for a broadcast
* or multicast datagram.)
*/
V_udpstat.udps_noportbcast++;
goto badheadlocked;
}
udp_append(last, ip, m, iphlen + sizeof(struct udphdr),
&udp_in);
INP_RUNLOCK(last);
INP_INFO_RUNLOCK(&V_udbinfo);
return;
}
/*
* Locate pcb for datagram.
*/
inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport,
ip->ip_dst, uh->uh_dport, 1, ifp);
if (inp == NULL) {
if (udp_log_in_vain) {
char buf[4*sizeof "123"];
strcpy(buf, inet_ntoa(ip->ip_dst));
log(LOG_INFO,
"Connection attempt to UDP %s:%d from %s:%d\n",
buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src),
ntohs(uh->uh_sport));
}
V_udpstat.udps_noport++;
if (m->m_flags & (M_BCAST | M_MCAST)) {
V_udpstat.udps_noportbcast++;
goto badheadlocked;
}
if (V_udp_blackhole)
goto badheadlocked;
if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
goto badheadlocked;
*ip = save_ip;
ip->ip_len += iphlen;
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
INP_INFO_RUNLOCK(&V_udbinfo);
return;
}
/*
* Check the minimum TTL for socket.
*/
INP_RLOCK(inp);
INP_INFO_RUNLOCK(&V_udbinfo);
if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
INP_RUNLOCK(inp);
goto badunlocked;
}
udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in);
INP_RUNLOCK(inp);
return;
badheadlocked:
if (inp)
INP_RUNLOCK(inp);
INP_INFO_RUNLOCK(&V_udbinfo);
badunlocked:
m_freem(m);
}
/*
* Notify a udp user of an asynchronous error; just wake up so that they can
* collect error status.
*/
struct inpcb *
udp_notify(struct inpcb *inp, int errno)
{
/*
* While udp_ctlinput() always calls udp_notify() with a read lock
* when invoking it directly, in_pcbnotifyall() currently uses write
* locks due to sharing code with TCP. For now, accept either a read
* or a write lock, but a read lock is sufficient.
*/
INP_LOCK_ASSERT(inp);
inp->inp_socket->so_error = errno;
sorwakeup(inp->inp_socket);
sowwakeup(inp->inp_socket);
return (inp);
}
void
udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
{
+ INIT_VNET_INET(curvnet);
struct ip *ip = vip;
struct udphdr *uh;
struct in_addr faddr;
struct inpcb *inp;
faddr = ((struct sockaddr_in *)sa)->sin_addr;
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
/*
* Redirects don't need to be handled up here.
*/
if (PRC_IS_REDIRECT(cmd))
return;
/*
* Hostdead is ugly because it goes linearly through all PCBs.
*
* XXX: We never get this from ICMP, otherwise it makes an excellent
* DoS attack on machines with many connections.
*/
if (cmd == PRC_HOSTDEAD)
ip = NULL;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
if (ip != NULL) {
uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
INP_INFO_RLOCK(&V_udbinfo);
inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport,
ip->ip_src, uh->uh_sport, 0, NULL);
if (inp != NULL) {
INP_RLOCK(inp);
if (inp->inp_socket != NULL) {
udp_notify(inp, inetctlerrmap[cmd]);
}
INP_RUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_udbinfo);
} else
in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd],
udp_notify);
}
static int
udp_pcblist(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
int error, i, n;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
/*
* The process of preparing the PCB list is too time-consuming and
* resource-intensive to repeat twice on every request.
*/
if (req->oldptr == 0) {
n = V_udbinfo.ipi_count;
req->oldidx = 2 * (sizeof xig)
+ (n + n/8) * sizeof(struct xinpcb);
return (0);
}
if (req->newptr != 0)
return (EPERM);
/*
* OK, now we're committed to doing something.
*/
INP_INFO_RLOCK(&V_udbinfo);
gencnt = V_udbinfo.ipi_gencnt;
n = V_udbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_udbinfo);
error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
+ n * sizeof(struct xinpcb));
if (error != 0)
return (error);
xig.xig_len = sizeof xig;
xig.xig_count = n;
xig.xig_gen = gencnt;
xig.xig_sogen = so_gencnt;
error = SYSCTL_OUT(req, &xig, sizeof xig);
if (error)
return (error);
inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
if (inp_list == 0)
return (ENOMEM);
INP_INFO_RLOCK(&V_udbinfo);
for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
inp = LIST_NEXT(inp, inp_list)) {
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt &&
cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
inp_list[i++] = inp;
INP_RUNLOCK(inp);
}
INP_INFO_RUNLOCK(&V_udbinfo);
n = i;
error = 0;
for (i = 0; i < n; i++) {
inp = inp_list[i];
INP_RLOCK(inp);
if (inp->inp_gencnt <= gencnt) {
struct xinpcb xi;
bzero(&xi, sizeof(xi));
xi.xi_len = sizeof xi;
/* XXX should avoid extra copy */
bcopy(inp, &xi.xi_inp, sizeof *inp);
if (inp->inp_socket)
sotoxsocket(inp->inp_socket, &xi.xi_socket);
xi.xi_inp.inp_gencnt = inp->inp_gencnt;
INP_RUNLOCK(inp);
error = SYSCTL_OUT(req, &xi, sizeof xi);
} else
INP_RUNLOCK(inp);
}
if (!error) {
/*
* Give the user an updated idea of our state. If the
* generation differs from what we told her before, she knows
* that something happened while we were processing this
* request, and it might be necessary to retry.
*/
INP_INFO_RLOCK(&V_udbinfo);
xig.xig_gen = V_udbinfo.ipi_gencnt;
xig.xig_sogen = so_gencnt;
xig.xig_count = V_udbinfo.ipi_count;
INP_INFO_RUNLOCK(&V_udbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
free(inp_list, M_TEMP);
return (error);
}
SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
udp_pcblist, "S,xinpcb", "List of active UDP sockets");
static int
udp_getcred(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
struct xucred xuc;
struct sockaddr_in addrs[2];
struct inpcb *inp;
int error;
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
return (error);
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
INP_INFO_RLOCK(&V_udbinfo);
inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
if (inp != NULL) {
INP_RLOCK(inp);
INP_INFO_RUNLOCK(&V_udbinfo);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
cru2x(inp->inp_socket->so_cred, &xuc);
INP_RUNLOCK(inp);
} else {
INP_INFO_RUNLOCK(&V_udbinfo);
error = ENOENT;
}
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
}
SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
udp_getcred, "S,xucred", "Get the xucred of a UDP connection");
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
struct mbuf *control, struct thread *td)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct udpiphdr *ui;
int len = m->m_pkthdr.len;
struct in_addr faddr, laddr;
struct cmsghdr *cm;
struct sockaddr_in *sin, src;
int error = 0;
int ipflags;
u_short fport, lport;
int unlock_udbinfo;
/*
* udp_output() may need to temporarily bind or connect the current
* inpcb. As such, we don't know up front whether we will need the
* pcbinfo lock or not. Do any work to decide what is needed up
* front before acquiring any locks.
*/
if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
if (control)
m_freem(control);
m_freem(m);
return (EMSGSIZE);
}
src.sin_family = 0;
if (control != NULL) {
/*
* XXX: Currently, we assume all the optional information is
* stored in a single mbuf.
*/
if (control->m_next) {
m_freem(control);
m_freem(m);
return (EINVAL);
}
for (; control->m_len > 0;
control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
cm = mtod(control, struct cmsghdr *);
if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
|| cm->cmsg_len > control->m_len) {
error = EINVAL;
break;
}
if (cm->cmsg_level != IPPROTO_IP)
continue;
switch (cm->cmsg_type) {
case IP_SENDSRCADDR:
if (cm->cmsg_len !=
CMSG_LEN(sizeof(struct in_addr))) {
error = EINVAL;
break;
}
bzero(&src, sizeof(src));
src.sin_family = AF_INET;
src.sin_len = sizeof(src);
src.sin_port = inp->inp_lport;
src.sin_addr =
*(struct in_addr *)CMSG_DATA(cm);
break;
default:
error = ENOPROTOOPT;
break;
}
if (error)
break;
}
m_freem(control);
}
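/*
 * (Sketch) A sender supplies IP_SENDSRCADDR as an IPPROTO_IP-level
 * control message via sendmsg(2), with CMSG_DATA() holding the
 * struct in_addr to use as this datagram's source address.
 */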
if (error) {
m_freem(m);
return (error);
}
/*
* Depending on whether or not the application has bound or connected
* the socket, we may have to do varying levels of work. The optimal
* case is for a connected UDP socket, as a global lock isn't
* required at all.
*
* In order to decide which we need, we require stability of the
* inpcb binding, which we ensure by acquiring a read lock on the
* inpcb. This doesn't strictly follow the lock order, so we play
* the trylock and retry game; note that we may end up with more
* conservative locks than required the second time around, so later
* assertions have to accept that. Further analysis of the number of
* misses under contention is required.
*/
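/*
 * unlock_udbinfo records which locks the send path holds:
 * 2 = udbinfo write lock + inpcb write lock (may bind/connect),
 * 1 = udbinfo read lock + inpcb read lock (binding consulted),
 * 0 = inpcb read lock only (connected fast path).
 */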
sin = (struct sockaddr_in *)addr;
INP_RLOCK(inp);
if (sin != NULL &&
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
INP_RUNLOCK(inp);
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
unlock_udbinfo = 2;
} else if ((sin != NULL && (
(sin->sin_addr.s_addr == INADDR_ANY) ||
(sin->sin_addr.s_addr == INADDR_BROADCAST) ||
(inp->inp_laddr.s_addr == INADDR_ANY) ||
(inp->inp_lport == 0))) ||
(src.sin_family == AF_INET)) {
if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) {
INP_RUNLOCK(inp);
INP_INFO_RLOCK(&V_udbinfo);
INP_RLOCK(inp);
}
unlock_udbinfo = 1;
} else
unlock_udbinfo = 0;
/*
* If the IP_SENDSRCADDR control message was specified, override the
* source address for this datagram. Its use is invalidated if the
* address thus specified is incomplete or clobbers other inpcbs.
*/
laddr = inp->inp_laddr;
lport = inp->inp_lport;
if (src.sin_family == AF_INET) {
INP_INFO_LOCK_ASSERT(&V_udbinfo);
if ((lport == 0) ||
(laddr.s_addr == INADDR_ANY &&
src.sin_addr.s_addr == INADDR_ANY)) {
error = EINVAL;
goto release;
}
error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
&laddr.s_addr, &lport, td->td_ucred);
if (error)
goto release;
}
/*
* If a UDP socket has been connected, then a local address/port will
* have been selected and bound.
*
* If a UDP socket has not been connected to, then an explicit
* destination address must be used, in which case a local
* address/port may not have been selected and bound.
*/
if (sin != NULL) {
INP_LOCK_ASSERT(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
error = EISCONN;
goto release;
}
/*
* Jail may rewrite the destination address, so let it do
* that before we use it.
*/
if (jailed(td->td_ucred))
prison_remote_ip(td->td_ucred, 0,
&sin->sin_addr.s_addr);
/*
* If a local address or port hasn't yet been selected, or if
* the destination address needs to be rewritten due to using
* a special INADDR_ constant, invoke in_pcbconnect_setup()
* to do the heavy lifting. Once a port is selected, we
* commit the binding back to the socket; we also commit the
* binding of the address if in jail.
*
* If we already have a valid binding and we're not
* requesting a destination address rewrite, use a fast path.
*/
if (inp->inp_laddr.s_addr == INADDR_ANY ||
inp->inp_lport == 0 ||
sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST) {
INP_INFO_LOCK_ASSERT(&V_udbinfo);
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
&lport, &faddr.s_addr, &fport, NULL,
td->td_ucred);
if (error)
goto release;
/*
* XXXRW: Why not commit the port if the address is
* !INADDR_ANY?
*/
/* Commit the local port if newly assigned. */
if (inp->inp_laddr.s_addr == INADDR_ANY &&
inp->inp_lport == 0) {
INP_INFO_WLOCK_ASSERT(&V_udbinfo);
INP_WLOCK_ASSERT(inp);
/*
* Remember addr if jailed, to prevent
* rebinding.
*/
if (jailed(td->td_ucred))
inp->inp_laddr = laddr;
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->inp_lport = 0;
error = EAGAIN;
goto release;
}
inp->inp_flags |= INP_ANONPORT;
}
} else {
faddr = sin->sin_addr;
fport = sin->sin_port;
}
} else {
INP_LOCK_ASSERT(inp);
faddr = inp->inp_faddr;
fport = inp->inp_fport;
if (faddr.s_addr == INADDR_ANY) {
error = ENOTCONN;
goto release;
}
}
/*
* Calculate data length and get a mbuf for UDP, IP, and possible
* link-layer headers. Then slide the data pointer forward again past the
* link-layer reservation, since we won't use that space at this layer.
*/
M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
m->m_data += max_linkhdr;
m->m_len -= max_linkhdr;
m->m_pkthdr.len -= max_linkhdr;
/*
* Fill in mbuf with extended UDP header and addresses and length put
* into network format.
*/
ui = mtod(m, struct udpiphdr *);
bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */
ui->ui_pr = IPPROTO_UDP;
ui->ui_src = laddr;
ui->ui_dst = faddr;
ui->ui_sport = lport;
ui->ui_dport = fport;
ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
/*
* Set the Don't Fragment bit in the IP header.
*/
if (inp->inp_flags & INP_DONTFRAG) {
struct ip *ip;
ip = (struct ip *)&ui->ui_i;
ip->ip_off |= IP_DF;
}
ipflags = 0;
if (inp->inp_socket->so_options & SO_DONTROUTE)
ipflags |= IP_ROUTETOIF;
if (inp->inp_socket->so_options & SO_BROADCAST)
ipflags |= IP_ALLOWBROADCAST;
if (inp->inp_flags & INP_ONESBCAST)
ipflags |= IP_SENDONES;
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
/*
* Set up checksum and output datagram.
*/
if (udp_cksum) {
if (inp->inp_flags & INP_ONESBCAST)
faddr.s_addr = INADDR_BROADCAST;
ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP));
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
} else
ui->ui_sum = 0;
((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len;
((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */
((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
V_udpstat.udps_opackets++;
if (unlock_udbinfo == 2)
INP_INFO_WUNLOCK(&V_udbinfo);
else if (unlock_udbinfo == 1)
INP_INFO_RUNLOCK(&V_udbinfo);
error = ip_output(m, inp->inp_options, NULL, ipflags,
inp->inp_moptions, inp);
if (unlock_udbinfo == 2)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
return (error);
release:
if (unlock_udbinfo == 2) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
} else if (unlock_udbinfo == 1) {
INP_RUNLOCK(inp);
INP_INFO_RUNLOCK(&V_udbinfo);
} else
INP_RUNLOCK(inp);
m_freem(m);
return (error);
}
static void
udp_abort(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
soisdisconnected(so);
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
udp_attach(struct socket *so, int proto, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
INP_INFO_WLOCK(&V_udbinfo);
error = in_pcballoc(so, &V_udbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_udbinfo);
inp->inp_vflag |= INP_IPV4;
inp->inp_ip_ttl = V_ip_defttl;
INP_WUNLOCK(inp);
return (0);
}
static int
udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
error = in_pcbbind(inp, nam, td->td_ucred);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
static void
udp_close(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_close: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
soisdisconnected(so);
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
struct sockaddr_in *sin;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (EISCONN);
}
sin = (struct sockaddr_in *)nam;
if (jailed(td->td_ucred))
prison_remote_ip(td->td_ucred, 0, &sin->sin_addr.s_addr);
error = in_pcbconnect(inp, nam, td->td_ucred);
if (error == 0)
soisconnected(so);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
static void
udp_detach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
("udp_detach: not disconnected"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
udp_disconnect(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (inp->inp_faddr.s_addr == INADDR_ANY) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (ENOTCONN);
}
in_pcbdisconnect(inp);
inp->inp_laddr.s_addr = INADDR_ANY;
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
SOCK_UNLOCK(so);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (0);
}
static int
udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
struct mbuf *control, struct thread *td)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_send: inp == NULL"));
return (udp_output(inp, m, addr, control, td));
}
int
udp_shutdown(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
INP_WLOCK(inp);
socantsendmore(so);
INP_WUNLOCK(inp);
return (0);
}
struct pr_usrreqs udp_usrreqs = {
.pru_abort = udp_abort,
.pru_attach = udp_attach,
.pru_bind = udp_bind,
.pru_connect = udp_connect,
.pru_control = in_control,
.pru_detach = udp_detach,
.pru_disconnect = udp_disconnect,
.pru_peeraddr = in_getpeeraddr,
.pru_send = udp_send,
.pru_soreceive = soreceive_dgram,
.pru_sosend = sosend_dgram,
.pru_shutdown = udp_shutdown,
.pru_sockaddr = in_getsockaddr,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = udp_close,
};
Index: head/sys/netinet/vinet.h
===================================================================
--- head/sys/netinet/vinet.h (nonexistent)
+++ head/sys/netinet/vinet.h (revision 183550)
@@ -0,0 +1,331 @@
+/*-
+ * Copyright (c) 2006-2008 University of Zagreb
+ * Copyright (c) 2006-2008 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_VINET_H_
+#define _NETINET_VINET_H_
+
+#ifdef VIMAGE
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/md5.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
+#include <netinet/igmp_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_hostcache.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+
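+/*
+ * Per-vnet instances of formerly global netinet state; each virtual
+ * network stack gets its own copy of this structure when the kernel is
+ * built with options VIMAGE.
+ */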
+struct vnet_inet {
+ struct in_ifaddrhashhead *_in_ifaddrhashtbl;
+ struct in_ifaddrhead _in_ifaddrhead;
+ u_long _in_ifaddrhmask;
+ struct in_multihead _in_multihead;
+
+ int _arpt_keep;
+ int _arp_maxtries;
+ int _useloopback;
+ int _arp_proxyall;
+ int _subnetsarelocal;
+ int _sameprefixcarponly;
+
+ int _ipforwarding;
+ int _ipstealth;
+ int _ipfastforward_active;
+ int _ipsendredirects;
+ int _ip_defttl;
+ int _ip_keepfaith;
+ int _ip_sendsourcequench;
+ int _ip_do_randomid;
+ int _ip_checkinterface;
+ u_short _ip_id;
+
+ uma_zone_t _ipq_zone;
+ int _nipq; /* Total # of reass queues */
+ int _maxnipq; /* Admin. limit on # reass queues. */
+ int _maxfragsperpacket;
+ TAILQ_HEAD(ipqhead, ipq) _ipq[IPREASS_NHASH];
+
+ struct inpcbhead _tcb; /* head of queue of active tcpcb's */
+ struct inpcbinfo _tcbinfo;
+ struct tcpstat _tcpstat; /* tcp statistics */
+ struct tcp_hostcache _tcp_hostcache;
+ struct callout _tcp_hc_callout;
+ struct tcp_syncache _tcp_syncache;
+ struct inpcbhead _divcb;
+ struct inpcbinfo _divcbinfo;
+ TAILQ_HEAD(, tcptw) _twq_2msl;
+
+ int _tcp_sc_rst_sock_fail;
+ int _tcp_mssdflt;
+ int _tcp_v6mssdflt;
+ int _tcp_minmss;
+ int _tcp_do_rfc1323;
+ int _icmp_may_rst;
+ int _tcp_isn_reseed_interval;
+ int _tcp_inflight_enable;
+ int _tcp_inflight_rttthresh;
+ int _tcp_inflight_min;
+ int _tcp_inflight_max;
+ int _tcp_inflight_stab;
+ int _nolocaltimewait;
+ int _path_mtu_discovery;
+ int _ss_fltsz;
+ int _ss_fltsz_local;
+ int _tcp_do_newreno;
+ int _tcp_do_tso;
+ int _tcp_do_autosndbuf;
+ int _tcp_autosndbuf_inc;
+ int _tcp_autosndbuf_max;
+ int _tcp_do_sack;
+ int _tcp_sack_maxholes;
+ int _tcp_sack_globalmaxholes;
+ int _tcp_sack_globalholes;
+ int _blackhole;
+ int _tcp_delack_enabled;
+ int _drop_synfin;
+ int _tcp_do_rfc3042;
+ int _tcp_do_rfc3390;
+ int _tcp_do_ecn;
+ int _tcp_ecn_maxretries;
+ int _tcp_insecure_rst;
+ int _tcp_do_autorcvbuf;
+ int _tcp_autorcvbuf_inc;
+ int _tcp_autorcvbuf_max;
+ int _tcp_reass_maxseg;
+ int _tcp_reass_qsize;
+ int _tcp_reass_maxqlen;
+ int _tcp_reass_overflows;
+
+ u_char _isn_secret[32];
+ int _isn_last_reseed;
+ u_int32_t _isn_offset;
+ u_int32_t _isn_offset_old;
+ MD5_CTX _isn_ctx;
+
+ struct inpcbhead _udb;
+ struct inpcbinfo _udbinfo;
+ struct udpstat _udpstat;
+ int _udp_blackhole;
+
+ struct inpcbhead _ripcb;
+ struct inpcbinfo _ripcbinfo;
+ struct socket *_ip_mrouter;
+
+ struct socket *_ip_rsvpd;
+ int _ip_rsvp_on;
+ int _rsvp_on;
+
+ struct icmpstat _icmpstat;
+ struct ipstat _ipstat;
+ struct igmpstat _igmpstat;
+
+ SLIST_HEAD(, router_info) _router_info_head;
+
+ int _rtq_timeout;
+ int _rtq_reallyold;
+ int _rtq_minreallyold;
+ int _rtq_toomany;
+ struct callout _rtq_timer;
+
+ int _ipport_lowfirstauto;
+ int _ipport_lowlastauto;
+ int _ipport_firstauto;
+ int _ipport_lastauto;
+ int _ipport_hifirstauto;
+ int _ipport_hilastauto;
+ int _ipport_reservedhigh;
+ int _ipport_reservedlow;
+ int _ipport_randomized;
+ int _ipport_randomcps;
+ int _ipport_randomtime;
+ int _ipport_stoprandom;
+ int _ipport_tcpallocs;
+ int _ipport_tcplastcount;
+
+ int _icmpmaskrepl;
+ u_int _icmpmaskfake;
+ int _drop_redirect;
+ int _log_redirect;
+ int _icmplim;
+ int _icmplim_output;
+ char _reply_src[IFNAMSIZ];
+ int _icmp_rfi;
+ int _icmp_quotelen;
+ int _icmpbmcastecho;
+};
+#endif
+
+
+/*
+ * Symbol translation macros
+ */
+#define INIT_VNET_INET(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_INET, struct vnet_inet, vnet_inet)
+
+#define VNET_INET(sym) VSYM(vnet_inet, sym)
+
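+/*
+ * Usage sketch (illustrative, not part of this change): a function that
+ * touches virtualized INET state first resolves its per-vnet context,
+ * after which each V_ name defined below refers to that vnet's copy:
+ *
+ *	INIT_VNET_INET(curvnet);
+ *	if (V_ipforwarding)		... -> VSYM(vnet_inet, ipforwarding)
+ *		... forward the packet ...
+ */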
+#define V_arp_maxtries VNET_INET(arp_maxtries)
+#define V_arp_proxyall VNET_INET(arp_proxyall)
+#define V_arpt_keep VNET_INET(arpt_keep)
+#define V_arpt_prune VNET_INET(arpt_prune)
+#define V_blackhole VNET_INET(blackhole)
+#define V_divcb VNET_INET(divcb)
+#define V_divcbinfo VNET_INET(divcbinfo)
+#define V_drop_redirect VNET_INET(drop_redirect)
+#define V_drop_synfin VNET_INET(drop_synfin)
+#define V_icmp_may_rst VNET_INET(icmp_may_rst)
+#define V_icmp_quotelen VNET_INET(icmp_quotelen)
+#define V_icmp_rfi VNET_INET(icmp_rfi)
+#define V_icmpbmcastecho VNET_INET(icmpbmcastecho)
+#define V_icmplim VNET_INET(icmplim)
+#define V_icmplim_output VNET_INET(icmplim_output)
+#define V_icmpmaskfake VNET_INET(icmpmaskfake)
+#define V_icmpmaskrepl VNET_INET(icmpmaskrepl)
+#define V_icmpstat VNET_INET(icmpstat)
+#define V_igmpstat VNET_INET(igmpstat)
+#define V_in_ifaddrhashtbl VNET_INET(in_ifaddrhashtbl)
+#define V_in_ifaddrhead VNET_INET(in_ifaddrhead)
+#define V_in_ifaddrhmask VNET_INET(in_ifaddrhmask)
+#define V_in_multihead VNET_INET(in_multihead)
+#define V_ip_checkinterface VNET_INET(ip_checkinterface)
+#define V_ip_defttl VNET_INET(ip_defttl)
+#define V_ip_do_randomid VNET_INET(ip_do_randomid)
+#define V_ip_id VNET_INET(ip_id)
+#define V_ip_keepfaith VNET_INET(ip_keepfaith)
+#define V_ip_mrouter VNET_INET(ip_mrouter)
+#define V_ip_rsvp_on VNET_INET(ip_rsvp_on)
+#define V_ip_rsvpd VNET_INET(ip_rsvpd)
+#define V_ip_sendsourcequench VNET_INET(ip_sendsourcequench)
+#define V_ipfastforward_active VNET_INET(ipfastforward_active)
+#define V_ipforwarding VNET_INET(ipforwarding)
+#define V_ipport_firstauto VNET_INET(ipport_firstauto)
+#define V_ipport_hifirstauto VNET_INET(ipport_hifirstauto)
+#define V_ipport_hilastauto VNET_INET(ipport_hilastauto)
+#define V_ipport_lastauto VNET_INET(ipport_lastauto)
+#define V_ipport_lowfirstauto VNET_INET(ipport_lowfirstauto)
+#define V_ipport_lowlastauto VNET_INET(ipport_lowlastauto)
+#define V_ipport_randomcps VNET_INET(ipport_randomcps)
+#define V_ipport_randomized VNET_INET(ipport_randomized)
+#define V_ipport_randomtime VNET_INET(ipport_randomtime)
+#define V_ipport_reservedhigh VNET_INET(ipport_reservedhigh)
+#define V_ipport_reservedlow VNET_INET(ipport_reservedlow)
+#define V_ipport_stoprandom VNET_INET(ipport_stoprandom)
+#define V_ipport_tcpallocs VNET_INET(ipport_tcpallocs)
+#define V_ipport_tcplastcount VNET_INET(ipport_tcplastcount)
+#define V_ipq VNET_INET(ipq)
+#define V_ipq_zone VNET_INET(ipq_zone)
+#define V_ipsendredirects VNET_INET(ipsendredirects)
+#define V_ipstat VNET_INET(ipstat)
+#define V_ipstealth VNET_INET(ipstealth)
+#define V_isn_ctx VNET_INET(isn_ctx)
+#define V_isn_last_reseed VNET_INET(isn_last_reseed)
+#define V_isn_offset VNET_INET(isn_offset)
+#define V_isn_offset_old VNET_INET(isn_offset_old)
+#define V_isn_secret VNET_INET(isn_secret)
+#define V_llinfo_arp VNET_INET(llinfo_arp)
+#define V_log_redirect VNET_INET(log_redirect)
+#define V_maxfragsperpacket VNET_INET(maxfragsperpacket)
+#define V_maxnipq VNET_INET(maxnipq)
+#define V_nipq VNET_INET(nipq)
+#define V_nolocaltimewait VNET_INET(nolocaltimewait)
+#define V_path_mtu_discovery VNET_INET(path_mtu_discovery)
+#define V_reply_src VNET_INET(reply_src)
+#define V_ripcb VNET_INET(ripcb)
+#define V_ripcbinfo VNET_INET(ripcbinfo)
+#define V_router_info_head VNET_INET(router_info_head)
+#define V_rsvp_on VNET_INET(rsvp_on)
+#define V_rtq_minreallyold VNET_INET(rtq_minreallyold)
+#define V_rtq_reallyold VNET_INET(rtq_reallyold)
+#define V_rtq_timeout VNET_INET(rtq_timeout)
+#define V_rtq_timer VNET_INET(rtq_timer)
+#define V_rtq_toomany VNET_INET(rtq_toomany)
+#define V_sameprefixcarponly VNET_INET(sameprefixcarponly)
+#define V_ss_fltsz VNET_INET(ss_fltsz)
+#define V_ss_fltsz_local VNET_INET(ss_fltsz_local)
+#define V_subnetsarelocal VNET_INET(subnetsarelocal)
+#define V_tcb VNET_INET(tcb)
+#define V_tcbinfo VNET_INET(tcbinfo)
+#define V_tcp_autorcvbuf_inc VNET_INET(tcp_autorcvbuf_inc)
+#define V_tcp_autorcvbuf_max VNET_INET(tcp_autorcvbuf_max)
+#define V_tcp_autosndbuf_inc VNET_INET(tcp_autosndbuf_inc)
+#define V_tcp_autosndbuf_max VNET_INET(tcp_autosndbuf_max)
+#define V_tcp_delack_enabled VNET_INET(tcp_delack_enabled)
+#define V_tcp_do_autorcvbuf VNET_INET(tcp_do_autorcvbuf)
+#define V_tcp_do_autosndbuf VNET_INET(tcp_do_autosndbuf)
+#define V_tcp_do_ecn VNET_INET(tcp_do_ecn)
+#define V_tcp_do_newreno VNET_INET(tcp_do_newreno)
+#define V_tcp_do_rfc1323 VNET_INET(tcp_do_rfc1323)
+#define V_tcp_do_rfc3042 VNET_INET(tcp_do_rfc3042)
+#define V_tcp_do_rfc3390 VNET_INET(tcp_do_rfc3390)
+#define V_tcp_do_sack VNET_INET(tcp_do_sack)
+#define V_tcp_do_tso VNET_INET(tcp_do_tso)
+#define V_tcp_ecn_maxretries VNET_INET(tcp_ecn_maxretries)
+#define V_tcp_hc_callout VNET_INET(tcp_hc_callout)
+#define V_tcp_hostcache VNET_INET(tcp_hostcache)
+#define V_tcp_inflight_enable VNET_INET(tcp_inflight_enable)
+#define V_tcp_inflight_max VNET_INET(tcp_inflight_max)
+#define V_tcp_inflight_min VNET_INET(tcp_inflight_min)
+#define V_tcp_inflight_rttthresh VNET_INET(tcp_inflight_rttthresh)
+#define V_tcp_inflight_stab VNET_INET(tcp_inflight_stab)
+#define V_tcp_insecure_rst VNET_INET(tcp_insecure_rst)
+#define V_tcp_isn_reseed_interval VNET_INET(tcp_isn_reseed_interval)
+#define V_tcp_minmss VNET_INET(tcp_minmss)
+#define V_tcp_mssdflt VNET_INET(tcp_mssdflt)
+#define V_tcp_reass_maxqlen VNET_INET(tcp_reass_maxqlen)
+#define V_tcp_reass_maxseg VNET_INET(tcp_reass_maxseg)
+#define V_tcp_reass_overflows VNET_INET(tcp_reass_overflows)
+#define V_tcp_reass_qsize VNET_INET(tcp_reass_qsize)
+#define V_tcp_sack_globalholes VNET_INET(tcp_sack_globalholes)
+#define V_tcp_sack_globalmaxholes VNET_INET(tcp_sack_globalmaxholes)
+#define V_tcp_sack_maxholes VNET_INET(tcp_sack_maxholes)
+#define V_tcp_sc_rst_sock_fail VNET_INET(tcp_sc_rst_sock_fail)
+#define V_tcp_syncache VNET_INET(tcp_syncache)
+#define V_tcp_v6mssdflt VNET_INET(tcp_v6mssdflt)
+#define V_tcpstat VNET_INET(tcpstat)
+#define V_twq_2msl VNET_INET(twq_2msl)
+#define V_udb VNET_INET(udb)
+#define V_udbinfo VNET_INET(udbinfo)
+#define V_udp_blackhole VNET_INET(udp_blackhole)
+#define V_udpstat VNET_INET(udpstat)
+#define V_useloopback VNET_INET(useloopback)
+
+#endif /* !_NETINET_VINET_H_ */
Property changes on: head/sys/netinet/vinet.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/sys/netinet6/dest6.c
===================================================================
--- head/sys/netinet6/dest6.c (revision 183549)
+++ head/sys/netinet6/dest6.c (revision 183550)
@@ -1,124 +1,125 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: dest6.c,v 1.59 2003/07/11 13:21:16 t-momose Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
/*
* Destination options header processing.
*/
int
dest6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
int off = *offp, dstoptlen, optlen;
struct ip6_dest *dstopts;
u_int8_t *opt;
/* validation of the length of the header */
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(*dstopts), IPPROTO_DONE);
dstopts = (struct ip6_dest *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(dstopts, struct ip6_dest *, m, off, sizeof(*dstopts));
if (dstopts == NULL)
return IPPROTO_DONE;
#endif
dstoptlen = (dstopts->ip6d_len + 1) << 3;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, dstoptlen, IPPROTO_DONE);
dstopts = (struct ip6_dest *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(dstopts, struct ip6_dest *, m, off, dstoptlen);
if (dstopts == NULL)
return IPPROTO_DONE;
#endif
off += dstoptlen;
dstoptlen -= sizeof(struct ip6_dest);
opt = (u_int8_t *)dstopts + sizeof(struct ip6_dest);
/* search header for all options. */
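/*
 * Options are TLV-encoded: Pad1 is a single zero byte, while every other
 * option is a type byte, a length byte and that many bytes of data, so
 * e.g. a PadN option with a length byte of 4 occupies 6 bytes in total.
 */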
for (optlen = 0; dstoptlen > 0; dstoptlen -= optlen, opt += optlen) {
if (*opt != IP6OPT_PAD1 &&
(dstoptlen < IP6OPT_MINLEN || *(opt + 1) + 2 > dstoptlen)) {
V_ip6stat.ip6s_toosmall++;
goto bad;
}
switch (*opt) {
case IP6OPT_PAD1:
optlen = 1;
break;
case IP6OPT_PADN:
optlen = *(opt + 1) + 2;
break;
default: /* unknown option */
optlen = ip6_unknown_opt(opt, m,
opt - mtod(m, u_int8_t *));
if (optlen == -1)
return (IPPROTO_DONE);
optlen += 2;
break;
}
}
*offp = off;
return (dstopts->ip6d_nxt);
bad:
m_freem(m);
return (IPPROTO_DONE);
}
Index: head/sys/netinet6/frag6.c
===================================================================
--- head/sys/netinet6/frag6.c (revision 183549)
+++ head/sys/netinet6/frag6.c (revision 183550)
@@ -1,744 +1,764 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet/in_systm.h> /* for ECN definitions */
#include <netinet/ip.h> /* for ECN definitions */
/*
* Define it to get correct behavior for per-interface statistics.
* You will need to perform an extra routing table lookup, per fragment,
* to do it. This may or may not be a performance hit.
*/
#define IN6_IFSTAT_STRICT
static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
static void frag6_deq(struct ip6asfrag *);
static void frag6_insque(struct ip6q *, struct ip6q *);
static void frag6_remque(struct ip6q *);
static void frag6_freef(struct ip6q *);
static struct mtx ip6qlock;
/*
* These fields all protected by ip6qlock.
*/
static u_int frag6_nfragpackets;
static u_int frag6_nfrags;
static struct ip6q ip6q; /* ip6 reassemble queue */
#define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
#define IP6Q_LOCK() mtx_lock(&ip6qlock)
#define IP6Q_TRYLOCK() mtx_trylock(&ip6qlock)
#define IP6Q_LOCK_ASSERT() mtx_assert(&ip6qlock, MA_OWNED)
#define IP6Q_UNLOCK() mtx_unlock(&ip6qlock)
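/*
 * The queue-manipulation helpers below (frag6_enq/deq, frag6_insque/remque
 * and frag6_freef) all assert that this lock is held.
 */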
static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");
/*
* Initialise reassembly queue and fragment identifier.
*/
static void
frag6_change(void *tag)
{
+ INIT_VNET_INET6(curvnet);
V_ip6_maxfragpackets = nmbclusters / 4;
V_ip6_maxfrags = nmbclusters / 4;
}
void
frag6_init(void)
{
+ INIT_VNET_INET6(curvnet);
V_ip6_maxfragpackets = nmbclusters / 4;
V_ip6_maxfrags = nmbclusters / 4;
EVENTHANDLER_REGISTER(nmbclusters_change,
frag6_change, NULL, EVENTHANDLER_PRI_ANY);
IP6Q_LOCK_INIT();
V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
}
/*
* In RFC2460, the fragment and reassembly rules do not agree with each
* other in terms of next header field handling in the fragment header.
* While the sender will use the same value for all of the fragmented
* packets, the receiver is advised not to check for consistency.
*
* fragment rule (p20):
* (2) A Fragment header containing:
* The Next Header value that identifies the first header of
* the Fragmentable Part of the original packet.
* -> next header field is same for all fragments
*
* reassembly rule (p21):
* The Next Header field of the last header of the Unfragmentable
* Part is obtained from the Next Header field of the first
* fragment's Fragment header.
* -> should grab it from the first fragment only
*
* The following note also contradicts the fragment rule - no one is
* going to send different fragments with different next header fields.
*
* additional note (p22):
* The Next Header values in the Fragment headers of different
* fragments of the same original packet may differ. Only the value
* from the Offset zero fragment packet is used for reassembly.
* -> should grab it from the first fragment only
*
* There is no explicit reason given in the RFC. Historical reason maybe?
*/
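/*
 * Concretely: if the offset-zero fragment carries ip6f_nxt == IPPROTO_TCP
 * and a later fragment of the same packet carries some other value, the
 * reassembled packet's next header is taken from the offset-zero fragment
 * (IPPROTO_TCP here) and the other values are simply ignored.
 */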
/*
* Fragment input
*/
int
frag6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp, *t;
struct ip6_hdr *ip6;
struct ip6_frag *ip6f;
struct ip6q *q6;
struct ip6asfrag *af6, *ip6af, *af6dwn;
#ifdef IN6_IFSTAT_STRICT
struct in6_ifaddr *ia;
#endif
int offset = *offp, nxt, i, next;
int first_frag = 0;
int fragoff, frgpartlen; /* must be larger than u_int16_t */
struct ifnet *dstifp;
u_int8_t ecn, ecn0;
#if 0
char ip6buf[INET6_ADDRSTRLEN];
#endif
ip6 = mtod(m, struct ip6_hdr *);
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), IPPROTO_DONE);
ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset);
#else
IP6_EXTHDR_GET(ip6f, struct ip6_frag *, m, offset, sizeof(*ip6f));
if (ip6f == NULL)
return (IPPROTO_DONE);
#endif
dstifp = NULL;
#ifdef IN6_IFSTAT_STRICT
/* find the destination interface of the packet. */
if ((ia = ip6_getdstifaddr(m)) != NULL)
dstifp = ia->ia_ifp;
#else
/* we are violating the spec, this is not the destination interface */
if ((m->m_flags & M_PKTHDR) != 0)
dstifp = m->m_pkthdr.rcvif;
#endif
/* jumbo payload can't contain a fragment header */
if (ip6->ip6_plen == 0) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset);
in6_ifstat_inc(dstifp, ifs6_reass_fail);
return IPPROTO_DONE;
}
/*
* check whether the fragment packet's fragment length is
* a multiple of 8 octets.
* sizeof(struct ip6_frag) == 8
* sizeof(struct ip6_hdr) == 40
*/
if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) &&
(((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offsetof(struct ip6_hdr, ip6_plen));
in6_ifstat_inc(dstifp, ifs6_reass_fail);
return IPPROTO_DONE;
}
V_ip6stat.ip6s_fragments++;
in6_ifstat_inc(dstifp, ifs6_reass_reqd);
/* offset now points to data portion */
offset += sizeof(struct ip6_frag);
IP6Q_LOCK();
/*
* Enforce upper bound on number of fragments.
* If maxfrag is 0, never accept fragments.
* If maxfrag is -1, accept all fragments without limitation.
*/
if (V_ip6_maxfrags < 0)
;
else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
goto dropfrag;
for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
if (ip6f->ip6f_ident == q6->ip6q_ident &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst))
break;
if (q6 == &V_ip6q) {
/*
* This is the first fragment to arrive; create a reassembly queue.
*/
first_frag = 1;
/*
* Enforce upper bound on number of fragmented packets
* for which we attempt reassembly;
* If maxfragpackets is 0, never accept fragments.
* If maxfragpackets is -1, accept all fragments without
* limitation.
*/
if (V_ip6_maxfragpackets < 0)
;
else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
goto dropfrag;
V_frag6_nfragpackets++;
q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
M_NOWAIT);
if (q6 == NULL)
goto dropfrag;
bzero(q6, sizeof(*q6));
frag6_insque(q6, &V_ip6q);
/* ip6q_nxt will be filled afterwards, from 1st fragment */
q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6;
#ifdef notyet
q6->ip6q_nxtp = (u_char *)nxtp;
#endif
q6->ip6q_ident = ip6f->ip6f_ident;
q6->ip6q_ttl = IPV6_FRAGTTL;
q6->ip6q_src = ip6->ip6_src;
q6->ip6q_dst = ip6->ip6_dst;
q6->ip6q_ecn =
(ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. */
q6->ip6q_nfrag = 0;
}
/*
* If it's the 1st fragment, record the length of the
* unfragmentable part and the next header of the fragment header.
*/
fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
if (fragoff == 0) {
q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) -
sizeof(struct ip6_frag);
q6->ip6q_nxt = ip6f->ip6f_nxt;
}
/*
* Check that the reassembled packet would not exceed 65535 bytes
* in size.
* If it would exceed, discard the fragment and return an ICMP error.
*/
frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
if (q6->ip6q_unfrglen >= 0) {
/* The 1st fragment has already arrived. */
if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
IP6Q_UNLOCK();
return (IPPROTO_DONE);
}
} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
offset - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
IP6Q_UNLOCK();
return (IPPROTO_DONE);
}
/*
* If it's the first fragment, do the above check for each
* fragment already stored in the reassembly queue.
*/
if (fragoff == 0) {
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = af6dwn) {
af6dwn = af6->ip6af_down;
if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen >
IPV6_MAXPACKET) {
struct mbuf *merr = IP6_REASS_MBUF(af6);
struct ip6_hdr *ip6err;
int erroff = af6->ip6af_offset;
/* dequeue the fragment. */
frag6_deq(af6);
free(af6, M_FTABLE);
/* adjust pointer. */
ip6err = mtod(merr, struct ip6_hdr *);
/*
* Restore source and destination addresses
* in the erroneous IPv6 header.
*/
ip6err->ip6_src = q6->ip6q_src;
ip6err->ip6_dst = q6->ip6q_dst;
icmp6_error(merr, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff - sizeof(struct ip6_frag) +
offsetof(struct ip6_frag, ip6f_offlg));
}
}
}
ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FTABLE,
M_NOWAIT);
if (ip6af == NULL)
goto dropfrag;
bzero(ip6af, sizeof(*ip6af));
ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG;
ip6af->ip6af_off = fragoff;
ip6af->ip6af_frglen = frgpartlen;
ip6af->ip6af_offset = offset;
IP6_REASS_MBUF(ip6af) = m;
if (first_frag) {
af6 = (struct ip6asfrag *)q6;
goto insert;
}
/*
* Handle ECN by comparing this segment with the first one;
* if CE is set, do not lose CE.
* drop if CE and not-ECT are mixed for the same packet.
*/
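/*
 * In other words (this fragment vs. the first fragment):
 *	CE      vs. not-ECT  -> drop
 *	CE      vs. ECT/CE   -> mark the whole packet CE
 *	not-ECT vs. ECT/CE   -> drop
 */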
ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
ecn0 = q6->ip6q_ecn;
if (ecn == IPTOS_ECN_CE) {
if (ecn0 == IPTOS_ECN_NOTECT) {
free(ip6af, M_FTABLE);
goto dropfrag;
}
if (ecn0 != IPTOS_ECN_CE)
q6->ip6q_ecn = IPTOS_ECN_CE;
}
if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
free(ip6af, M_FTABLE);
goto dropfrag;
}
/*
* Find a segment which begins after this one does.
*/
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = af6->ip6af_down)
if (af6->ip6af_off > ip6af->ip6af_off)
break;
#if 0
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
if (af6->ip6af_up != (struct ip6asfrag *)q6) {
i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen
- ip6af->ip6af_off;
if (i > 0) {
if (i >= ip6af->ip6af_frglen)
goto dropfrag;
m_adj(IP6_REASS_MBUF(ip6af), i);
ip6af->ip6af_off += i;
ip6af->ip6af_frglen -= i;
}
}
/*
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
while (af6 != (struct ip6asfrag *)q6 &&
ip6af->ip6af_off + ip6af->ip6af_frglen > af6->ip6af_off) {
i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off;
if (i < af6->ip6af_frglen) {
af6->ip6af_frglen -= i;
af6->ip6af_off += i;
m_adj(IP6_REASS_MBUF(af6), i);
break;
}
af6 = af6->ip6af_down;
m_freem(IP6_REASS_MBUF(af6->ip6af_up));
frag6_deq(af6->ip6af_up);
}
#else
/*
* If the incoming fragment overlaps some existing fragments in
* the reassembly queue, drop it, since it is dangerous to override
* existing fragments from a security point of view.
* We don't know which fragment is the bad guy - here we simply trust
* the fragment that came in earlier, with no real reason.
*
* Note: due to changes made after this part was disabled, the mbuf
* passed to m_adj() below no longer meets the requirement.
*/
if (af6->ip6af_up != (struct ip6asfrag *)q6) {
i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen
- ip6af->ip6af_off;
if (i > 0) {
#if 0 /* suppress the noisy log */
log(LOG_ERR, "%d bytes of a fragment from %s "
"overlaps the previous fragment\n",
i, ip6_sprintf(ip6buf, &q6->ip6q_src));
#endif
free(ip6af, M_FTABLE);
goto dropfrag;
}
}
if (af6 != (struct ip6asfrag *)q6) {
i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off;
if (i > 0) {
#if 0 /* suppress the noisy log */
log(LOG_ERR, "%d bytes of a fragment from %s "
"overlaps the succeeding fragment",
i, ip6_sprintf(ip6buf, &q6->ip6q_src));
#endif
free(ip6af, M_FTABLE);
goto dropfrag;
}
}
#endif
insert:
/*
* Stick new segment in its place;
* check for complete reassembly.
* Move to front of packet queue, as we are
* the most recently active fragmented packet.
*/
frag6_enq(ip6af, af6->ip6af_up);
V_frag6_nfrags++;
q6->ip6q_nfrag++;
#if 0 /* xxx */
if (q6 != V_ip6q.ip6q_next) {
frag6_remque(q6);
frag6_insque(q6, &V_ip6q);
}
#endif
next = 0;
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = af6->ip6af_down) {
if (af6->ip6af_off != next) {
IP6Q_UNLOCK();
return IPPROTO_DONE;
}
next += af6->ip6af_frglen;
}
if (af6->ip6af_up->ip6af_mff) {
IP6Q_UNLOCK();
return IPPROTO_DONE;
}
/*
* Reassembly is complete; concatenate fragments.
*/
ip6af = q6->ip6q_down;
t = m = IP6_REASS_MBUF(ip6af);
af6 = ip6af->ip6af_down;
frag6_deq(ip6af);
while (af6 != (struct ip6asfrag *)q6) {
af6dwn = af6->ip6af_down;
frag6_deq(af6);
while (t->m_next)
t = t->m_next;
t->m_next = IP6_REASS_MBUF(af6);
m_adj(t->m_next, af6->ip6af_offset);
free(af6, M_FTABLE);
af6 = af6dwn;
}
/* adjust offset to point where the original next header starts */
offset = ip6af->ip6af_offset - sizeof(struct ip6_frag);
free(ip6af, M_FTABLE);
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr));
if (q6->ip6q_ecn == IPTOS_ECN_CE)
ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20);
nxt = q6->ip6q_nxt;
#ifdef notyet
*q6->ip6q_nxtp = (u_char)(nxt & 0xff);
#endif
/* Delete frag6 header */
if (m->m_len >= offset + sizeof(struct ip6_frag)) {
/* This is the only possible case with !PULLDOWN_TEST */
ovbcopy((caddr_t)ip6, (caddr_t)ip6 + sizeof(struct ip6_frag),
offset);
m->m_data += sizeof(struct ip6_frag);
m->m_len -= sizeof(struct ip6_frag);
} else {
/* this comes with no copy if the boundary is on a cluster */
if ((t = m_split(m, offset, M_DONTWAIT)) == NULL) {
frag6_remque(q6);
V_frag6_nfrags -= q6->ip6q_nfrag;
free(q6, M_FTABLE);
V_frag6_nfragpackets--;
goto dropfrag;
}
m_adj(t, sizeof(struct ip6_frag));
m_cat(m, t);
}
/*
* Store NXT to the original.
*/
{
char *prvnxtp = ip6_get_prevhdr(m, offset); /* XXX */
*prvnxtp = nxt;
}
frag6_remque(q6);
V_frag6_nfrags -= q6->ip6q_nfrag;
free(q6, M_FTABLE);
V_frag6_nfragpackets--;
if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
int plen = 0;
for (t = m; t; t = t->m_next)
plen += t->m_len;
m->m_pkthdr.len = plen;
}
V_ip6stat.ip6s_reassembled++;
in6_ifstat_inc(dstifp, ifs6_reass_ok);
/*
* Tell launch routine the next header
*/
*mp = m;
*offp = offset;
IP6Q_UNLOCK();
return nxt;
dropfrag:
IP6Q_UNLOCK();
in6_ifstat_inc(dstifp, ifs6_reass_fail);
V_ip6stat.ip6s_fragdropped++;
m_freem(m);
return IPPROTO_DONE;
}
/*
* Free a fragment reassembly header and all
* associated datagrams.
*/
void
frag6_freef(struct ip6q *q6)
{
+ INIT_VNET_INET6(curvnet);
struct ip6asfrag *af6, *down6;
IP6Q_LOCK_ASSERT();
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
af6 = down6) {
struct mbuf *m = IP6_REASS_MBUF(af6);
down6 = af6->ip6af_down;
frag6_deq(af6);
/*
* Return ICMP time exceeded error for the 1st fragment.
* Just free other fragments.
*/
if (af6->ip6af_off == 0) {
struct ip6_hdr *ip6;
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* restore source and destination addresses */
ip6->ip6_src = q6->ip6q_src;
ip6->ip6_dst = q6->ip6q_dst;
icmp6_error(m, ICMP6_TIME_EXCEEDED,
ICMP6_TIME_EXCEED_REASSEMBLY, 0);
} else
m_freem(m);
free(af6, M_FTABLE);
}
frag6_remque(q6);
V_frag6_nfrags -= q6->ip6q_nfrag;
free(q6, M_FTABLE);
V_frag6_nfragpackets--;
}
/*
* Put an ip fragment on a reassembly chain.
* Like insque, but pointers in middle of structure.
*/
void
frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
{
IP6Q_LOCK_ASSERT();
af6->ip6af_up = up6;
af6->ip6af_down = up6->ip6af_down;
up6->ip6af_down->ip6af_up = af6;
up6->ip6af_down = af6;
}
/*
* To frag6_enq as remque is to insque.
*/
void
frag6_deq(struct ip6asfrag *af6)
{
IP6Q_LOCK_ASSERT();
af6->ip6af_up->ip6af_down = af6->ip6af_down;
af6->ip6af_down->ip6af_up = af6->ip6af_up;
}
void
frag6_insque(struct ip6q *new, struct ip6q *old)
{
IP6Q_LOCK_ASSERT();
new->ip6q_prev = old;
new->ip6q_next = old->ip6q_next;
old->ip6q_next->ip6q_prev= new;
old->ip6q_next = new;
}
void
frag6_remque(struct ip6q *p6)
{
IP6Q_LOCK_ASSERT();
p6->ip6q_prev->ip6q_next = p6->ip6q_next;
p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
}
/*
* IPv6 reassembly timer processing;
* if a timer expires on a reassembly
* queue, discard it.
*/
void
frag6_slowtimo(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
struct ip6q *q6;
IP6Q_LOCK();
- q6 = V_ip6q.ip6q_next;
- if (q6)
- while (q6 != &V_ip6q) {
- --q6->ip6q_ttl;
- q6 = q6->ip6q_next;
- if (q6->ip6q_prev->ip6q_ttl == 0) {
- V_ip6stat.ip6s_fragtimeout++;
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(q6->ip6q_prev);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET6(vnet_iter);
+ q6 = V_ip6q.ip6q_next;
+ if (q6)
+ while (q6 != &V_ip6q) {
+ --q6->ip6q_ttl;
+ q6 = q6->ip6q_next;
+ if (q6->ip6q_prev->ip6q_ttl == 0) {
+ V_ip6stat.ip6s_fragtimeout++;
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(q6->ip6q_prev);
+ }
}
+ /*
+ * If we are over the maximum number of fragments
+ * (due to the limit being lowered), drain off
+ * enough to get down to the new limit.
+ */
+ while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
+ V_ip6q.ip6q_prev) {
+ V_ip6stat.ip6s_fragoverflow++;
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(V_ip6q.ip6q_prev);
}
- /*
- * If we are over the maximum number of fragments
- * (due to the limit being lowered), drain off
- * enough to get down to the new limit.
- */
- while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
- V_ip6q.ip6q_prev) {
- V_ip6stat.ip6s_fragoverflow++;
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(V_ip6q.ip6q_prev);
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IP6Q_UNLOCK();
#if 0
/*
* Routing changes might produce a better route than we last used;
* make sure we notice eventually, even if forwarding only for one
* destination and the cache is never replaced.
*/
if (V_ip6_forward_rt.ro_rt) {
RTFREE(V_ip6_forward_rt.ro_rt);
V_ip6_forward_rt.ro_rt = 0;
}
if (ipsrcchk_rt.ro_rt) {
RTFREE(ipsrcchk_rt.ro_rt);
ipsrcchk_rt.ro_rt = 0;
}
#endif
}
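/*
 * The pattern above - VNET_LIST_RLOCK() around a VNET_FOREACH() loop,
 * with each iteration bracketed by CURVNET_SET()/CURVNET_RESTORE() and
 * INIT_VNET_INET6() - is how this change lets a single global timer
 * service every vnet's instance of the virtualized reassembly state;
 * frag6_drain() below applies the same pattern.
 */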
/*
* Drain off all datagram fragments.
*/
void
frag6_drain(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
if (IP6Q_TRYLOCK() == 0)
return;
- while (V_ip6q.ip6q_next != &V_ip6q) {
- V_ip6stat.ip6s_fragdropped++;
- /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
- frag6_freef(V_ip6q.ip6q_next);
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INIT_VNET_INET6(vnet_iter);
+ while (V_ip6q.ip6q_next != &V_ip6q) {
+ V_ip6stat.ip6s_fragdropped++;
+ /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+ frag6_freef(V_ip6q.ip6q_next);
+ }
+ CURVNET_RESTORE();
}
+ VNET_LIST_RUNLOCK();
IP6Q_UNLOCK();
}
Index: head/sys/netinet6/icmp6.c
===================================================================
--- head/sys/netinet6/icmp6.c (revision 183549)
+++ head/sys/netinet6/icmp6.c (revision 183550)
@@ -1,2800 +1,2819 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/tcp_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/mld6_var.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif
extern struct domain inet6domain;
struct icmp6stat icmp6stat;
extern struct inpcbinfo ripcbinfo;
extern struct inpcbhead ripcb;
extern int icmp6errppslim;
static int icmp6errpps_count = 0;
static struct timeval icmp6errppslim_last;
extern int icmp6_nodeinfo;
static void icmp6_errcount(struct icmp6errstat *, int, int);
static int icmp6_rip6_input(struct mbuf **, int);
static int icmp6_ratelimit(const struct in6_addr *, const int, const int);
static const char *icmp6_redirect_diag __P((struct in6_addr *,
struct in6_addr *, struct in6_addr *));
static struct mbuf *ni6_input(struct mbuf *, int);
static struct mbuf *ni6_nametodns(const char *, int, int);
static int ni6_dnsmatch(const char *, int, const char *, int);
static int ni6_addrs __P((struct icmp6_nodeinfo *, struct mbuf *,
struct ifnet **, struct in6_addr *));
static int ni6_store_addrs __P((struct icmp6_nodeinfo *, struct icmp6_nodeinfo *,
struct ifnet *, int));
static int icmp6_notify_error(struct mbuf **, int, int, int);
void
icmp6_init(void)
{
+ INIT_VNET_INET6(curvnet);
mld6_init();
}
static void
icmp6_errcount(struct icmp6errstat *stat, int type, int code)
{
switch (type) {
case ICMP6_DST_UNREACH:
switch (code) {
case ICMP6_DST_UNREACH_NOROUTE:
stat->icp6errs_dst_unreach_noroute++;
return;
case ICMP6_DST_UNREACH_ADMIN:
stat->icp6errs_dst_unreach_admin++;
return;
case ICMP6_DST_UNREACH_BEYONDSCOPE:
stat->icp6errs_dst_unreach_beyondscope++;
return;
case ICMP6_DST_UNREACH_ADDR:
stat->icp6errs_dst_unreach_addr++;
return;
case ICMP6_DST_UNREACH_NOPORT:
stat->icp6errs_dst_unreach_noport++;
return;
}
break;
case ICMP6_PACKET_TOO_BIG:
stat->icp6errs_packet_too_big++;
return;
case ICMP6_TIME_EXCEEDED:
switch (code) {
case ICMP6_TIME_EXCEED_TRANSIT:
stat->icp6errs_time_exceed_transit++;
return;
case ICMP6_TIME_EXCEED_REASSEMBLY:
stat->icp6errs_time_exceed_reassembly++;
return;
}
break;
case ICMP6_PARAM_PROB:
switch (code) {
case ICMP6_PARAMPROB_HEADER:
stat->icp6errs_paramprob_header++;
return;
case ICMP6_PARAMPROB_NEXTHEADER:
stat->icp6errs_paramprob_nextheader++;
return;
case ICMP6_PARAMPROB_OPTION:
stat->icp6errs_paramprob_option++;
return;
}
break;
case ND_REDIRECT:
stat->icp6errs_redirect++;
return;
}
stat->icp6errs_unknown++;
}
/*
* A wrapper function for icmp6_error() necessary when the erroneous packet
* may not contain enough scope zone information.
*/
void
icmp6_error2(struct mbuf *m, int type, int code, int param,
struct ifnet *ifp)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6;
if (ifp == NULL)
return;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
#else
if (m->m_len < sizeof(struct ip6_hdr)) {
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL)
return;
}
#endif
ip6 = mtod(m, struct ip6_hdr *);
if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0)
return;
if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
return;
icmp6_error(m, type, code, param);
}
/*
* Generate an error packet of type error in response to bad IP6 packet.
*/
void
icmp6_error(struct mbuf *m, int type, int code, int param)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *oip6, *nip6;
struct icmp6_hdr *icmp6;
u_int preplen;
int off;
int nxt;
V_icmp6stat.icp6s_error++;
/* count per-type-code statistics */
icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, type, code);
#ifdef M_DECRYPTED /*not openbsd*/
if (m->m_flags & M_DECRYPTED) {
V_icmp6stat.icp6s_canterror++;
goto freeit;
}
#endif
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), );
#else
if (m->m_len < sizeof(struct ip6_hdr)) {
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL)
return;
}
#endif
oip6 = mtod(m, struct ip6_hdr *);
/*
* If the destination address of the erroneous packet is a multicast
* address, or the packet was sent using link-layer multicast,
* we should basically suppress sending an error (RFC 2463, Section
* 2.4).
* We have two exceptions (the item e.2 in that section):
* - the Packet Too Big message can be sent for path MTU discovery.
* - the Parameter Problem message can be sent for an option whose
* type allows an icmp6 error. This check has been done in
* ip6_unknown_opt(), so we can just check the type and code.
*/
if ((m->m_flags & (M_BCAST|M_MCAST) ||
IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
(type != ICMP6_PACKET_TOO_BIG &&
(type != ICMP6_PARAM_PROB ||
code != ICMP6_PARAMPROB_OPTION)))
goto freeit;
/*
* RFC 2463, 2.4 (e.5): source address check.
* XXX: the case of anycast source?
*/
if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
goto freeit;
/*
* If we are about to send ICMPv6 against ICMPv6 error/redirect,
* don't do it.
*/
nxt = -1;
off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
if (off >= 0 && nxt == IPPROTO_ICMPV6) {
struct icmp6_hdr *icp;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0, off + sizeof(struct icmp6_hdr), );
icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(icp, struct icmp6_hdr *, m, off,
sizeof(*icp));
if (icp == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
icp->icmp6_type == ND_REDIRECT) {
/*
* ICMPv6 error
* Special case: for redirect (which is
* informational) we must not send icmp6 error.
*/
V_icmp6stat.icp6s_canterror++;
goto freeit;
} else {
/* ICMPv6 informational - send the error */
}
} else {
/* non-ICMPv6 - send the error */
}
oip6 = mtod(m, struct ip6_hdr *); /* adjust pointer */
/* Finally, do rate limitation check. */
if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
V_icmp6stat.icp6s_toofreq++;
goto freeit;
}
/*
* OK, ICMP6 can be generated.
*/
if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);
preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
M_PREPEND(m, preplen, M_DONTWAIT);
if (m && m->m_len < preplen)
m = m_pullup(m, preplen);
if (m == NULL) {
nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
return;
}
nip6 = mtod(m, struct ip6_hdr *);
nip6->ip6_src = oip6->ip6_src;
nip6->ip6_dst = oip6->ip6_dst;
in6_clearscope(&oip6->ip6_src);
in6_clearscope(&oip6->ip6_dst);
icmp6 = (struct icmp6_hdr *)(nip6 + 1);
icmp6->icmp6_type = type;
icmp6->icmp6_code = code;
icmp6->icmp6_pptr = htonl((u_int32_t)param);
/*
* icmp6_reflect() is designed to be in the input path.
* icmp6_error() can be called from both input and output path,
* and if we are in output path rcvif could contain bogus value.
* clear m->m_pkthdr.rcvif for safety, we should have enough scope
* information in ip header (nip6).
*/
m->m_pkthdr.rcvif = NULL;
V_icmp6stat.icp6s_outhist[type]++;
icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */
return;
freeit:
/*
* If we can't tell whether or not we can generate ICMP6, free it.
*/
m_freem(m);
}
/*
* Process a received ICMP6 message.
*/
int
icmp6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
+ INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */
struct mbuf *m = *mp, *n;
struct ip6_hdr *ip6, *nip6;
struct icmp6_hdr *icmp6, *nicmp6;
int off = *offp;
int icmp6len = m->m_pkthdr.len - *offp;
int code, sum, noff;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), IPPROTO_DONE);
/* m might change if M_LOOP. So, call mtod after this */
#endif
/*
* Locate the icmp6 structure in the mbuf, and check
* that it is not corrupted and of at least minimum length
*/
ip6 = mtod(m, struct ip6_hdr *);
if (icmp6len < sizeof(struct icmp6_hdr)) {
V_icmp6stat.icp6s_tooshort++;
goto freeit;
}
/*
* calculate the checksum
*/
#ifndef PULLDOWN_TEST
icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
if (icmp6 == NULL) {
V_icmp6stat.icp6s_tooshort++;
return IPPROTO_DONE;
}
#endif
code = icmp6->icmp6_code;
if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
nd6log((LOG_ERR,
"ICMP6 checksum error(%d|%x) %s\n",
icmp6->icmp6_type, sum,
ip6_sprintf(ip6bufs, &ip6->ip6_src)));
V_icmp6stat.icp6s_checksum++;
goto freeit;
}
if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) {
/*
* Deliver only very specific ICMP6 types.
* This is important for delivering TOOBIG; otherwise PMTUD
* will not work.
*/
switch (icmp6->icmp6_type) {
case ICMP6_DST_UNREACH:
case ICMP6_PACKET_TOO_BIG:
case ICMP6_TIME_EXCEEDED:
break;
default:
goto freeit;
}
}
V_icmp6stat.icp6s_inhist[icmp6->icmp6_type]++;
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_msg);
if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK)
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error);
switch (icmp6->icmp6_type) {
case ICMP6_DST_UNREACH:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach);
switch (code) {
case ICMP6_DST_UNREACH_NOROUTE:
code = PRC_UNREACH_NET;
break;
case ICMP6_DST_UNREACH_ADMIN:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_adminprohib);
code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
break;
case ICMP6_DST_UNREACH_ADDR:
code = PRC_HOSTDEAD;
break;
case ICMP6_DST_UNREACH_BEYONDSCOPE:
/* I mean "source address was incorrect." */
code = PRC_PARAMPROB;
break;
case ICMP6_DST_UNREACH_NOPORT:
code = PRC_UNREACH_PORT;
break;
default:
goto badcode;
}
goto deliver;
break;
case ICMP6_PACKET_TOO_BIG:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig);
/* validation is made in icmp6_mtudisc_update */
code = PRC_MSGSIZE;
/*
* Updating the path MTU will be done after examining
* intermediate extension headers.
*/
goto deliver;
break;
case ICMP6_TIME_EXCEEDED:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed);
switch (code) {
case ICMP6_TIME_EXCEED_TRANSIT:
code = PRC_TIMXCEED_INTRANS;
break;
case ICMP6_TIME_EXCEED_REASSEMBLY:
code = PRC_TIMXCEED_REASS;
break;
default:
goto badcode;
}
goto deliver;
break;
case ICMP6_PARAM_PROB:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_paramprob);
switch (code) {
case ICMP6_PARAMPROB_NEXTHEADER:
code = PRC_UNREACH_PROTOCOL;
break;
case ICMP6_PARAMPROB_HEADER:
case ICMP6_PARAMPROB_OPTION:
code = PRC_PARAMPROB;
break;
default:
goto badcode;
}
goto deliver;
break;
case ICMP6_ECHO_REQUEST:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echo);
if (code != 0)
goto badcode;
if ((n = m_copy(m, 0, M_COPYALL)) == NULL) {
/* Give up remote */
break;
}
if ((n->m_flags & M_EXT) != 0
|| n->m_len < off + sizeof(struct icmp6_hdr)) {
struct mbuf *n0 = n;
const int maxlen = sizeof(*nip6) + sizeof(*nicmp6);
int n0len;
MGETHDR(n, M_DONTWAIT, n0->m_type);
n0len = n0->m_pkthdr.len; /* save for use below */
if (n)
M_MOVE_PKTHDR(n, n0);
if (n && maxlen >= MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (n == NULL) {
/* Give up remote */
m_freem(n0);
break;
}
/*
* Copy IPv6 and ICMPv6 only.
*/
nip6 = mtod(n, struct ip6_hdr *);
bcopy(ip6, nip6, sizeof(struct ip6_hdr));
nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
noff = sizeof(struct ip6_hdr);
/* new mbuf contains only ipv6+icmpv6 headers */
n->m_len = noff + sizeof(struct icmp6_hdr);
/*
* Adjust mbuf. ip6_plen will be adjusted in
* ip6_output().
*/
m_adj(n0, off + sizeof(struct icmp6_hdr));
/* recalculate complete packet size */
n->m_pkthdr.len = n0len + (noff - off);
n->m_next = n0;
} else {
nip6 = mtod(n, struct ip6_hdr *);
IP6_EXTHDR_GET(nicmp6, struct icmp6_hdr *, n, off,
sizeof(*nicmp6));
noff = off;
}
nicmp6->icmp6_type = ICMP6_ECHO_REPLY;
nicmp6->icmp6_code = 0;
if (n) {
V_icmp6stat.icp6s_reflect++;
V_icmp6stat.icp6s_outhist[ICMP6_ECHO_REPLY]++;
icmp6_reflect(n, noff);
}
break;
case ICMP6_ECHO_REPLY:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_echoreply);
if (code != 0)
goto badcode;
break;
case MLD_LISTENER_QUERY:
case MLD_LISTENER_REPORT:
if (icmp6len < sizeof(struct mld_hdr))
goto badlen;
if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery);
else
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport);
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
mld6_input(m, off);
m = NULL;
goto freeit;
}
mld6_input(n, off);
/* m stays. */
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone);
if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */
goto badlen;
break; /* nothing to be done in kernel */
case MLD_MTRACE_RESP:
case MLD_MTRACE:
/* XXX: these two are experimental. not officially defined. */
/* XXX: per-interface statistics? */
break; /* just pass it to applications */
case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */
{
enum { WRU, FQDN } mode;
if (!V_icmp6_nodeinfo)
break;
if (icmp6len == sizeof(struct icmp6_hdr) + 4)
mode = WRU;
else if (icmp6len >= sizeof(struct icmp6_nodeinfo))
mode = FQDN;
else
goto badlen;
#define hostnamelen strlen(V_hostname)
if (mode == FQDN) {
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_nodeinfo),
IPPROTO_DONE);
#endif
n = m_copy(m, 0, M_COPYALL);
if (n)
n = ni6_input(n, off);
/* XXX meaningless if n == NULL */
noff = sizeof(struct ip6_hdr);
} else {
u_char *p;
int maxlen, maxhlen;
/*
* XXX: this combination of flags is pointless,
* but should we keep this for compatibility?
*/
if ((V_icmp6_nodeinfo & 5) != 5)
break;
if (code != 0)
goto badcode;
maxlen = sizeof(*nip6) + sizeof(*nicmp6) + 4;
if (maxlen >= MCLBYTES) {
/* Give up remote */
break;
}
MGETHDR(n, M_DONTWAIT, m->m_type);
if (n && maxlen > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (n && !m_dup_pkthdr(n, m, M_DONTWAIT)) {
/*
* Previous code did a blind M_COPY_PKTHDR
* and said "just for rcvif". If true, then
* we could tolerate the dup failing (due to
* the deep copy of the tag chain). For now
* be conservative and just fail.
*/
m_free(n);
n = NULL;
}
if (n == NULL) {
/* Give up remote */
break;
}
n->m_pkthdr.rcvif = NULL;
n->m_len = 0;
maxhlen = M_TRAILINGSPACE(n) - maxlen;
mtx_lock(&hostname_mtx);
if (maxhlen > hostnamelen)
maxhlen = hostnamelen;
/*
* Copy IPv6 and ICMPv6 only.
*/
nip6 = mtod(n, struct ip6_hdr *);
bcopy(ip6, nip6, sizeof(struct ip6_hdr));
nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
p = (u_char *)(nicmp6 + 1);
bzero(p, 4);
bcopy(V_hostname, p + 4, maxhlen); /* meaningless TTL */
mtx_unlock(&hostname_mtx);
noff = sizeof(struct ip6_hdr);
n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
sizeof(struct icmp6_hdr) + 4 + maxhlen;
nicmp6->icmp6_type = ICMP6_WRUREPLY;
nicmp6->icmp6_code = 0;
}
#undef hostnamelen
if (n) {
V_icmp6stat.icp6s_reflect++;
V_icmp6stat.icp6s_outhist[ICMP6_WRUREPLY]++;
icmp6_reflect(n, noff);
}
break;
}
case ICMP6_WRUREPLY:
if (code != 0)
goto badcode;
break;
case ND_ROUTER_SOLICIT:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routersolicit);
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_router_solicit))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_rs_input(m, off, icmp6len);
m = NULL;
goto freeit;
}
nd6_rs_input(n, off, icmp6len);
/* m stays. */
break;
case ND_ROUTER_ADVERT:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_routeradvert);
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_router_advert))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_ra_input(m, off, icmp6len);
m = NULL;
goto freeit;
}
nd6_ra_input(n, off, icmp6len);
/* m stays. */
break;
case ND_NEIGHBOR_SOLICIT:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighborsolicit);
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_neighbor_solicit))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_ns_input(m, off, icmp6len);
m = NULL;
goto freeit;
}
nd6_ns_input(n, off, icmp6len);
/* m stays. */
break;
case ND_NEIGHBOR_ADVERT:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_neighboradvert);
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_neighbor_advert))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
nd6_na_input(m, off, icmp6len);
m = NULL;
goto freeit;
}
nd6_na_input(n, off, icmp6len);
/* m stays. */
break;
case ND_REDIRECT:
icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_redirect);
if (code != 0)
goto badcode;
if (icmp6len < sizeof(struct nd_redirect))
goto badlen;
if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) {
/* give up local */
icmp6_redirect_input(m, off);
m = NULL;
goto freeit;
}
icmp6_redirect_input(n, off);
/* m stays. */
break;
case ICMP6_ROUTER_RENUMBERING:
if (code != ICMP6_ROUTER_RENUMBERING_COMMAND &&
code != ICMP6_ROUTER_RENUMBERING_RESULT)
goto badcode;
if (icmp6len < sizeof(struct icmp6_router_renum))
goto badlen;
break;
default:
nd6log((LOG_DEBUG,
"icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n",
icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
m->m_pkthdr.rcvif ? m->m_pkthdr.rcvif->if_index : 0));
if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
/* ICMPv6 error: MUST deliver it by spec... */
code = PRC_NCMDS;
/* deliver */
} else {
/* ICMPv6 informational: MUST not deliver */
break;
}
deliver:
if (icmp6_notify_error(&m, off, icmp6len, code)) {
/* In this case, m should've been freed. */
return (IPPROTO_DONE);
}
break;
badcode:
V_icmp6stat.icp6s_badcode++;
break;
badlen:
V_icmp6stat.icp6s_badlen++;
break;
}
/* deliver the packet to appropriate sockets */
icmp6_rip6_input(&m, *offp);
return IPPROTO_DONE;
freeit:
m_freem(m);
return IPPROTO_DONE;
}
static int
icmp6_notify_error(struct mbuf **mp, int off, int icmp6len, int code)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
struct icmp6_hdr *icmp6;
struct ip6_hdr *eip6;
u_int32_t notifymtu;
struct sockaddr_in6 icmp6src, icmp6dst;
if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) {
V_icmp6stat.icp6s_tooshort++;
goto freeit;
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off,
sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr), -1);
icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
sizeof(*icmp6) + sizeof(struct ip6_hdr));
if (icmp6 == NULL) {
V_icmp6stat.icp6s_tooshort++;
return (-1);
}
#endif
eip6 = (struct ip6_hdr *)(icmp6 + 1);
/* Detect the upper level protocol */
{
void (*ctlfunc)(int, struct sockaddr *, void *);
u_int8_t nxt = eip6->ip6_nxt;
int eoff = off + sizeof(struct icmp6_hdr) +
sizeof(struct ip6_hdr);
struct ip6ctlparam ip6cp;
struct in6_addr *finaldst = NULL;
int icmp6type = icmp6->icmp6_type;
struct ip6_frag *fh;
struct ip6_rthdr *rth;
struct ip6_rthdr0 *rth0;
int rthlen;
while (1) { /* XXX: should avoid infinite loop explicitly? */
struct ip6_ext *eh;
switch (nxt) {
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
case IPPROTO_AH:
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0,
eoff + sizeof(struct ip6_ext), -1);
eh = (struct ip6_ext *)(mtod(m, caddr_t) + eoff);
#else
IP6_EXTHDR_GET(eh, struct ip6_ext *, m,
eoff, sizeof(*eh));
if (eh == NULL) {
V_icmp6stat.icp6s_tooshort++;
return (-1);
}
#endif
if (nxt == IPPROTO_AH)
eoff += (eh->ip6e_len + 2) << 2;
else
eoff += (eh->ip6e_len + 1) << 3;
nxt = eh->ip6e_nxt;
break;
case IPPROTO_ROUTING:
/*
* When the erroneous packet contains a
* routing header, we should examine the
* header to determine the final destination.
* Otherwise, we can't properly update
* information that depends on the final
* destination (e.g. path MTU).
*/
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0, eoff + sizeof(*rth), -1);
rth = (struct ip6_rthdr *)
(mtod(m, caddr_t) + eoff);
#else
IP6_EXTHDR_GET(rth, struct ip6_rthdr *, m,
eoff, sizeof(*rth));
if (rth == NULL) {
V_icmp6stat.icp6s_tooshort++;
return (-1);
}
#endif
rthlen = (rth->ip6r_len + 1) << 3;
/*
* XXX: currently there is no
* officially defined type other
* than type-0.
* Note that if the segment left field
* is 0, all intermediate hops must
* have been passed.
*/
if (rth->ip6r_segleft &&
rth->ip6r_type == IPV6_RTHDR_TYPE_0) {
int hops;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0, eoff + rthlen, -1);
rth0 = (struct ip6_rthdr0 *)
(mtod(m, caddr_t) + eoff);
#else
IP6_EXTHDR_GET(rth0,
struct ip6_rthdr0 *, m,
eoff, rthlen);
if (rth0 == NULL) {
V_icmp6stat.icp6s_tooshort++;
return (-1);
}
#endif
/* just ignore a bogus header */
if ((rth0->ip6r0_len % 2) == 0 &&
(hops = rth0->ip6r0_len/2))
finaldst = (struct in6_addr *)(rth0 + 1) + (hops - 1);
}
eoff += rthlen;
nxt = rth->ip6r_nxt;
break;
case IPPROTO_FRAGMENT:
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, 0, eoff +
sizeof(struct ip6_frag), -1);
fh = (struct ip6_frag *)(mtod(m, caddr_t) +
eoff);
#else
IP6_EXTHDR_GET(fh, struct ip6_frag *, m,
eoff, sizeof(*fh));
if (fh == NULL) {
V_icmp6stat.icp6s_tooshort++;
return (-1);
}
#endif
/*
* Data after a fragment header is meaningless
* unless it is the first fragment, but
* we'll go to the notify label for path MTU
* discovery.
*/
if (fh->ip6f_offlg & IP6F_OFF_MASK)
goto notify;
eoff += sizeof(struct ip6_frag);
nxt = fh->ip6f_nxt;
break;
default:
/*
* This case includes ESP and the No Next
* Header. In such cases going to the notify
* label does not have any meaning
* (i.e. ctlfunc will be NULL), but we go
* anyway since we might have to update
* path MTU information.
*/
goto notify;
}
}
notify:
#ifndef PULLDOWN_TEST
icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off,
sizeof(*icmp6) + sizeof(struct ip6_hdr));
if (icmp6 == NULL) {
V_icmp6stat.icp6s_tooshort++;
return (-1);
}
#endif
/*
* retrieve parameters from the inner IPv6 header, and convert
* them into sockaddr structures.
* XXX: there is no guarantee that the source or destination
* addresses of the inner packet are in the same scope as
* the addresses of the icmp packet. But there is no other
* way to determine the zone.
*/
eip6 = (struct ip6_hdr *)(icmp6 + 1);
bzero(&icmp6dst, sizeof(icmp6dst));
icmp6dst.sin6_len = sizeof(struct sockaddr_in6);
icmp6dst.sin6_family = AF_INET6;
if (finaldst == NULL)
icmp6dst.sin6_addr = eip6->ip6_dst;
else
icmp6dst.sin6_addr = *finaldst;
if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL))
goto freeit;
bzero(&icmp6src, sizeof(icmp6src));
icmp6src.sin6_len = sizeof(struct sockaddr_in6);
icmp6src.sin6_family = AF_INET6;
icmp6src.sin6_addr = eip6->ip6_src;
if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL))
goto freeit;
icmp6src.sin6_flowinfo =
(eip6->ip6_flow & IPV6_FLOWLABEL_MASK);
if (finaldst == NULL)
finaldst = &eip6->ip6_dst;
ip6cp.ip6c_m = m;
ip6cp.ip6c_icmp6 = icmp6;
ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1);
ip6cp.ip6c_off = eoff;
ip6cp.ip6c_finaldst = finaldst;
ip6cp.ip6c_src = &icmp6src;
ip6cp.ip6c_nxt = nxt;
if (icmp6type == ICMP6_PACKET_TOO_BIG) {
notifymtu = ntohl(icmp6->icmp6_mtu);
ip6cp.ip6c_cmdarg = (void *)&notifymtu;
icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/
}
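/*
 * Dispatch to the upper layer's control input routine via the
 * protocol switch (e.g. tcp6_ctlinput() for IPPROTO_TCP) so it
 * can act on the error for any matching PCBs.
 */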
ctlfunc = (void (*)(int, struct sockaddr *, void *))
(inet6sw[ip6_protox[nxt]].pr_ctlinput);
if (ctlfunc) {
(void) (*ctlfunc)(code, (struct sockaddr *)&icmp6dst,
&ip6cp);
}
}
*mp = m;
return (0);
freeit:
m_freem(m);
return (-1);
}
void
icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
{
+ INIT_VNET_INET6(curvnet);
struct in6_addr *dst = ip6cp->ip6c_finaldst;
struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */
u_int mtu = ntohl(icmp6->icmp6_mtu);
struct in_conninfo inc;
#if 0
/*
* RFC2460 section 5, last paragraph.
* even though minimum link MTU for IPv6 is IPV6_MMTU,
* we may see ICMPv6 too big with mtu < IPV6_MMTU
* due to packet translator in the middle.
* see ip6_output() and ip6_getpmtu() "alwaysfrag" case for
* special handling.
*/
if (mtu < IPV6_MMTU)
return;
#endif
/*
* we reject an ICMPv6 Packet Too Big with an abnormally small value.
* XXX what is the good definition of "abnormally small"?
*/
if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8)
return;
if (!validated)
return;
/*
* In case the suggested mtu is less than IPV6_MMTU, we
* only need to remember that it was for the above-mentioned
* "alwaysfrag" case.
* Try to be as close to the spec as possible.
*/
if (mtu < IPV6_MMTU)
mtu = IPV6_MMTU - 8;
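/*
 * (A sketch of the rationale: IPV6_MMTU - 8 presumably leaves
 * room for the 8-byte fragment header that the "alwaysfrag"
 * path will insert, keeping the result within IPV6_MMTU.)
 */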
bzero(&inc, sizeof(inc));
inc.inc_flags = 1; /* IPv6 */
inc.inc6_faddr = *dst;
if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL))
return;
if (mtu < tcp_maxmtu6(&inc, NULL)) {
tcp_hc_updatemtu(&inc, mtu);
V_icmp6stat.icp6s_pmtuchg++;
}
}
/*
* Process a Node Information Query packet, based on
* draft-ietf-ipngwg-icmp-name-lookups-07.
*
* Spec incompatibilities:
* - IPv6 Subject address handling
* - IPv4 Subject address handling support missing
* - Proxy reply (answer even if it's not for me)
* - joins NI group address at in6_ifattach() time only, does not cope
* with hostname changes by sethostname(3)
*/
#define hostnamelen strlen(V_hostname)
static struct mbuf *
ni6_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET6(curvnet);
+ INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */
struct icmp6_nodeinfo *ni6, *nni6;
struct mbuf *n = NULL;
u_int16_t qtype;
int subjlen;
int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
struct ni_reply_fqdn *fqdn;
int addrs; /* for NI_QTYPE_NODEADDR */
struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */
struct in6_addr in6_subj; /* subject address */
struct ip6_hdr *ip6;
int oldfqdn = 0; /* if 1, return pascal string (03 draft) */
char *subj = NULL;
struct in6_ifaddr *ia6 = NULL;
ip6 = mtod(m, struct ip6_hdr *);
#ifndef PULLDOWN_TEST
ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6));
if (ni6 == NULL) {
/* m is already reclaimed */
return (NULL);
}
#endif
/*
* Validate IPv6 source address.
* The default configuration MUST be to refuse answering queries from
* global-scope addresses according to RFC4602.
* Notes:
* - it's not very clear what "refuse" means; this implementation
* simply drops it.
* - it's not very easy to identify global-scope (unicast) addresses
* since there are many prefixes for them. It should be safer
* and in practice sufficient to check "all" but loopback and
* link-local (note that site-local unicast was deprecated and
* ULA is defined as global in scope)
*/
if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 &&
!IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src))
goto bad;
/*
* Validate IPv6 destination address.
*
* The Responder must discard the Query without further processing
* unless it is one of the Responder's unicast or anycast addresses, or
* a link-local scope multicast address which the Responder has joined.
* [RFC4602, Section 5.]
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
goto bad;
/* else it's a link-local multicast, fine */
} else { /* unicast or anycast */
if ((ia6 = ip6_getdstifaddr(m)) == NULL)
goto bad; /* XXX impossible */
if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) &&
!(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) {
nd6log((LOG_DEBUG, "ni6_input: ignore node info to "
"a temporary address in %s:%d",
__FILE__, __LINE__));
goto bad;
}
}
/* validate query Subject field. */
qtype = ntohs(ni6->ni_qtype);
subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo);
switch (qtype) {
case NI_QTYPE_NOOP:
case NI_QTYPE_SUPTYPES:
/* 07 draft */
if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0)
break;
/* FALLTHROUGH */
case NI_QTYPE_FQDN:
case NI_QTYPE_NODEADDR:
case NI_QTYPE_IPV4ADDR:
switch (ni6->ni_code) {
case ICMP6_NI_SUBJ_IPV6:
#if ICMP6_NI_SUBJ_IPV6 != 0
case 0:
#endif
/*
* backward compatibility - try to accept 03 draft
* format, where no Subject is present.
*/
if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 &&
subjlen == 0) {
oldfqdn++;
break;
}
#if ICMP6_NI_SUBJ_IPV6 != 0
if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
goto bad;
#endif
if (subjlen != sizeof(struct in6_addr))
goto bad;
/*
* Validate Subject address.
*
* Not sure what exactly "address belongs to the node"
* means in the spec, is it just unicast, or what?
*
* At this moment we consider the Subject address to
* "belong to the node" if it equals the IPv6
* destination address; the validation of the IPv6
* destination address should have done enough
* checking for us.
*
* We do not do proxy at this moment.
*/
/* m_pulldown instead of copy? */
m_copydata(m, off + sizeof(struct icmp6_nodeinfo),
subjlen, (caddr_t)&in6_subj);
if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL))
goto bad;
subj = (char *)&in6_subj;
if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj))
break;
/*
* XXX if we are to allow other cases, we should really
* be careful about scope here.
* basically, we should disallow queries toward IPv6
* destination X with subject Y,
* if scope(X) > scope(Y).
* if we allow scope(X) > scope(Y), it will result in
* information leakage across scope boundary.
*/
goto bad;
case ICMP6_NI_SUBJ_FQDN:
/*
* Validate Subject name with gethostname(3).
*
* The behavior may need some debate, since:
* - we are not sure if the node has FQDN as
* hostname (returned by gethostname(3)).
* - the code does wildcard match for truncated names.
* however, we are not sure if we want to perform
* wildcard match, if gethostname(3) side has
* truncated hostname.
*/
mtx_lock(&hostname_mtx);
n = ni6_nametodns(V_hostname, hostnamelen, 0);
mtx_unlock(&hostname_mtx);
if (!n || n->m_next || n->m_len == 0)
goto bad;
IP6_EXTHDR_GET(subj, char *, m,
off + sizeof(struct icmp6_nodeinfo), subjlen);
if (subj == NULL)
goto bad;
if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *),
n->m_len)) {
goto bad;
}
m_freem(n);
n = NULL;
break;
case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */
default:
goto bad;
}
break;
}
/* refuse based on configuration. XXX ICMP6_NI_REFUSED? */
switch (qtype) {
case NI_QTYPE_FQDN:
if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0)
goto bad;
break;
case NI_QTYPE_NODEADDR:
case NI_QTYPE_IPV4ADDR:
if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0)
goto bad;
break;
}
/* guess reply length */
switch (qtype) {
case NI_QTYPE_NOOP:
break; /* no reply data */
case NI_QTYPE_SUPTYPES:
replylen += sizeof(u_int32_t);
break;
case NI_QTYPE_FQDN:
/* XXX will append an mbuf */
replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
break;
case NI_QTYPE_NODEADDR:
addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj);
if ((replylen += addrs * (sizeof(struct in6_addr) +
sizeof(u_int32_t))) > MCLBYTES)
replylen = MCLBYTES; /* XXX: will truncate pkt later */
break;
case NI_QTYPE_IPV4ADDR:
/* unsupported - should respond with unknown Qtype? */
break;
default:
/*
* XXX: We must return a reply with the ICMP6 code
* `unknown Qtype' in this case. However we regard the case
* as an FQDN query for backward compatibility.
* Older versions set a random value to this field,
* so it rarely falls within the defined qtypes.
* But the mechanism is not reliable...
* maybe we should obsolete older versions.
*/
qtype = NI_QTYPE_FQDN;
/* XXX will append an mbuf */
replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
oldfqdn++;
break;
}
/* allocate an mbuf to reply. */
MGETHDR(n, M_DONTWAIT, m->m_type);
if (n == NULL) {
m_freem(m);
return (NULL);
}
M_MOVE_PKTHDR(n, m); /* just for recvif */
if (replylen > MHLEN) {
if (replylen > MCLBYTES) {
/*
* XXX: should we try to allocate more? But MCLBYTES
* is probably much larger than IPV6_MMTU...
*/
goto bad;
}
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
goto bad;
}
}
n->m_pkthdr.len = n->m_len = replylen;
/* copy mbuf header and IPv6 + Node Information base headers */
bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr));
nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1);
bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo));
/* qtype dependent procedure */
switch (qtype) {
case NI_QTYPE_NOOP:
nni6->ni_code = ICMP6_NI_SUCCESS;
nni6->ni_flags = 0;
break;
case NI_QTYPE_SUPTYPES:
{
u_int32_t v;
nni6->ni_code = ICMP6_NI_SUCCESS;
nni6->ni_flags = htons(0x0000); /* raw bitmap */
/* supports NOOP, SUPTYPES, FQDN, and NODEADDR */
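/* bit i of the bitmap corresponds to qtype i, so 0x0f covers qtypes 0-3 */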
v = (u_int32_t)htonl(0x0000000f);
bcopy(&v, nni6 + 1, sizeof(u_int32_t));
break;
}
case NI_QTYPE_FQDN:
nni6->ni_code = ICMP6_NI_SUCCESS;
fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) +
sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo));
nni6->ni_flags = 0; /* XXX: meaningless TTL */
fqdn->ni_fqdn_ttl = 0; /* ditto. */
/*
* XXX do we really have FQDN in variable "hostname"?
*/
mtx_lock(&hostname_mtx);
n->m_next = ni6_nametodns(V_hostname, hostnamelen, oldfqdn);
mtx_unlock(&hostname_mtx);
if (n->m_next == NULL)
goto bad;
/* XXX we assume that n->m_next is not a chain */
if (n->m_next->m_next != NULL)
goto bad;
n->m_pkthdr.len += n->m_next->m_len;
break;
case NI_QTYPE_NODEADDR:
{
int lenlim, copied;
nni6->ni_code = ICMP6_NI_SUCCESS;
n->m_pkthdr.len = n->m_len =
sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
lenlim = M_TRAILINGSPACE(n);
copied = ni6_store_addrs(ni6, nni6, ifp, lenlim);
/* XXX: reset mbuf length */
n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
sizeof(struct icmp6_nodeinfo) + copied;
break;
}
default:
break; /* XXX impossible! */
}
nni6->ni_type = ICMP6_NI_REPLY;
m_freem(m);
return (n);
bad:
m_freem(m);
if (n)
m_freem(n);
return (NULL);
}
#undef hostnamelen
/*
* make a mbuf with DNS-encoded string. no compression support.
*
* XXX names with fewer than 2 dots (like "foo" or "foo.section") will be
* treated as a truncated name (two \0 at the end). this is a wild guess.
*
* old - return pascal string if non-zero
*/
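/*
 * Illustrative example: "foo.example.com" is encoded as
 *	\3 f o o \7 e x a m p l e \3 c o m \0
 * while a name guessed to be a shortened hostname gets a second
 * trailing \0 to mark the truncation.
 */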
static struct mbuf *
ni6_nametodns(const char *name, int namelen, int old)
{
struct mbuf *m;
char *cp, *ep;
const char *p, *q;
int i, len, nterm;
if (old)
len = namelen + 1;
else
len = MCLBYTES;
/* because MAXHOSTNAMELEN is usually 256, we use a cluster mbuf */
MGET(m, M_DONTWAIT, MT_DATA);
if (m && len > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0)
goto fail;
}
if (!m)
goto fail;
m->m_next = NULL;
if (old) {
m->m_len = len;
*mtod(m, char *) = namelen;
bcopy(name, mtod(m, char *) + 1, namelen);
return m;
} else {
m->m_len = 0;
cp = mtod(m, char *);
ep = mtod(m, char *) + M_TRAILINGSPACE(m);
/* if not certain about my name, return empty buffer */
if (namelen == 0)
return m;
/*
* guess if it looks like shortened hostname, or FQDN.
* shortened hostname needs two trailing "\0".
*/
i = 0;
for (p = name; p < name + namelen; p++) {
if (*p && *p == '.')
i++;
}
if (i < 2)
nterm = 2;
else
nterm = 1;
p = name;
while (cp < ep && p < name + namelen) {
i = 0;
for (q = p; q < name + namelen && *q && *q != '.'; q++)
i++;
/* result does not fit into mbuf */
if (cp + i + 1 >= ep)
goto fail;
/*
* DNS label length restriction, RFC1035 page 8.
* "i == 0" case is included here to avoid returning
* 0-length label on "foo..bar".
*/
if (i <= 0 || i >= 64)
goto fail;
*cp++ = i;
bcopy(p, cp, i);
cp += i;
p = q;
if (p < name + namelen && *p == '.')
p++;
}
/* termination */
if (cp + nterm >= ep)
goto fail;
while (nterm-- > 0)
*cp++ = '\0';
m->m_len = cp - mtod(m, char *);
return m;
}
panic("should not reach here");
/* NOTREACHED */
fail:
if (m)
m_freem(m);
return NULL;
}
/*
* check if two DNS-encoded strings match. takes care of the truncated
* form (with \0\0 at the end). no compression support.
* XXX upper/lowercase match (see RFC2065)
*/
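/*
 * E.g. the truncated form \3"foo"\0\0 matches the full form
 * \3"foo"\7"example"\3"com"\0 under the rules below.
 */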
static int
ni6_dnsmatch(const char *a, int alen, const char *b, int blen)
{
const char *a0, *b0;
int l;
/* simplest case - need validation? */
if (alen == blen && bcmp(a, b, alen) == 0)
return 1;
a0 = a;
b0 = b;
/* termination is mandatory */
if (alen < 2 || blen < 2)
return 0;
if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0')
return 0;
alen--;
blen--;
while (a - a0 < alen && b - b0 < blen) {
if (a - a0 + 1 > alen || b - b0 + 1 > blen)
return 0;
if ((signed char)a[0] < 0 || (signed char)b[0] < 0)
return 0;
/* we don't support compression yet */
if (a[0] >= 64 || b[0] >= 64)
return 0;
/* truncated case */
if (a[0] == 0 && a - a0 == alen - 1)
return 1;
if (b[0] == 0 && b - b0 == blen - 1)
return 1;
if (a[0] == 0 || b[0] == 0)
return 0;
if (a[0] != b[0])
return 0;
l = a[0];
if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen)
return 0;
if (bcmp(a + 1, b + 1, l) != 0)
return 0;
a += 1 + l;
b += 1 + l;
}
if (a - a0 == alen && b - b0 == blen)
return 1;
else
return 0;
}
/*
* calculate the number of addresses to be returned in the node info reply.
*/
static int
ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp,
struct in6_addr *subj)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp;
struct in6_ifaddr *ifa6;
struct ifaddr *ifa;
int addrs = 0, addrsofif, iffound = 0;
int niflags = ni6->ni_flags;
if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) {
switch (ni6->ni_code) {
case ICMP6_NI_SUBJ_IPV6:
if (subj == NULL) /* must be impossible... */
return (0);
break;
default:
/*
* XXX: we only support IPv6 subject address for
* this Qtype.
*/
return (0);
}
}
IFNET_RLOCK();
for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) {
addrsofif = 0;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa6 = (struct in6_ifaddr *)ifa;
if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 &&
IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr))
iffound = 1;
/*
* IPv4-mapped addresses can only be returned by a
* Node Information proxy, since they represent
* addresses of IPv4-only nodes, which perforce do
* not implement this protocol.
* [icmp-name-lookups-07, Section 5.4]
* So we don't support NI_NODEADDR_FLAG_COMPAT in
* this function at this moment.
*/
/* What do we have to do about ::1? */
switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
case IPV6_ADDR_SCOPE_LINKLOCAL:
if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
continue;
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
continue;
break;
case IPV6_ADDR_SCOPE_GLOBAL:
if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
continue;
break;
default:
continue;
}
/*
* check if anycast is okay.
* XXX: just experimental. not in the spec.
*/
if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
(niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
continue; /* we need only unicast addresses */
if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
continue;
}
addrsofif++; /* count the address */
}
if (iffound) {
*ifpp = ifp;
IFNET_RUNLOCK();
return (addrsofif);
}
addrs += addrsofif;
}
IFNET_RUNLOCK();
return (addrs);
}
static int
ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6,
struct ifnet *ifp0, int resid)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = ifp0 ? ifp0 : TAILQ_FIRST(&V_ifnet);
struct in6_ifaddr *ifa6;
struct ifaddr *ifa;
struct ifnet *ifp_dep = NULL;
int copied = 0, allow_deprecated = 0;
u_char *cp = (u_char *)(nni6 + 1);
int niflags = ni6->ni_flags;
u_int32_t ltime;
if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL))
return (0); /* no need to copy */
IFNET_RLOCK();
again:
for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) {
for (ifa = ifp->if_addrlist.tqh_first; ifa;
ifa = ifa->ifa_list.tqe_next) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa6 = (struct in6_ifaddr *)ifa;
if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 &&
allow_deprecated == 0) {
/*
* preferred addresses should be put before
* deprecated addresses.
*/
/* record the interface for later search */
if (ifp_dep == NULL)
ifp_dep = ifp;
continue;
} else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 &&
allow_deprecated != 0)
continue; /* we now collect deprecated addrs */
/* What do we have to do about ::1? */
switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
case IPV6_ADDR_SCOPE_LINKLOCAL:
if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
continue;
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
continue;
break;
case IPV6_ADDR_SCOPE_GLOBAL:
if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
continue;
break;
default:
continue;
}
/*
* check if anycast is okay.
* XXX: just experimental. not in the spec.
*/
if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
(niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
continue;
if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
continue;
}
/* now we can copy the address */
if (resid < sizeof(struct in6_addr) +
sizeof(u_int32_t)) {
/*
* No room is left to copy any more addresses.
* Set the truncate flag and return.
*/
nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE;
IFNET_RUNLOCK();
return (copied);
}
/*
* Set the TTL of the address.
* The TTL value should be one of the following
* according to the specification:
*
* 1. The remaining lifetime of a DHCP lease on the
* address, or
* 2. The remaining Valid Lifetime of a prefix from
* which the address was derived through Stateless
* Autoconfiguration.
*
* Note that we currently do not support stateful
* address configuration by DHCPv6, so the former
* case can't happen.
*/
if (ifa6->ia6_lifetime.ia6t_expire == 0)
ltime = ND6_INFINITE_LIFETIME;
else {
if (ifa6->ia6_lifetime.ia6t_expire >
time_second)
ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_second);
else
ltime = 0;
}
bcopy(&ltime, cp, sizeof(u_int32_t));
cp += sizeof(u_int32_t);
/* copy the address itself */
bcopy(&ifa6->ia_addr.sin6_addr, cp,
sizeof(struct in6_addr));
in6_clearscope((struct in6_addr *)cp); /* XXX */
cp += sizeof(struct in6_addr);
resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t));
copied += (sizeof(struct in6_addr) + sizeof(u_int32_t));
}
if (ifp0) /* we need to search only the specified IF */
break;
}
if (allow_deprecated == 0 && ifp_dep != NULL) {
ifp = ifp_dep;
allow_deprecated = 1;
goto again;
}
IFNET_RUNLOCK();
return (copied);
}
/*
* XXX almost dup'ed code with rip6_input.
*/
static int
icmp6_rip6_input(struct mbuf **mp, int off)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct in6pcb *in6p;
struct in6pcb *last = NULL;
struct sockaddr_in6 fromsa;
struct icmp6_hdr *icmp6;
struct mbuf *opts = NULL;
#ifndef PULLDOWN_TEST
/* this is assumed to be safe. */
icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, sizeof(*icmp6));
if (icmp6 == NULL) {
/* m is already reclaimed */
return (IPPROTO_DONE);
}
#endif
/*
* XXX: the address may have embedded scope zone ID, which should be
* hidden from applications.
*/
bzero(&fromsa, sizeof(fromsa));
fromsa.sin6_family = AF_INET6;
fromsa.sin6_len = sizeof(struct sockaddr_in6);
fromsa.sin6_addr = ip6->ip6_src;
if (sa6_recoverscope(&fromsa)) {
m_freem(m);
return (IPPROTO_DONE);
}
INP_INFO_RLOCK(&V_ripcbinfo);
LIST_FOREACH(in6p, &V_ripcb, inp_list) {
if ((in6p->inp_vflag & INP_IPV6) == 0)
continue;
if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
continue;
INP_RLOCK(in6p);
if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
in6p->in6p_icmp6filt)) {
INP_RUNLOCK(in6p);
continue;
}
if (last) {
struct mbuf *n = NULL;
/*
* Recent network drivers tend to allocate a single
* mbuf cluster, rather than a couple of mbufs
* without clusters. Also, since the IPv6 code
* path tries to avoid m_pullup(), it is highly
* probable that we still have an mbuf cluster here
* even though the necessary length can be stored in an
* mbuf's internal buffer.
* Meanwhile, the default size of the receive socket
* buffer for raw sockets is not so large. This means
* the possibility of packet loss is relatively higher
* than before. To avoid this scenario, we copy the
* received data to a separate mbuf that does not use
* a cluster, if possible.
* XXX: it is better to copy the data after stripping
* intermediate headers.
*/
if ((m->m_flags & M_EXT) && m->m_next == NULL &&
m->m_len <= MHLEN) {
MGET(n, M_DONTWAIT, m->m_type);
if (n != NULL) {
if (m_dup_pkthdr(n, m, M_NOWAIT)) {
bcopy(m->m_data, n->m_data,
m->m_len);
n->m_len = m->m_len;
} else {
m_free(n);
n = NULL;
}
}
}
if (n != NULL ||
(n = m_copy(m, 0, (int)M_COPYALL)) != NULL) {
if (last->in6p_flags & IN6P_CONTROLOPTS)
ip6_savecontrol(last, n, &opts);
/* strip intermediate headers */
m_adj(n, off);
SOCKBUF_LOCK(&last->in6p_socket->so_rcv);
if (sbappendaddr_locked(
&last->in6p_socket->so_rcv,
(struct sockaddr *)&fromsa, n, opts)
== 0) {
/* should notify about lost packet */
m_freem(n);
if (opts) {
m_freem(opts);
}
SOCKBUF_UNLOCK(
&last->in6p_socket->so_rcv);
} else
sorwakeup_locked(last->in6p_socket);
opts = NULL;
}
INP_RUNLOCK(last);
}
last = in6p;
}
INP_INFO_RUNLOCK(&V_ripcbinfo);
if (last) {
if (last->in6p_flags & IN6P_CONTROLOPTS)
ip6_savecontrol(last, m, &opts);
/* strip intermediate headers */
m_adj(m, off);
/* avoid using mbuf clusters if possible (see above) */
if ((m->m_flags & M_EXT) && m->m_next == NULL &&
m->m_len <= MHLEN) {
struct mbuf *n;
MGET(n, M_DONTWAIT, m->m_type);
if (n != NULL) {
if (m_dup_pkthdr(n, m, M_NOWAIT)) {
bcopy(m->m_data, n->m_data, m->m_len);
n->m_len = m->m_len;
m_freem(m);
m = n;
} else {
m_freem(n);
n = NULL;
}
}
}
SOCKBUF_LOCK(&last->in6p_socket->so_rcv);
if (sbappendaddr_locked(&last->in6p_socket->so_rcv,
(struct sockaddr *)&fromsa, m, opts) == 0) {
m_freem(m);
if (opts)
m_freem(opts);
SOCKBUF_UNLOCK(&last->in6p_socket->so_rcv);
} else
sorwakeup_locked(last->in6p_socket);
INP_RUNLOCK(last);
} else {
m_freem(m);
V_ip6stat.ip6s_delivered--;
}
return IPPROTO_DONE;
}
/*
* Reflect the ip6 packet back to the source.
* OFF points to the icmp6 header, counted from the top of the mbuf.
*/
void
icmp6_reflect(struct mbuf *m, size_t off)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6;
struct icmp6_hdr *icmp6;
struct in6_ifaddr *ia;
int plen;
int type, code;
struct ifnet *outif = NULL;
struct in6_addr origdst, *src = NULL;
/* too short to reflect */
if (off < sizeof(struct ip6_hdr)) {
nd6log((LOG_DEBUG,
"sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
(u_long)off, (u_long)sizeof(struct ip6_hdr),
__FILE__, __LINE__));
goto bad;
}
/*
* If there are extra headers between IPv6 and ICMPv6, strip
* off that header first.
*/
#ifdef DIAGNOSTIC
if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN)
panic("assumption failed in icmp6_reflect");
#endif
if (off > sizeof(struct ip6_hdr)) {
size_t l;
struct ip6_hdr nip6;
l = off - sizeof(struct ip6_hdr);
m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6);
m_adj(m, l);
l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
if (m->m_len < l) {
if ((m = m_pullup(m, l)) == NULL)
return;
}
bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6));
} else /* off == sizeof(struct ip6_hdr) */ {
size_t l;
l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
if (m->m_len < l) {
if ((m = m_pullup(m, l)) == NULL)
return;
}
}
plen = m->m_pkthdr.len - sizeof(struct ip6_hdr);
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_nxt = IPPROTO_ICMPV6;
icmp6 = (struct icmp6_hdr *)(ip6 + 1);
type = icmp6->icmp6_type; /* keep type for statistics */
code = icmp6->icmp6_code; /* ditto. */
origdst = ip6->ip6_dst;
/*
* ip6_input() drops a packet if its src is multicast.
* So, the src is never multicast.
*/
ip6->ip6_dst = ip6->ip6_src;
/*
* If the incoming packet was addressed directly to us (i.e. unicast),
* use dst as the src for the reply.
* The IN6_IFF_NOTREADY case should be VERY rare, but is possible
* (for example) when we encounter an error during the forwarding
* procedure for a packet destined to a duplicated address of ours.
* Note that ip6_getdstifaddr() may fail if we are in an error handling
* procedure of an outgoing packet of our own, in which case we need
* to search in the ifaddr list.
*/
if (!IN6_IS_ADDR_MULTICAST(&origdst)) {
if ((ia = ip6_getdstifaddr(m))) {
if (!(ia->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)))
src = &ia->ia_addr.sin6_addr;
} else {
struct sockaddr_in6 d;
bzero(&d, sizeof(d));
d.sin6_family = AF_INET6;
d.sin6_len = sizeof(d);
d.sin6_addr = origdst;
ia = (struct in6_ifaddr *)
ifa_ifwithaddr((struct sockaddr *)&d);
if (ia &&
!(ia->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) {
src = &ia->ia_addr.sin6_addr;
}
}
}
if (src == NULL) {
int e;
struct sockaddr_in6 sin6;
struct route_in6 ro;
/*
* This case covers multicasts, our anycast addresses, and unicasts
* that we do not own. Select a source address based on the
* source address of the erroneous packet.
*/
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */
bzero(&ro, sizeof(ro));
src = in6_selectsrc(&sin6, NULL, NULL, &ro, NULL, &outif, &e);
if (ro.ro_rt)
RTFREE(ro.ro_rt); /* XXX: we could use this */
if (src == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG,
"icmp6_reflect: source can't be determined: "
"dst=%s, error=%d\n",
ip6_sprintf(ip6buf, &sin6.sin6_addr), e));
goto bad;
}
}
ip6->ip6_src = *src;
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
if (outif)
ip6->ip6_hlim = ND_IFINFO(outif)->chlim;
else if (m->m_pkthdr.rcvif) {
/* XXX: This may not be the outgoing interface */
ip6->ip6_hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim;
} else
ip6->ip6_hlim = V_ip6_defhlim;
icmp6->icmp6_cksum = 0;
icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6,
sizeof(struct ip6_hdr), plen);
/*
* XXX option handling
*/
m->m_flags &= ~(M_BCAST|M_MCAST);
ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
if (outif)
icmp6_ifoutstat_inc(outif, type, code);
return;
bad:
m_freem(m);
return;
}
void
icmp6_fasttimo(void)
{
return;
}
static const char *
icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6,
struct in6_addr *tgt6)
{
static char buf[1024];
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
char ip6buft[INET6_ADDRSTRLEN];
snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)",
ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6),
ip6_sprintf(ip6buft, tgt6));
return buf;
}
void
icmp6_redirect_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_redirect *nd_rd;
int icmp6len = ntohs(ip6->ip6_plen);
char *lladdr = NULL;
int lladdrlen = 0;
u_char *redirhdr = NULL;
int redirhdrlen = 0;
struct rtentry *rt = NULL;
int is_router;
int is_onlink;
struct in6_addr src6 = ip6->ip6_src;
struct in6_addr redtgt6;
struct in6_addr reddst6;
union nd_opts ndopts;
char ip6buf[INET6_ADDRSTRLEN];
if (!m)
return;
ifp = m->m_pkthdr.rcvif;
if (!ifp)
return;
/* XXX if we are a router, we don't update routes by icmp6 redirect */
if (V_ip6_forwarding)
goto freeit;
if (!V_icmp6_rediraccept)
goto freeit;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_rd, struct nd_redirect *, m, off, icmp6len);
if (nd_rd == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
redtgt6 = nd_rd->nd_rd_target;
reddst6 = nd_rd->nd_rd_dst;
if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) ||
in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) {
goto freeit;
}
/* validation */
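/*
 * Per RFC 2461, section 8.1: the redirect must come from a
 * link-local source with hop limit 255, that source must be our
 * current first-hop router towards the redirected destination,
 * the destination must be unicast, and the target must be either
 * link-local (better-router case) or equal to the destination
 * (on-link case). Each check is performed below.
 */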
if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
nd6log((LOG_ERR,
"ICMP6 redirect sent from %s rejected; "
"must be from linklocal\n",
ip6_sprintf(ip6buf, &src6)));
goto bad;
}
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"ICMP6 redirect sent from %s rejected; "
"hlim=%d (must be 255)\n",
ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim));
goto bad;
}
{
/* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */
struct sockaddr_in6 sin6;
struct in6_addr *gw6;
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
bcopy(&reddst6, &sin6.sin6_addr, sizeof(reddst6));
rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL);
if (rt) {
if (rt->rt_gateway == NULL ||
rt->rt_gateway->sa_family != AF_INET6) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; no route "
"with inet6 gateway found for redirect dst: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
RTFREE_LOCKED(rt);
goto bad;
}
gw6 = &(((struct sockaddr_in6 *)rt->rt_gateway)->sin6_addr);
if (bcmp(&src6, gw6, sizeof(struct in6_addr)) != 0) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"not equal to gw-for-src=%s (must be same): "
"%s\n",
ip6_sprintf(ip6buf, gw6),
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
RTFREE_LOCKED(rt);
goto bad;
}
} else {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"no route found for redirect dst: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
RTFREE_LOCKED(rt);
rt = NULL;
}
if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"redirect dst must be unicast: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
is_router = is_onlink = 0;
if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
is_router = 1; /* router case */
if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0)
is_onlink = 1; /* on-link destination case */
if (!is_router && !is_onlink) {
nd6log((LOG_ERR,
"ICMP6 redirect rejected; "
"neither router case nor onlink case: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
/* validation passed */
icmp6len -= sizeof(*nd_rd);
nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO, "icmp6_redirect_input: "
"invalid ND option, rejected: %s\n",
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
/* nd6_options have incremented stats */
goto freeit;
}
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
}
if (ndopts.nd_opts_rh) {
redirhdrlen = ndopts.nd_opts_rh->nd_opt_rh_len;
redirhdr = (u_char *)(ndopts.nd_opts_rh + 1); /* xxx */
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO,
"icmp6_redirect_input: lladdrlen mismatch for %s "
"(if %d, icmp6 packet %d): %s\n",
ip6_sprintf(ip6buf, &redtgt6),
ifp->if_addrlen, lladdrlen - 2,
icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
goto bad;
}
/* RFC 2461 8.3 */
nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT,
is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER);
if (!is_onlink) { /* better router case. perform rtredirect. */
/* perform rtredirect */
struct sockaddr_in6 sdst;
struct sockaddr_in6 sgw;
struct sockaddr_in6 ssrc;
bzero(&sdst, sizeof(sdst));
bzero(&sgw, sizeof(sgw));
bzero(&ssrc, sizeof(ssrc));
sdst.sin6_family = sgw.sin6_family = ssrc.sin6_family = AF_INET6;
sdst.sin6_len = sgw.sin6_len = ssrc.sin6_len =
sizeof(struct sockaddr_in6);
bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr));
bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr));
rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw,
(struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST,
(struct sockaddr *)&ssrc);
}
/* finally update cached route in each socket via pfctlinput */
{
struct sockaddr_in6 sdst;
bzero(&sdst, sizeof(sdst));
sdst.sin6_family = AF_INET6;
sdst.sin6_len = sizeof(struct sockaddr_in6);
bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&sdst);
#ifdef IPSEC
key_sa_routechange((struct sockaddr *)&sdst);
#endif /* IPSEC */
}
freeit:
m_freem(m);
return;
bad:
V_icmp6stat.icp6s_badredirect++;
m_freem(m);
}
void
icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp; /* my outgoing interface */
struct in6_addr *ifp_ll6;
struct in6_addr *router_ll6;
struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */
struct mbuf *m = NULL; /* newly allocated one */
struct ip6_hdr *ip6; /* m as struct ip6_hdr */
struct nd_redirect *nd_rd;
size_t maxlen;
u_char *p;
struct ifnet *outif = NULL;
struct sockaddr_in6 src_sa;
icmp6_errcount(&V_icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0);
/* if we are not a router, we don't send icmp6 redirects */
if (!V_ip6_forwarding)
goto fail;
/* sanity check */
if (!m0 || !rt || !(rt->rt_flags & RTF_UP) || !(ifp = rt->rt_ifp))
goto fail;
/*
* Address check:
* the source address must identify a neighbor, and
* the destination address must not be a multicast address
* [RFC 2461, sec 8.2]
*/
sip6 = mtod(m0, struct ip6_hdr *);
bzero(&src_sa, sizeof(src_sa));
src_sa.sin6_family = AF_INET6;
src_sa.sin6_len = sizeof(src_sa);
src_sa.sin6_addr = sip6->ip6_src;
if (nd6_is_addr_neighbor(&src_sa, ifp) == 0)
goto fail;
if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst))
goto fail; /* what should we do here? */
/* rate limit */
if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0))
goto fail;
/*
* Since we are going to append up to 1280 bytes (= IPV6_MMTU),
* we almost always ask for an mbuf cluster for simplicity.
* (MHLEN < IPV6_MMTU is almost always true)
*/
#if IPV6_MMTU >= MCLBYTES
# error assumption failed about IPV6_MMTU and MCLBYTES
#endif
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m && IPV6_MMTU >= MHLEN)
MCLGET(m, M_DONTWAIT);
if (!m)
goto fail;
m->m_pkthdr.rcvif = NULL;
m->m_len = 0;
maxlen = M_TRAILINGSPACE(m);
maxlen = min(IPV6_MMTU, maxlen);
/* just for safety */
if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) +
((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) {
goto fail;
}
{
/* get the ip6 linklocal address for ifp (my outgoing interface). */
struct in6_ifaddr *ia;
if ((ia = in6ifa_ifpforlinklocal(ifp,
IN6_IFF_NOTREADY|
IN6_IFF_ANYCAST)) == NULL)
goto fail;
ifp_ll6 = &ia->ia_addr.sin6_addr;
}
/* get ip6 linklocal address for the router. */
if (rt->rt_gateway && (rt->rt_flags & RTF_GATEWAY)) {
struct sockaddr_in6 *sin6;
sin6 = (struct sockaddr_in6 *)rt->rt_gateway;
router_ll6 = &sin6->sin6_addr;
if (!IN6_IS_ADDR_LINKLOCAL(router_ll6))
router_ll6 = (struct in6_addr *)NULL;
} else
router_ll6 = (struct in6_addr *)NULL;
/* ip6 */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6->ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
/* ip6->ip6_src must be linklocal addr for my outgoing if. */
bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
/* ND Redirect */
nd_rd = (struct nd_redirect *)(ip6 + 1);
nd_rd->nd_rd_type = ND_REDIRECT;
nd_rd->nd_rd_code = 0;
nd_rd->nd_rd_reserved = 0;
if (rt->rt_flags & RTF_GATEWAY) {
/*
* nd_rd->nd_rd_target must be a link-local address in
* better router cases.
*/
if (!router_ll6)
goto fail;
bcopy(router_ll6, &nd_rd->nd_rd_target,
sizeof(nd_rd->nd_rd_target));
bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
sizeof(nd_rd->nd_rd_dst));
} else {
/* make sure redtgt == reddst */
bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target,
sizeof(nd_rd->nd_rd_target));
bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
sizeof(nd_rd->nd_rd_dst));
}
p = (u_char *)(nd_rd + 1);
if (!router_ll6)
goto nolladdropt;
{
/* target lladdr option */
struct rtentry *rt_router = NULL;
int len;
struct sockaddr_dl *sdl;
struct nd_opt_hdr *nd_opt;
char *lladdr;
rt_router = nd6_lookup(router_ll6, 0, ifp);
if (!rt_router)
goto nolladdropt;
len = sizeof(*nd_opt) + ifp->if_addrlen;
len = (len + 7) & ~7; /* round by 8 */
/* safety check */
if (len + (p - (u_char *)ip6) > maxlen)
goto nolladdropt;
if (!(rt_router->rt_flags & RTF_GATEWAY) &&
(rt_router->rt_flags & RTF_LLINFO) &&
(rt_router->rt_gateway->sa_family == AF_LINK) &&
(sdl = (struct sockaddr_dl *)rt_router->rt_gateway) &&
sdl->sdl_alen) {
nd_opt = (struct nd_opt_hdr *)p;
nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
nd_opt->nd_opt_len = len >> 3;
lladdr = (char *)(nd_opt + 1);
bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen);
p += len;
}
}
nolladdropt:;
m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
/* just to be safe */
#ifdef M_DECRYPTED /*not openbsd*/
if (m0->m_flags & M_DECRYPTED)
goto noredhdropt;
#endif
if (p - (u_char *)ip6 > maxlen)
goto noredhdropt;
{
/* redirected header option */
int len;
struct nd_opt_rd_hdr *nd_opt_rh;
/*
* compute the maximum size for icmp6 redirect header option.
* XXX room for auth header?
*/
len = maxlen - (p - (u_char *)ip6);
len &= ~7;
/* This is just for simplicity. */
if (m0->m_pkthdr.len != m0->m_len) {
if (m0->m_next) {
m_freem(m0->m_next);
m0->m_next = NULL;
}
m0->m_pkthdr.len = m0->m_len;
}
/*
* The Redirected Header option spec (RFC2461 4.6.3) says nothing
* about the padding/truncation rule for the original IP packet.
* From the discussion on IPv6imp in Feb 1999,
* the consensus was:
* - "attach as much as possible" is the goal
* - pad if not aligned (original size can be guessed by
* original ip6 header)
* Following code adds the padding if it is simple enough,
* and truncates if not.
*/
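/*
 * Worked example: with a 1234-byte original packet and enough
 * option room, extra = 1234 % 8 = 2, so 6 bytes of padding are
 * appended when M_TRAILINGSPACE() allows; otherwise the 2 excess
 * bytes are truncated.
 */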
if (m0->m_next || m0->m_pkthdr.len != m0->m_len)
panic("assumption failed in %s:%d", __FILE__,
__LINE__);
if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) {
/* not enough room, truncate */
m0->m_pkthdr.len = m0->m_len = len -
sizeof(*nd_opt_rh);
} else {
/* enough room, pad or truncate */
size_t extra;
extra = m0->m_pkthdr.len % 8;
if (extra) {
/* pad if easy enough, truncate if not */
if (8 - extra <= M_TRAILINGSPACE(m0)) {
/* pad */
m0->m_len += (8 - extra);
m0->m_pkthdr.len += (8 - extra);
} else {
/* truncate */
m0->m_pkthdr.len -= extra;
m0->m_len -= extra;
}
}
len = m0->m_pkthdr.len + sizeof(*nd_opt_rh);
m0->m_pkthdr.len = m0->m_len = len -
sizeof(*nd_opt_rh);
}
nd_opt_rh = (struct nd_opt_rd_hdr *)p;
bzero(nd_opt_rh, sizeof(*nd_opt_rh));
nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER;
nd_opt_rh->nd_opt_rh_len = len >> 3;
p += sizeof(*nd_opt_rh);
m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
/* connect m0 to m */
m_tag_delete_chain(m0, NULL);
m0->m_flags &= ~M_PKTHDR;
m->m_next = m0;
m->m_pkthdr.len = m->m_len + m0->m_len;
m0 = NULL;
}
noredhdropt:;
if (m0) {
m_freem(m0);
m0 = NULL;
}
/* XXX: clear embedded link IDs in the inner header */
in6_clearscope(&sip6->ip6_src);
in6_clearscope(&sip6->ip6_dst);
in6_clearscope(&nd_rd->nd_rd_target);
in6_clearscope(&nd_rd->nd_rd_dst);
ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
nd_rd->nd_rd_cksum = 0;
nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6,
sizeof(*ip6), ntohs(ip6->ip6_plen));
/* send the packet to outside... */
ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
if (outif) {
icmp6_ifstat_inc(outif, ifs6_out_msg);
icmp6_ifstat_inc(outif, ifs6_out_redirect);
}
V_icmp6stat.icp6s_outhist[ND_REDIRECT]++;
return;
fail:
if (m)
m_freem(m);
if (m0)
m_freem(m0);
}
/*
* ICMPv6 socket option processing.
*/
int
icmp6_ctloutput(struct socket *so, struct sockopt *sopt)
{
int error = 0;
int optlen;
struct inpcb *inp = sotoinpcb(so);
int level, op, optname;
if (sopt) {
level = sopt->sopt_level;
op = sopt->sopt_dir;
optname = sopt->sopt_name;
optlen = sopt->sopt_valsize;
} else
level = op = optname = optlen = 0;
if (level != IPPROTO_ICMPV6) {
return EINVAL;
}
switch (op) {
case PRCO_SETOPT:
switch (optname) {
case ICMP6_FILTER:
{
struct icmp6_filter ic6f;
if (optlen != sizeof(ic6f)) {
error = EMSGSIZE;
break;
}
error = sooptcopyin(sopt, &ic6f, optlen, optlen);
if (error == 0) {
INP_WLOCK(inp);
*inp->in6p_icmp6filt = ic6f;
INP_WUNLOCK(inp);
}
break;
}
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
case ICMP6_FILTER:
{
struct icmp6_filter ic6f;
INP_RLOCK(inp);
ic6f = *inp->in6p_icmp6filt;
INP_RUNLOCK(inp);
error = sooptcopyout(sopt, &ic6f, sizeof(ic6f));
break;
}
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
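/*
 * Userland sketch (RFC 3542 API): a raw ICMPv6 socket typically
 * installs a filter along these lines, which reaches the
 * PRCO_SETOPT path above:
 *
 *	struct icmp6_filter flt;
 *	ICMP6_FILTER_SETBLOCKALL(&flt);
 *	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &flt);
 *	setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, &flt, sizeof(flt));
 */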
/*
* Perform rate limit check.
* Returns 0 if it is okay to send the icmp6 packet.
* Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate
* limitation.
*
* XXX per-destination/type check necessary?
*
* dst - not used at this moment
* type - not used at this moment
* code - not used at this moment
*/
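/*
 * The cap is V_icmp6errppslim (presumably the
 * net.inet6.icmp6.errppslimit sysctl); ppsratecheck() counts
 * errors over a one-second window against that packets-per-second
 * limit.
 */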
static int
icmp6_ratelimit(const struct in6_addr *dst, const int type,
const int code)
{
+ INIT_VNET_INET6(curvnet);
int ret;
ret = 0; /* okay to send */
/* PPS limit */
if (!ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count,
V_icmp6errppslim)) {
/* The packet is subject to rate limit */
ret++;
}
return ret;
}
Index: head/sys/netinet6/in6.c
===================================================================
--- head/sys/netinet6/in6.c (revision 183549)
+++ head/sys/netinet6/in6.c (revision 183550)
@@ -1,2320 +1,2330 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.2 (Berkeley) 11/15/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/if_dl.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_pcb.h>
MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "internet multicast address");
/*
* Definitions of some constant IP6 addresses.
*/
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
const struct in6_addr in6addr_nodelocal_allnodes =
IN6ADDR_NODELOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allnodes =
IN6ADDR_LINKLOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allrouters =
IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
const struct in6_addr in6mask0 = IN6MASK0;
const struct in6_addr in6mask32 = IN6MASK32;
const struct in6_addr in6mask64 = IN6MASK64;
const struct in6_addr in6mask96 = IN6MASK96;
const struct in6_addr in6mask128 = IN6MASK128;
const struct sockaddr_in6 sa6_any =
{ sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 };
static int in6_lifaddr_ioctl __P((struct socket *, u_long, caddr_t,
struct ifnet *, struct thread *));
static int in6_ifinit __P((struct ifnet *, struct in6_ifaddr *,
struct sockaddr_in6 *, int));
static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
struct in6_multihead in6_multihead; /* XXX BSS initialization */
int (*faithprefix_p)(struct in6_addr *);
/*
* Subroutine for in6_ifaddloop() and in6_ifremloop().
* This routine does the actual work.
*/
static void
in6_ifloop_request(int cmd, struct ifaddr *ifa)
{
struct sockaddr_in6 all1_sa;
struct rtentry *nrt = NULL;
int e;
char ip6buf[INET6_ADDRSTRLEN];
bzero(&all1_sa, sizeof(all1_sa));
all1_sa.sin6_family = AF_INET6;
all1_sa.sin6_len = sizeof(struct sockaddr_in6);
all1_sa.sin6_addr = in6mask128;
/*
* We specify the address itself as the gateway, and set the
* RTF_LLINFO flag, so that the corresponding host route would have
* the flag, and thus applications that assume traditional behavior
* would be happy. Note that we assume the caller of the function
* (probably implicitly) set nd6_rtrequest() to ifa->ifa_rtrequest,
* which changes the outgoing interface to the loopback interface.
*/
e = rtrequest(cmd, ifa->ifa_addr, ifa->ifa_addr,
(struct sockaddr *)&all1_sa, RTF_UP|RTF_HOST|RTF_LLINFO, &nrt);
if (e != 0) {
/* XXX need more descriptive message */
log(LOG_ERR, "in6_ifloop_request: "
"%s operation failed for %s (errno=%d)\n",
cmd == RTM_ADD ? "ADD" : "DELETE",
ip6_sprintf(ip6buf,
&((struct in6_ifaddr *)ifa)->ia_addr.sin6_addr), e);
}
/*
* Report the addition/removal of the address to the routing socket.
* XXX: since we called rtinit for a p2p interface with a destination,
* we end up reporting twice in such a case. Should we rather
* omit the second report?
*/
if (nrt) {
RT_LOCK(nrt);
/*
* Make sure rt_ifa is equal to IFA, the second argument of
* the function. We need this because when we refer to
* rt_ifa->ia6_flags in ip6_input, we assume that the rt_ifa
* points to the address instead of the loopback address.
*/
if (cmd == RTM_ADD && ifa != nrt->rt_ifa) {
IFAFREE(nrt->rt_ifa);
IFAREF(ifa);
nrt->rt_ifa = ifa;
}
rt_newaddrmsg(cmd, ifa, e, nrt);
if (cmd == RTM_DELETE)
RTFREE_LOCKED(nrt);
else {
/* the cmd must be RTM_ADD here */
RT_REMREF(nrt);
RT_UNLOCK(nrt);
}
}
}
/*
* Add ownaddr as a loopback rtentry. We previously added the route only
* when necessary (e.g. on a p2p link). However, since we now manage addresses
* separately from prefixes, we should always add the route. We can't
* rely on the cloning mechanism from the corresponding interface route
* any more.
*/
void
in6_ifaddloop(struct ifaddr *ifa)
{
struct rtentry *rt;
int need_loop;
/* If there is no loopback entry, allocate one. */
rt = rtalloc1(ifa->ifa_addr, 0, 0);
need_loop = (rt == NULL || (rt->rt_flags & RTF_HOST) == 0 ||
(rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0);
if (rt)
RTFREE_LOCKED(rt);
if (need_loop)
in6_ifloop_request(RTM_ADD, ifa);
}
/*
* Remove loopback rtentry of ownaddr generated by in6_ifaddloop(),
* if it exists.
*/
void
in6_ifremloop(struct ifaddr *ifa)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia;
struct rtentry *rt;
int ia_count = 0;
/*
* Some BSD variants do not remove cloned routes
* from an interface direct route, when removing the direct route
* (see comments in net/net_osdep.h). Even for variants that do remove
* cloned routes, they could fail to remove the cloned routes when
* we handle multiple addresses that share a common prefix.
* So, we should remove the route corresponding to the deleted address.
*/
/*
* Delete the entry only if exactly one ifa exists. More than one ifa
* can exist if we assign the same single address to multiple
* (probably p2p) interfaces.
* XXX: we should avoid such a configuration in IPv6...
*/
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr)) {
ia_count++;
if (ia_count > 1)
break;
}
}
if (ia_count == 1) {
/*
* Before deleting, check that a corresponding loopbacked host
* route really exists. With this check, we can avoid deleting
* an interface direct route whose destination is the same
* as the address being removed. This can happen when removing
* a subnet-router anycast address on an interface attached
* to a shared medium.
*/
rt = rtalloc1(ifa->ifa_addr, 0, 0);
if (rt != NULL) {
if ((rt->rt_flags & RTF_HOST) != 0 &&
(rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
RTFREE_LOCKED(rt);
in6_ifloop_request(RTM_DELETE, ifa);
} else
RT_UNLOCK(rt);
}
}
}
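/*
 * Convert a contiguous IPv6 netmask into a prefix length, e.g.
 * ffff:ffff:ffff:ff80:: yields 57; a non-contiguous mask yields -1.
 */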
int
in6_mask2len(struct in6_addr *mask, u_char *lim0)
{
int x = 0, y;
u_char *lim = lim0, *p;
/* ignore the scope_id part */
if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
lim = (u_char *)mask + sizeof(*mask);
for (p = (u_char *)mask; p < lim; x++, p++) {
if (*p != 0xff)
break;
}
y = 0;
if (p < lim) {
for (y = 0; y < 8; y++) {
if ((*p & (0x80 >> y)) == 0)
break;
}
}
/*
* when the limit pointer is given, do a stricter check on the
* remaining bits.
*/
if (p < lim) {
if (y != 0 && (*p & (0x00ff >> y)) != 0)
return (-1);
for (p = p + 1; p < lim; p++)
if (*p != 0)
return (-1);
}
return x * 8 + y;
}
#define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa))
#define ia62ifa(ia6) (&((ia6)->ia_ifa))
int
in6_control(struct socket *so, u_long cmd, caddr_t data,
struct ifnet *ifp, struct thread *td)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifreq *ifr = (struct in6_ifreq *)data;
struct in6_ifaddr *ia = NULL;
struct in6_aliasreq *ifra = (struct in6_aliasreq *)data;
struct sockaddr_in6 *sa6;
int error;
switch (cmd) {
case SIOCGETSGCNT_IN6:
case SIOCGETMIFCNT_IN6:
return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP);
}
switch(cmd) {
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
if (td != NULL) {
error = priv_check(td, PRIV_NETINET_ADDRCTRL6);
if (error)
return (error);
}
return (in6_src_ioctl(cmd, data));
}
if (ifp == NULL)
return (EOPNOTSUPP);
switch (cmd) {
case SIOCSNDFLUSH_IN6:
case SIOCSPFXFLUSH_IN6:
case SIOCSRTRFLUSH_IN6:
case SIOCSDEFIFACE_IN6:
case SIOCSIFINFO_FLAGS:
if (td != NULL) {
error = priv_check(td, PRIV_NETINET_ND6);
if (error)
return (error);
}
/* FALLTHROUGH */
case OSIOCGIFINFO_IN6:
case SIOCGIFINFO_IN6:
case SIOCSIFINFO_IN6:
case SIOCGDRLST_IN6:
case SIOCGPRLST_IN6:
case SIOCGNBRINFO_IN6:
case SIOCGDEFIFACE_IN6:
return (nd6_ioctl(cmd, data, ifp));
}
switch (cmd) {
case SIOCSIFPREFIX_IN6:
case SIOCDIFPREFIX_IN6:
case SIOCAIFPREFIX_IN6:
case SIOCCIFPREFIX_IN6:
case SIOCSGIFPREFIX_IN6:
case SIOCGIFPREFIX_IN6:
log(LOG_NOTICE,
"prefix ioctls are now invalidated. "
"please use ifconfig.\n");
return (EOPNOTSUPP);
}
switch (cmd) {
case SIOCSSCOPE6:
if (td != NULL) {
error = priv_check(td, PRIV_NETINET_SCOPE6);
if (error)
return (error);
}
return (scope6_set(ifp,
(struct scope6_id *)ifr->ifr_ifru.ifru_scope_id));
case SIOCGSCOPE6:
return (scope6_get(ifp,
(struct scope6_id *)ifr->ifr_ifru.ifru_scope_id));
case SIOCGSCOPE6DEF:
return (scope6_get_default((struct scope6_id *)
ifr->ifr_ifru.ifru_scope_id));
}
switch (cmd) {
case SIOCALIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
return in6_lifaddr_ioctl(so, cmd, data, ifp, td);
case SIOCDLIFADDR:
if (td != NULL) {
error = priv_check(td, PRIV_NET_DELIFADDR);
if (error)
return (error);
}
/* FALLTHROUGH */
case SIOCGLIFADDR:
return in6_lifaddr_ioctl(so, cmd, data, ifp, td);
}
/*
* Find address for this interface, if it exists.
*
* In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
* only, and used the first interface address as the target of other
* operations (without checking ifra_addr). This was because netinet
* code/API assumed at most 1 interface address per interface.
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, we almost always look and check the
* presence of ifra_addr, and reject invalid ones here.
* It also decreases duplicated code among SIOC*_IN6 operations.
*/
switch (cmd) {
case SIOCAIFADDR_IN6:
case SIOCSIFPHYADDR_IN6:
sa6 = &ifra->ifra_addr;
break;
case SIOCSIFADDR_IN6:
case SIOCGIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCDIFADDR_IN6:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCSNDFLUSH_IN6:
case SIOCSPFXFLUSH_IN6:
case SIOCSRTRFLUSH_IN6:
case SIOCGIFALIFETIME_IN6:
case SIOCSIFALIFETIME_IN6:
case SIOCGIFSTAT_IN6:
case SIOCGIFSTAT_ICMP6:
sa6 = &ifr->ifr_addr;
break;
default:
sa6 = NULL;
break;
}
if (sa6 && sa6->sin6_family == AF_INET6) {
int error = 0;
if (sa6->sin6_scope_id != 0)
error = sa6_embedscope(sa6, 0);
else
error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
if (error != 0)
return (error);
ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
} else
ia = NULL;
switch (cmd) {
case SIOCSIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
/*
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, SIOCSIFxxx ioctls are deprecated.
*/
/* we decided to obsolete this command (20000704) */
return (EINVAL);
case SIOCDIFADDR_IN6:
/*
* For IPv4, we look for an existing in_ifaddr here to allow
* "ifconfig if0 delete" to remove the first IPv4 address on
* the interface. For IPv6, since the spec has allowed multiple
* addresses per interface from day one, we consider "remove the
* first one" semantics undesirable.
*/
if (ia == NULL)
return (EADDRNOTAVAIL);
/* FALLTHROUGH */
case SIOCAIFADDR_IN6:
/*
* We always require users to specify a valid IPv6 address for
* the corresponding operation.
*/
if (ifra->ifra_addr.sin6_family != AF_INET6 ||
ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6))
return (EAFNOSUPPORT);
if (td != NULL) {
error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ?
PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR);
if (error)
return (error);
}
break;
case SIOCGIFADDR_IN6:
/* This interface is basically deprecated. Use SIOCGIFCONF. */
/* FALLTHROUGH */
case SIOCGIFAFLAG_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFALIFETIME_IN6:
/* must think again about its semantics */
if (ia == NULL)
return (EADDRNOTAVAIL);
break;
case SIOCSIFALIFETIME_IN6:
{
struct in6_addrlifetime *lt;
if (td != NULL) {
error = priv_check(td, PRIV_NETINET_ALIFETIME6);
if (error)
return (error);
}
if (ia == NULL)
return (EADDRNOTAVAIL);
/* sanity for overflow - beware unsigned */
lt = &ifr->ifr_ifru.ifru_lifetime;
if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME &&
lt->ia6t_vltime + time_second < time_second) {
return EINVAL;
}
if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME &&
lt->ia6t_pltime + time_second < time_second) {
return EINVAL;
}
break;
}
}
switch (cmd) {
case SIOCGIFADDR_IN6:
ifr->ifr_addr = ia->ia_addr;
if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0)
return (error);
break;
case SIOCGIFDSTADDR_IN6:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
return (EINVAL);
/*
* XXX: should we check if ifa_dstaddr is NULL and return
* an error?
*/
ifr->ifr_dstaddr = ia->ia_dstaddr;
if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0)
return (error);
break;
case SIOCGIFNETMASK_IN6:
ifr->ifr_addr = ia->ia_prefixmask;
break;
case SIOCGIFAFLAG_IN6:
ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
break;
case SIOCGIFSTAT_IN6:
if (ifp == NULL)
return EINVAL;
bzero(&ifr->ifr_ifru.ifru_stat,
sizeof(ifr->ifr_ifru.ifru_stat));
ifr->ifr_ifru.ifru_stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->in6_ifstat;
break;
case SIOCGIFSTAT_ICMP6:
if (ifp == NULL)
return EINVAL;
bzero(&ifr->ifr_ifru.ifru_icmp6stat,
sizeof(ifr->ifr_ifru.ifru_icmp6stat));
ifr->ifr_ifru.ifru_icmp6stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->icmp6_ifstat;
break;
case SIOCGIFALIFETIME_IN6:
ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = (-1) &
~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
if (ia->ia6_lifetime.ia6t_vltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_expire = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_vltime;
} else
retlt->ia6t_expire = maxexpire;
}
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = (-1) &
~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
if (ia->ia6_lifetime.ia6t_pltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_preferred = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_pltime;
} else
retlt->ia6t_preferred = maxexpire;
}
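/*
* A worked example of the clamping above, assuming a signed 64-bit
* time_t: maxexpire evaluates to 0x7fffffffffffffff (the largest
* positive time_t), so ia6t_expire and ia6t_preferred are computed as
* updatetime + lifetime only when that sum cannot overflow, and are
* pinned to maxexpire otherwise.
*/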
break;
case SIOCSIFALIFETIME_IN6:
ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime;
/* for sanity */
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_expire =
time_second + ia->ia6_lifetime.ia6t_vltime;
} else
ia->ia6_lifetime.ia6t_expire = 0;
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_preferred =
time_second + ia->ia6_lifetime.ia6t_pltime;
} else
ia->ia6_lifetime.ia6t_preferred = 0;
break;
case SIOCAIFADDR_IN6:
{
int i, error = 0;
struct nd_prefixctl pr0;
struct nd_prefix *pr;
/*
* first, make or update the interface address structure,
* and link it to the list.
*/
if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0)
return (error);
if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr))
== NULL) {
/*
* This can happen when the user specifies a valid
* lifetime of 0.
*/
break;
}
/*
* then, make the prefix on-link on the interface.
* XXX: we'd rather create the prefix before the address, but
* we need at least one address to install the corresponding
* interface route, so we configure the address first.
*/
/*
* Convert the mask to a prefix length (the prefix mask has
* already been validated in in6_update_ifa()).
*/
bzero(&pr0, sizeof(pr0));
pr0.ndpr_ifp = ifp;
pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
NULL);
if (pr0.ndpr_plen == 128) {
break; /* we don't need to install a host route. */
}
pr0.ndpr_prefix = ifra->ifra_addr;
/* apply the mask for safety. */
for (i = 0; i < 4; i++) {
pr0.ndpr_prefix.sin6_addr.s6_addr32[i] &=
ifra->ifra_prefixmask.sin6_addr.s6_addr32[i];
}
/*
* XXX: since we don't have an API to set prefix (not address)
* lifetimes, we just use the same lifetimes as the addresses.
* The (temporarily) installed lifetimes can be overridden by
* later advertised RAs (when accept_rtadv is non-zero), which is
* intended behavior.
*/
pr0.ndpr_raf_onlink = 1; /* should be configurable? */
pr0.ndpr_raf_auto =
((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0);
pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime;
pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime;
/* add the prefix if not yet. */
if ((pr = nd6_prefix_lookup(&pr0)) == NULL) {
/*
* nd6_prelist_add will install the corresponding
* interface route.
*/
if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0)
return (error);
if (pr == NULL) {
log(LOG_ERR, "nd6_prelist_add succeeded but "
"no prefix\n");
return (EINVAL); /* XXX panic here? */
}
}
/* relate the address to the prefix */
if (ia->ia6_ndpr == NULL) {
ia->ia6_ndpr = pr;
pr->ndpr_refcnt++;
/*
* If this is the first autoconf address from the
* prefix, create a temporary address as well
* (when required).
*/
if ((ia->ia6_flags & IN6_IFF_AUTOCONF) &&
V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) {
int e;
if ((e = in6_tmpifadd(ia, 1, 0)) != 0) {
log(LOG_NOTICE, "in6_control: failed "
"to create a temporary address, "
"errno=%d\n", e);
}
}
}
/*
* this might affect the status of autoconfigured addresses,
* that is, this address might make other addresses detached.
*/
pfxlist_onlink_check();
if (error == 0 && ia)
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
break;
}
case SIOCDIFADDR_IN6:
{
struct nd_prefix *pr;
/*
* If the address being deleted is the only one that owns
* the corresponding prefix, expire the prefix as well.
* XXX: theoretically, we don't have to worry about such
* relationship, since we separate the address management
* and the prefix management. We do this, however, to provide
* as much backward compatibility as possible in terms of
* the ioctl operation.
* Note that in6_purgeaddr() will decrement ndpr_refcnt.
*/
pr = ia->ia6_ndpr;
in6_purgeaddr(&ia->ia_ifa);
if (pr && pr->ndpr_refcnt == 0)
prelist_remove(pr);
EVENTHANDLER_INVOKE(ifaddr_event, ifp);
break;
}
default:
if (ifp == NULL || ifp->if_ioctl == 0)
return (EOPNOTSUPP);
return ((*ifp->if_ioctl)(ifp, cmd, data));
}
return (0);
}
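/*
* A minimal usage sketch, with a hypothetical interface name and
* address: userland reaches in6_control() through ioctl(2) on an
* AF_INET6 socket, e.g. to query per-address flags:
*
*	struct in6_ifreq ifr;
*	int s = socket(AF_INET6, SOCK_DGRAM, 0);
*
*	memset(&ifr, 0, sizeof(ifr));
*	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
*	ifr.ifr_addr = addr;	(the sockaddr_in6 to look up)
*	if (ioctl(s, SIOCGIFAFLAG_IN6, &ifr) == 0)
*		printf("flags6 = %x\n", ifr.ifr_ifru.ifru_flags6);
*/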
/*
* Update parameters of an IPv6 interface address.
* If necessary, a new entry is created and linked into address chains.
* This function is separated from in6_control().
* XXX: should this be performed under splnet()?
*/
int
in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
struct in6_ifaddr *ia, int flags)
{
+ INIT_VNET_INET6(ifp->if_vnet);
+ INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */
int error = 0, hostIsNew = 0, plen = -1;
struct in6_ifaddr *oia;
struct sockaddr_in6 dst6;
struct in6_addrlifetime *lt;
struct in6_multi_mship *imm;
struct in6_multi *in6m_sol;
struct rtentry *rt;
int delay;
char ip6buf[INET6_ADDRSTRLEN];
/* Validate parameters */
if (ifp == NULL || ifra == NULL) /* this may be redundant */
return (EINVAL);
/*
* The destination address for a p2p link must have a family
* of AF_UNSPEC or AF_INET6.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
return (EAFNOSUPPORT);
/*
* Validate ifra_prefixmask. Don't check sin6_family; a netmask
* does not carry meaningful fields other than sin6_len.
*/
if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
return (EINVAL);
/*
* Because the IPv6 address architecture is classless, we require
* users to specify a non-zero prefix length (mask) for a new address.
* We also require that the prefix mask, when specified, is valid, and
* thus reject a non-contiguous mask.
*/
if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
return (EINVAL);
if (ifra->ifra_prefixmask.sin6_len != 0) {
plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
(u_char *)&ifra->ifra_prefixmask +
ifra->ifra_prefixmask.sin6_len);
if (plen <= 0)
return (EINVAL);
} else {
/*
* In this case, ia must not be NULL. We just use its prefix
* length.
*/
plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
}
/*
* If the destination address on a p2p interface is specified,
* and the address is a scoped one, validate/set the scope
* zone identifier.
*/
dst6 = ifra->ifra_dstaddr;
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
(dst6.sin6_family == AF_INET6)) {
struct in6_addr in6_tmp;
u_int32_t zoneid;
in6_tmp = dst6.sin6_addr;
if (in6_setscope(&in6_tmp, ifp, &zoneid))
return (EINVAL); /* XXX: should be impossible */
if (dst6.sin6_scope_id != 0) {
if (dst6.sin6_scope_id != zoneid)
return (EINVAL);
} else /* the user omitted the ID. */
dst6.sin6_scope_id = zoneid;
/* convert into the internal form */
if (sa6_embedscope(&dst6, 0))
return (EINVAL); /* XXX: should be impossible */
}
/*
* The destination address can be specified only for a p2p or a
* loopback interface. If specified, the corresponding prefix length
* must be 128.
*/
if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
/* XXX: noisy message */
nd6log((LOG_INFO, "in6_update_ifa: a destination can "
"be specified for a p2p or a loopback IF only\n"));
return (EINVAL);
}
if (plen != 128) {
nd6log((LOG_INFO, "in6_update_ifa: prefixlen should "
"be 128 when dstaddr is specified\n"));
return (EINVAL);
}
}
/* lifetime consistency check */
lt = &ifra->ifra_lifetime;
if (lt->ia6t_pltime > lt->ia6t_vltime)
return (EINVAL);
if (lt->ia6t_vltime == 0) {
/*
* the following log might be noisy, but this is a typical
* configuration mistake or a tool's bug.
*/
nd6log((LOG_INFO,
"in6_update_ifa: valid lifetime is 0 for %s\n",
ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr)));
if (ia == NULL)
return (0); /* there's nothing to do */
}
/*
* If this is a new address, allocate a new ifaddr and link it
* into chains.
*/
if (ia == NULL) {
hostIsNew = 1;
/*
* When in6_update_ifa() is called while processing a received
* RA, it runs in interrupt context, so we must call malloc()
* with M_NOWAIT.
*/
ia = (struct in6_ifaddr *) malloc(sizeof(*ia), M_IFADDR,
M_NOWAIT);
if (ia == NULL)
return (ENOBUFS);
bzero((caddr_t)ia, sizeof(*ia));
LIST_INIT(&ia->ia6_memberships);
/* Initialize the address and masks, and put time stamp */
IFA_LOCK_INIT(&ia->ia_ifa);
ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
ia->ia_addr.sin6_family = AF_INET6;
ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
ia->ia6_createtime = time_second;
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
/*
* XXX: some functions expect that ifa_dstaddr is not
* NULL for p2p interfaces.
*/
ia->ia_ifa.ifa_dstaddr =
(struct sockaddr *)&ia->ia_dstaddr;
} else {
ia->ia_ifa.ifa_dstaddr = NULL;
}
ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask;
ia->ia_ifp = ifp;
if ((oia = V_in6_ifaddr) != NULL) {
for ( ; oia->ia_next; oia = oia->ia_next)
continue;
oia->ia_next = ia;
} else
V_in6_ifaddr = ia;
ia->ia_ifa.ifa_refcnt = 1;
TAILQ_INSERT_TAIL(&ifp->if_addrlist, &ia->ia_ifa, ifa_list);
}
/* update timestamp */
ia->ia6_updatetime = time_second;
/* set prefix mask */
if (ifra->ifra_prefixmask.sin6_len) {
/*
* We prohibit changing the prefix length of an existing
* address, because
* + such an operation should be rare in IPv6, and
* + the operation would confuse prefix management.
*/
if (ia->ia_prefixmask.sin6_len &&
in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) {
nd6log((LOG_INFO, "in6_update_ifa: the prefix length of an"
" existing (%s) address should not be changed\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
error = EINVAL;
goto unlink;
}
ia->ia_prefixmask = ifra->ifra_prefixmask;
}
/*
* If a new destination address is specified, scrub the old one and
* install the new destination. Note that the interface must be
* p2p or loopback (see the check above).
*/
if (dst6.sin6_family == AF_INET6 &&
!IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, &ia->ia_dstaddr.sin6_addr)) {
int e;
if ((ia->ia_flags & IFA_ROUTE) != 0 &&
(e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST)) != 0) {
nd6log((LOG_ERR, "in6_update_ifa: failed to remove "
"a route to the old destination: %s\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
/* proceed anyway... */
} else
ia->ia_flags &= ~IFA_ROUTE;
ia->ia_dstaddr = dst6;
}
/*
* Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred
* to see if the address is deprecated or invalidated, but initialize
* these members for applications.
*/
ia->ia6_lifetime = ifra->ifra_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_expire =
time_second + ia->ia6_lifetime.ia6t_vltime;
} else
ia->ia6_lifetime.ia6t_expire = 0;
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_preferred =
time_second + ia->ia6_lifetime.ia6t_pltime;
} else
ia->ia6_lifetime.ia6t_preferred = 0;
/* reset the interface and routing table appropriately. */
if ((error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew)) != 0)
goto unlink;
/*
* configure address flags.
*/
ia->ia6_flags = ifra->ifra_flags;
/*
* backward compatibility - if IN6_IFF_DEPRECATED is set from the
* userland, make it deprecated.
*/
if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
ia->ia6_lifetime.ia6t_pltime = 0;
ia->ia6_lifetime.ia6t_preferred = time_second;
}
/*
* Make the address tentative before joining multicast addresses,
* so that corresponding MLD responses would not have a tentative
* source address.
*/
ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */
if (hostIsNew && in6if_do_dad(ifp))
ia->ia6_flags |= IN6_IFF_TENTATIVE;
/*
* We are done if we have simply modified an existing address.
*/
if (!hostIsNew)
return (error);
/*
* Beyond this point, we should call in6_purgeaddr upon an error,
* not just go to unlink.
*/
/* Join necessary multicast groups */
in6m_sol = NULL;
if ((ifp->if_flags & IFF_MULTICAST) != 0) {
struct sockaddr_in6 mltaddr, mltmask;
struct in6_addr llsol;
/* join solicited multicast addr for new host id */
bzero(&llsol, sizeof(struct in6_addr));
llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL;
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3];
llsol.s6_addr8[12] = 0xff;
if ((error = in6_setscope(&llsol, ifp, NULL)) != 0) {
/* XXX: should not happen */
log(LOG_ERR, "in6_update_ifa: "
"in6_setscope failed\n");
goto cleanup;
}
delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* We need a random delay for DAD on the address
* being configured. It also means delaying
* transmission of the corresponding MLD report to
* avoid report collision.
* [draft-ietf-ipv6-rfc2462bis-02.txt]
*/
delay = arc4random() %
(MAX_RTR_SOLICITATION_DELAY * hz);
}
imm = in6_joingroup(ifp, &llsol, &error, delay);
if (imm == NULL) {
nd6log((LOG_WARNING,
"in6_update_ifa: addmulti failed for "
"%s on %s (errno=%d)\n",
ip6_sprintf(ip6buf, &llsol), if_name(ifp),
error));
in6_purgeaddr((struct ifaddr *)ia);
return (error);
}
LIST_INSERT_HEAD(&ia->ia6_memberships,
imm, i6mm_chain);
in6m_sol = imm->i6mm_maddr;
bzero(&mltmask, sizeof(mltmask));
mltmask.sin6_len = sizeof(struct sockaddr_in6);
mltmask.sin6_family = AF_INET6;
mltmask.sin6_addr = in6mask32;
#define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */
/*
* join link-local all-nodes address
*/
bzero(&mltaddr, sizeof(mltaddr));
mltaddr.sin6_len = sizeof(struct sockaddr_in6);
mltaddr.sin6_family = AF_INET6;
mltaddr.sin6_addr = in6addr_linklocal_allnodes;
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) !=
0)
goto cleanup; /* XXX: should not fail */
/*
* XXX: do we really need these automatic routes?
* We should probably reconsider this stuff. Most applications
* actually do not need the routes, since they usually specify
* the outgoing interface.
*/
rt = rtalloc1((struct sockaddr *)&mltaddr, 0, 0UL);
if (rt) {
/* XXX: only works in !SCOPEDROUTING case. */
if (memcmp(&mltaddr.sin6_addr,
&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr,
MLTMASK_LEN)) {
RTFREE_LOCKED(rt);
rt = NULL;
}
}
if (!rt) {
/* XXX: we need RTF_CLONING to fake nd6_rtrequest */
error = rtrequest(RTM_ADD, (struct sockaddr *)&mltaddr,
(struct sockaddr *)&ia->ia_addr,
(struct sockaddr *)&mltmask, RTF_UP | RTF_CLONING,
(struct rtentry **)0);
if (error)
goto cleanup;
} else {
RTFREE_LOCKED(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log((LOG_WARNING,
"in6_update_ifa: addmulti failed for "
"%s on %s (errno=%d)\n",
ip6_sprintf(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error));
goto cleanup;
}
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
/*
* join node information group address
*/
#define hostnamelen strlen(V_hostname)
delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* The spec doesn't say anything about delay for this
* group, but the same logic should apply.
*/
delay = arc4random() %
(MAX_RTR_SOLICITATION_DELAY * hz);
}
mtx_lock(&hostname_mtx);
if (in6_nigroup(ifp, V_hostname, hostnamelen,
&mltaddr.sin6_addr) == 0) {
mtx_unlock(&hostname_mtx);
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error,
delay); /* XXX jinmei */
if (!imm) {
nd6log((LOG_WARNING, "in6_update_ifa: "
"addmulti failed for %s on %s "
"(errno=%d)\n",
ip6_sprintf(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error));
/* XXX not very fatal, go on... */
} else {
LIST_INSERT_HEAD(&ia->ia6_memberships,
imm, i6mm_chain);
}
} else
mtx_unlock(&hostname_mtx);
#undef hostnamelen
/*
* join interface-local all-nodes address.
* (ff01::1%ifN, and ff01::%ifN/32)
*/
mltaddr.sin6_addr = in6addr_nodelocal_allnodes;
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL))
!= 0)
goto cleanup; /* XXX: should not fail */
/* XXX: again, do we really need the route? */
rt = rtalloc1((struct sockaddr *)&mltaddr, 0, 0UL);
if (rt) {
if (memcmp(&mltaddr.sin6_addr,
&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr,
MLTMASK_LEN)) {
RTFREE_LOCKED(rt);
rt = NULL;
}
}
if (!rt) {
error = rtrequest(RTM_ADD, (struct sockaddr *)&mltaddr,
(struct sockaddr *)&ia->ia_addr,
(struct sockaddr *)&mltmask, RTF_UP | RTF_CLONING,
(struct rtentry **)0);
if (error)
goto cleanup;
} else
RTFREE_LOCKED(rt);
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log((LOG_WARNING, "in6_update_ifa: "
"addmulti failed for %s on %s "
"(errno=%d)\n",
ip6_sprintf(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error));
goto cleanup;
}
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
#undef MLTMASK_LEN
}
/*
* Perform DAD, if needed.
* XXX It may be of use, if we can administratively
* disable DAD.
*/
if (hostIsNew && in6if_do_dad(ifp) &&
((ifra->ifra_flags & IN6_IFF_NODAD) == 0) &&
(ia->ia6_flags & IN6_IFF_TENTATIVE))
{
int mindelay, maxdelay;
delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* We need to impose a delay before sending an NS
* for DAD. Check if we also needed a delay for the
* corresponding MLD message. If we did, the delay
* should be larger than the MLD delay (this could be
* relaxed a bit, but this simple logic is at least
* safe).
*/
mindelay = 0;
if (in6m_sol != NULL &&
in6m_sol->in6m_state == MLD_REPORTPENDING) {
mindelay = in6m_sol->in6m_timer;
}
maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
if (maxdelay - mindelay == 0)
delay = 0;
else {
delay =
(arc4random() % (maxdelay - mindelay)) +
mindelay;
}
}
nd6_dad_start((struct ifaddr *)ia, delay);
}
return (error);
unlink:
/*
* XXX: if a change of an existing address failed, keep the entry
* anyway.
*/
if (hostIsNew)
in6_unlink_ifa(ia, ifp);
return (error);
cleanup:
in6_purgeaddr(&ia->ia_ifa);
return error;
}
void
in6_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
char ip6buf[INET6_ADDRSTRLEN];
struct in6_multi_mship *imm;
/* stop DAD processing */
nd6_dad_stop(ifa);
/*
* delete route to the destination of the address being purged.
* The interface must be p2p or loopback in this case.
*/
if ((ia->ia_flags & IFA_ROUTE) != 0 && ia->ia_dstaddr.sin6_len != 0) {
int e;
if ((e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST))
!= 0) {
log(LOG_ERR, "in6_purgeaddr: failed to remove "
"a route to the p2p destination: %s on %s, "
"errno=%d\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
if_name(ifp), e);
/* proceed anyway... */
} else
ia->ia_flags &= ~IFA_ROUTE;
}
/* Remove ownaddr's loopback rtentry, if it exists. */
in6_ifremloop(&(ia->ia_ifa));
/*
* Leave the multicast groups we have joined on the interface.
*/
while ((imm = ia->ia6_memberships.lh_first) != NULL) {
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
in6_unlink_ifa(ia, ifp);
}
static void
in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct in6_ifaddr *oia;
int s = splnet();
TAILQ_REMOVE(&ifp->if_addrlist, &ia->ia_ifa, ifa_list);
oia = ia;
if (oia == (ia = V_in6_ifaddr))
V_in6_ifaddr = ia->ia_next;
else {
while (ia->ia_next && (ia->ia_next != oia))
ia = ia->ia_next;
if (ia->ia_next)
ia->ia_next = oia->ia_next;
else {
/* search failed */
printf("Couldn't unlink in6_ifaddr from in6_ifaddr\n");
}
}
/*
* Release the reference to the base prefix. There should be a
* positive reference.
*/
if (oia->ia6_ndpr == NULL) {
nd6log((LOG_NOTICE,
"in6_unlink_ifa: autoconf'ed address "
"%p has no prefix\n", oia));
} else {
oia->ia6_ndpr->ndpr_refcnt--;
oia->ia6_ndpr = NULL;
}
/*
* Also, if the address being removed is autoconf'ed, call
* pfxlist_onlink_check() since the release might affect the status of
* other (detached) addresses.
*/
if ((oia->ia6_flags & IN6_IFF_AUTOCONF)) {
pfxlist_onlink_check();
}
/*
* Release another refcnt for the link from the in6_ifaddr list.
* Note that we should decrement the refcnt at least once on all *BSDs.
*/
IFAFREE(&oia->ia_ifa);
splx(s);
}
void
in6_purgeif(struct ifnet *ifp)
{
struct ifaddr *ifa, *nifa;
for (ifa = TAILQ_FIRST(&ifp->if_addrlist); ifa != NULL; ifa = nifa) {
nifa = TAILQ_NEXT(ifa, ifa_list);
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
in6_purgeaddr(ifa);
}
in6_ifdetach(ifp);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* add the specified prefix, filling hostid part from
* the first link-local address. prefixlen must be <= 64.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL when the prefix match fails or the specified address
* is not found
* other values may be returned from in6_ioctl()
*
* NOTE: SIOCALIFADDR (with IFLR_PREFIX set) allows a prefixlen of less
* than 64. This is to accommodate address naming schemes other than
* RFC2374 in the future.
* RFC2373 defines the interface ID to be 64 bits, but it allows
* non-RFC2374 address encoding schemes. (see figure on page 8)
*/
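/*
* An example with assumed values: with IFLR_PREFIX set, prefixlen = 64
* and addr = 2001:db8:1:2:: on an interface whose first link-local
* address is fe80::211:22ff:fe33:4455, SIOCALIFADDR installs
* 2001:db8:1:2:211:22ff:fe33:4455/64; the hostid (lower 64 bits) is
* taken from the link-local address, which is why the hostid part of
* the given address must be zero.
*/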
static int
in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
struct ifnet *ifp, struct thread *td)
{
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
struct sockaddr *sa;
/* sanity checks */
if (!data || !ifp) {
panic("invalid argument to in6_lifaddr_ioctl");
/* NOTREACHED */
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/* FALLTHROUGH */
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
sa = (struct sockaddr *)&iflr->addr;
if (sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
/* XXX need improvement */
sa = (struct sockaddr *)&iflr->dstaddr;
if (sa->sa_family && sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len && sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
break;
default: /* shouldn't happen */
#if 0
panic("invalid cmd to in6_lifaddr_ioctl");
/* NOTREACHED */
#else
return EOPNOTSUPP;
#endif
}
if (sizeof(struct in6_addr) * 8 < iflr->prefixlen)
return EINVAL;
switch (cmd) {
case SIOCALIFADDR:
{
struct in6_aliasreq ifra;
struct in6_addr *hostid = NULL;
int prefixlen;
if ((iflr->flags & IFLR_PREFIX) != 0) {
struct sockaddr_in6 *sin6;
/*
* hostid is used to fill in the hostid part of the
* address; it points to the first link-local
* address attached to the interface.
*/
ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0);
if (!ifa)
return EADDRNOTAVAIL;
hostid = IFA_IN6(ifa);
/* prefixlen must be <= 64. */
if (64 < iflr->prefixlen)
return EINVAL;
prefixlen = iflr->prefixlen;
/* hostid part must be zero. */
sin6 = (struct sockaddr_in6 *)&iflr->addr;
if (sin6->sin6_addr.s6_addr32[2] != 0 ||
sin6->sin6_addr.s6_addr32[3] != 0) {
return EINVAL;
}
} else
prefixlen = iflr->prefixlen;
/* copy args to in6_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name));
bcopy(&iflr->addr, &ifra.ifra_addr,
((struct sockaddr *)&iflr->addr)->sa_len);
if (hostid) {
/* fill in hostid part */
ifra.ifra_addr.sin6_addr.s6_addr32[2] =
hostid->s6_addr32[2];
ifra.ifra_addr.sin6_addr.s6_addr32[3] =
hostid->s6_addr32[3];
}
if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /* XXX */
bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
((struct sockaddr *)&iflr->dstaddr)->sa_len);
if (hostid) {
ifra.ifra_dstaddr.sin6_addr.s6_addr32[2] =
hostid->s6_addr32[2];
ifra.ifra_dstaddr.sin6_addr.s6_addr32[3] =
hostid->s6_addr32[3];
}
}
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen);
ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX;
return in6_control(so, SIOCAIFADDR_IN6, (caddr_t)&ifra, ifp, td);
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in6_ifaddr *ia;
struct in6_addr mask, candidate, match;
struct sockaddr_in6 *sin6;
int cmp;
bzero(&mask, sizeof(mask));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in6_prefixlen2mask(&mask, iflr->prefixlen);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
bcopy(&sin6->sin6_addr, &match, sizeof(match));
match.s6_addr32[0] &= mask.s6_addr32[0];
match.s6_addr32[1] &= mask.s6_addr32[1];
match.s6_addr32[2] &= mask.s6_addr32[2];
match.s6_addr32[3] &= mask.s6_addr32[3];
/* if you set extra bits, that's wrong */
if (bcmp(&match, &sin6->sin6_addr, sizeof(match)))
return EINVAL;
cmp = 1;
} else {
if (cmd == SIOCGLIFADDR) {
/* on getting an address, take the 1st match */
cmp = 0; /* XXX */
} else {
/* on deleting an address, do exact match */
in6_prefixlen2mask(&mask, 128);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
bcopy(&sin6->sin6_addr, &match, sizeof(match));
cmp = 1;
}
}
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!cmp)
break;
/*
* XXX: this is ad hoc, but is necessary to allow
* a user to specify fe80::/64 (not /10) for a
* link-local address.
*/
bcopy(IFA_IN6(ifa), &candidate, sizeof(candidate));
in6_clearscope(&candidate);
candidate.s6_addr32[0] &= mask.s6_addr32[0];
candidate.s6_addr32[1] &= mask.s6_addr32[1];
candidate.s6_addr32[2] &= mask.s6_addr32[2];
candidate.s6_addr32[3] &= mask.s6_addr32[3];
if (IN6_ARE_ADDR_EQUAL(&candidate, &match))
break;
}
if (!ifa)
return EADDRNOTAVAIL;
ia = ifa2ia6(ifa);
if (cmd == SIOCGLIFADDR) {
int error;
/* fill in the if_laddrreq structure */
bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->addr);
if (error != 0)
return (error);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
ia->ia_dstaddr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->dstaddr);
if (error != 0)
return (error);
} else
bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
iflr->prefixlen =
in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
iflr->flags = ia->ia6_flags; /* XXX */
return 0;
} else {
struct in6_aliasreq ifra;
/* fill in6_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
bzero(&ifra, sizeof(ifra));
bcopy(iflr->iflr_name, ifra.ifra_name,
sizeof(ifra.ifra_name));
bcopy(&ia->ia_addr, &ifra.ifra_addr,
ia->ia_addr.sin6_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
ia->ia_dstaddr.sin6_len);
} else {
bzero(&ifra.ifra_dstaddr,
sizeof(ifra.ifra_dstaddr));
}
bcopy(&ia->ia_prefixmask, &ifra.ifra_prefixmask,
ia->ia_prefixmask.sin6_len);
ifra.ifra_flags = ia->ia6_flags;
return in6_control(so, SIOCDIFADDR_IN6, (caddr_t)&ifra,
ifp, td);
}
}
}
return EOPNOTSUPP; /* just for safety */
}
/*
* Initialize an interface's internet6 address
* and routing table entry.
*/
static int
in6_ifinit(struct ifnet *ifp, struct in6_ifaddr *ia,
struct sockaddr_in6 *sin6, int newhost)
{
int error = 0, plen, ifacount = 0;
int s = splimp();
struct ifaddr *ifa;
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifacount++;
}
ia->ia_addr = *sin6;
if (ifacount <= 1 && ifp->if_ioctl) {
IFF_LOCKGIANT(ifp);
error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
IFF_UNLOCKGIANT(ifp);
if (error) {
splx(s);
return (error);
}
}
splx(s);
ia->ia_ifa.ifa_metric = ifp->if_metric;
/* we could do in(6)_socktrim here, but just omit it at this moment. */
if (newhost) {
/*
* Set the rtrequest function to create llinfo. It also
* adjusts the outgoing interface of the route for the local
* address when called via in6_ifaddloop() below.
*/
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
}
/*
* Special case:
* If a new destination address is specified for a point-to-point
* interface, install a route to the destination as an interface
* direct route. In addition, if the link is expected to have neighbor
* cache entries, specify RTF_LLINFO so that a cache entry for the
* destination address will be created.
* XXX: the logic below rejects assigning multiple addresses on a p2p
* interface that share the same destination.
*/
plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */
if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 &&
ia->ia_dstaddr.sin6_family == AF_INET6) {
int rtflags = RTF_UP | RTF_HOST;
struct rtentry *rt = NULL, **rtp = NULL;
if (nd6_need_cache(ifp) != 0) {
rtflags |= RTF_LLINFO;
rtp = &rt;
}
error = rtrequest(RTM_ADD,
(struct sockaddr *)&ia->ia_dstaddr,
(struct sockaddr *)&ia->ia_addr,
(struct sockaddr *)&ia->ia_prefixmask,
ia->ia_flags | rtflags, rtp);
if (error != 0)
return (error);
if (rt != NULL) {
struct llinfo_nd6 *ln;
RT_LOCK(rt);
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
if (ln != NULL) {
/*
* Set the state to STALE because we don't
* have to perform address resolution on this
* link.
*/
ln->ln_state = ND6_LLINFO_STALE;
}
RT_REMREF(rt);
RT_UNLOCK(rt);
}
ia->ia_flags |= IFA_ROUTE;
}
if (plen < 128) {
/*
* The RTF_CLONING flag is necessary for in6_is_ifloop_auto().
*/
ia->ia_ifa.ifa_flags |= RTF_CLONING;
}
/* Add ownaddr as loopback rtentry, if necessary (ex. on p2p link). */
if (newhost)
in6_ifaddloop(&(ia->ia_ifa));
return (error);
}
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *addr,
int *errorp, int delay)
{
struct in6_multi_mship *imm;
imm = malloc(sizeof(*imm), M_IP6MADDR, M_NOWAIT);
if (!imm) {
*errorp = ENOBUFS;
return NULL;
}
imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp, delay);
if (!imm->i6mm_maddr) {
/* *errorp is already set */
free(imm, M_IP6MADDR);
return NULL;
}
return imm;
}
int
in6_leavegroup(struct in6_multi_mship *imm)
{
if (imm->i6mm_maddr)
in6_delmulti(imm->i6mm_maddr);
free(imm, M_IP6MADDR);
return 0;
}
/*
* Find an IPv6 interface link-local address specific to an interface.
*/
struct in6_ifaddr *
in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags)
{
struct ifaddr *ifa;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) {
if ((((struct in6_ifaddr *)ifa)->ia6_flags &
ignoreflags) != 0)
continue;
break;
}
}
return ((struct in6_ifaddr *)ifa);
}
/*
* find the internet address corresponding to a given interface and address.
*/
struct in6_ifaddr *
in6ifa_ifpwithaddr(struct ifnet *ifp, struct in6_addr *addr)
{
struct ifaddr *ifa;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa)))
break;
}
return ((struct in6_ifaddr *)ifa);
}
/*
* Convert IP6 address to printable (loggable) representation. Caller
* has to make sure that ip6buf is at least INET6_ADDRSTRLEN long.
*/
static char digits[] = "0123456789abcdef";
char *
ip6_sprintf(char *ip6buf, const struct in6_addr *addr)
{
int i;
char *cp;
const u_int16_t *a = (const u_int16_t *)addr;
const u_int8_t *d;
int dcolon = 0, zero = 0;
cp = ip6buf;
for (i = 0; i < 8; i++) {
if (dcolon == 1) {
if (*a == 0) {
if (i == 7)
*cp++ = ':';
a++;
continue;
} else
dcolon = 2;
}
if (*a == 0) {
if (dcolon == 0 && *(a + 1) == 0) {
if (i == 0)
*cp++ = ':';
*cp++ = ':';
dcolon = 1;
} else {
*cp++ = '0';
*cp++ = ':';
}
a++;
continue;
}
d = (const u_char *)a;
/* Try to eliminate leading zeros in printout like in :0001. */
zero = 1;
*cp = digits[*d >> 4];
if (*cp != '0') {
zero = 0;
cp++;
}
*cp = digits[*d++ & 0xf];
if (zero == 0 || (*cp != '0')) {
zero = 0;
cp++;
}
*cp = digits[*d >> 4];
if (zero == 0 || (*cp != '0')) {
zero = 0;
cp++;
}
*cp++ = digits[*d & 0xf];
*cp++ = ':';
a++;
}
*--cp = '\0';
return (ip6buf);
}
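/*
* Example outputs: 2001:0db8:0000:0000:0000:0000:0000:0001 prints as
* "2001:db8::1"; the first run of two or more zero groups collapses to
* "::" and leading zeros within a group (e.g. :0001) are suppressed.
*/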
int
in6_localaddr(struct in6_addr *in6)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia;
if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
return 1;
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
&ia->ia_prefixmask.sin6_addr)) {
return 1;
}
}
return (0);
}
int
in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia;
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&sa6->sin6_addr) &&
(ia->ia6_flags & IN6_IFF_DEPRECATED) != 0)
return (1); /* true */
/* XXX: do we still have to go through the rest of the list? */
}
return (0); /* false */
}
/*
* Return the length (in bits) of the longest common prefix of
* src and dst. The 128-bit address length is hard-coded...
*/
int
in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
{
int match = 0;
u_char *s = (u_char *)src, *d = (u_char *)dst;
u_char *lim = s + 16, r;
while (s < lim)
if ((r = (*d++ ^ *s++)) != 0) {
while (r < 128) {
match++;
r <<= 1;
}
break;
} else
match += 8;
return match;
}
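/*
* A worked example: for src = 2001:db8::1 and dst = 2001:db8::2 the
* first 15 bytes match and the final bytes differ as 0x01 ^ 0x02 =
* 0x03, which has six leading zero bits, so the function returns
* 15 * 8 + 6 = 126.
*/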
/* XXX: to be scope conscious */
int
in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len)
{
int bytelen, bitlen;
/* sanity check */
if (0 > len || len > 128) {
log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n",
len);
return (0);
}
bytelen = len / 8;
bitlen = len % 8;
if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen))
return (0);
if (bitlen != 0 &&
p1->s6_addr[bytelen] >> (8 - bitlen) !=
p2->s6_addr[bytelen] >> (8 - bitlen))
return (0);
return (1);
}
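/*
* A worked example: len = 49 gives bytelen = 6 and bitlen = 1, so the
* first six bytes are compared with bcmp() and the seventh byte only
* in its most significant bit (both bytes shifted right by 7).
*/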
void
in6_prefixlen2mask(struct in6_addr *maskp, int len)
{
u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
int bytelen, bitlen, i;
/* sanity check */
if (0 > len || len > 128) {
log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
len);
return;
}
bzero(maskp, sizeof(*maskp));
bytelen = len / 8;
bitlen = len % 8;
for (i = 0; i < bytelen; i++)
maskp->s6_addr[i] = 0xff;
if (bitlen)
maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
}
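/*
* A worked example: len = 52 gives bytelen = 6 and bitlen = 4; the
* first six bytes become 0xff and the seventh becomes maskarray[3] =
* 0xf0, yielding the mask ffff:ffff:ffff:f000::.
*/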
/*
* Return the best address within the same scope as dst. If no such
* address is found, return the first valid address on the designated
* interface.
*/
struct in6_ifaddr *
in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
{
+ INIT_VNET_INET6(curvnet);
int dst_scope = in6_addrscope(dst), blen = -1, tlen;
struct ifaddr *ifa;
struct in6_ifaddr *besta = 0;
struct in6_ifaddr *dep[2]; /* last-resort: deprecated */
dep[0] = dep[1] = NULL;
/*
* We first look for addresses in the same scope.
* If there is one, return it.
* If there are two or more, return the one that matches dst longest.
* If none, return one of the global addresses assigned to the other
* interfaces.
*/
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
continue;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
if (V_ip6_use_deprecated)
dep[0] = (struct in6_ifaddr *)ifa;
continue;
}
if (dst_scope == in6_addrscope(IFA_IN6(ifa))) {
/*
* call in6_matchlen() as few as possible
*/
if (besta) {
if (blen == -1)
blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst);
tlen = in6_matchlen(IFA_IN6(ifa), dst);
if (tlen > blen) {
blen = tlen;
besta = (struct in6_ifaddr *)ifa;
}
} else
besta = (struct in6_ifaddr *)ifa;
}
}
if (besta)
return (besta);
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
continue;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
if (V_ip6_use_deprecated)
dep[1] = (struct in6_ifaddr *)ifa;
continue;
}
return (struct in6_ifaddr *)ifa;
}
/* use the last-resort values, that is, deprecated addresses */
if (dep[0])
return dep[0];
if (dep[1])
return dep[1];
return NULL;
}
/*
* perform DAD when interface becomes IFF_UP.
*/
void
in6_if_up(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_TENTATIVE) {
/*
* The TENTATIVE flag was likely set by hand
* beforehand, implicitly indicating the need for DAD.
* We may be able to skip the random delay in this
* case, but we impose a delay just in case.
*/
nd6_dad_start(ifa,
arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz));
}
}
/*
* special cases, like 6to4, are handled in in6_ifattach
*/
in6_ifattach(ifp, NULL);
}
int
in6if_do_dad(struct ifnet *ifp)
{
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
return (0);
switch (ifp->if_type) {
#ifdef IFT_DUMMY
case IFT_DUMMY:
#endif
case IFT_FAITH:
/*
* These interfaces do not have the IFF_LOOPBACK flag,
* but loop packets back. We do not have to do DAD on such
* interfaces. We should even omit it, because a looped-back
* NS would confuse the DAD procedure.
*/
return (0);
default:
/*
* Our DAD routine requires the interface to be up and running.
* However, some interfaces can be up before reaching the RUNNING
* status. Additionally, users may try to assign addresses
* before the interface becomes up (or running).
* We simply skip DAD in such cases as a workaround.
* XXX: we should rather mark such addresses "tentative"
* and do DAD after the interface becomes ready.
*/
if (!((ifp->if_flags & IFF_UP) &&
(ifp->if_drv_flags & IFF_DRV_RUNNING)))
return (0);
return (1);
}
}
/*
* Calculate the maximum IPv6 MTU across all interfaces and store it
* in in6_maxmtu.
*/
void
in6_setmaxmtu(void)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
unsigned long maxmtu = 0;
struct ifnet *ifp;
IFNET_RLOCK();
for (ifp = TAILQ_FIRST(&V_ifnet); ifp;
ifp = TAILQ_NEXT(ifp, if_list)) {
/* this function can be called during ifnet initialization */
if (!ifp->if_afdata[AF_INET6])
continue;
if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
IN6_LINKMTU(ifp) > maxmtu)
maxmtu = IN6_LINKMTU(ifp);
}
IFNET_RUNLOCK();
if (maxmtu) /* update only when maxmtu is positive */
V_in6_maxmtu = maxmtu;
}
/*
* Provide the length of interface identifiers to be used for the link
* attached to the given interface. The length should be defined in the
* corresponding "IPv6 over xxx-link" document. Note that the address
* architecture might also define the length for a particular set of
* address prefixes, regardless of the link type. As clarified in
* rfc2462bis, those two definitions should be consistent, and they
* really are as of August 2004.
*/
int
in6_if2idlen(struct ifnet *ifp)
{
switch (ifp->if_type) {
case IFT_ETHER: /* RFC2464 */
#ifdef IFT_PROPVIRTUAL
case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */
#endif
#ifdef IFT_L2VLAN
case IFT_L2VLAN: /* ditto */
#endif
#ifdef IFT_IEEE80211
case IFT_IEEE80211: /* ditto */
#endif
#ifdef IFT_MIP
case IFT_MIP: /* ditto */
#endif
return (64);
case IFT_FDDI: /* RFC2467 */
return (64);
case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */
return (64);
case IFT_PPP: /* RFC2472 */
return (64);
case IFT_ARCNET: /* RFC2497 */
return (64);
case IFT_FRELAY: /* RFC2590 */
return (64);
case IFT_IEEE1394: /* RFC3146 */
return (64);
case IFT_GIF:
return (64); /* draft-ietf-v6ops-mech-v2-07 */
case IFT_LOOP:
return (64); /* XXX: is this really correct? */
default:
/*
* Unknown link type:
* It might be controversial to use today's common constant
* of 64 for these cases unconditionally. For full compliance,
* we should return an error in this case. On the other hand,
* if we simply lack a standard for the link type, or a new
* standard is defined for a new link type, the IFID length
* is very likely to be the common constant. As a compromise,
* we always use the constant, but emit an explicit notice
* indicating the "unknown" case.
*/
printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type);
return (64);
}
}
void *
in6_domifattach(struct ifnet *ifp)
{
struct in6_ifextra *ext;
ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK);
bzero(ext, sizeof(*ext));
ext->in6_ifstat = (struct in6_ifstat *)malloc(sizeof(struct in6_ifstat),
M_IFADDR, M_WAITOK);
bzero(ext->in6_ifstat, sizeof(*ext->in6_ifstat));
ext->icmp6_ifstat =
(struct icmp6_ifstat *)malloc(sizeof(struct icmp6_ifstat),
M_IFADDR, M_WAITOK);
bzero(ext->icmp6_ifstat, sizeof(*ext->icmp6_ifstat));
ext->nd_ifinfo = nd6_ifattach(ifp);
ext->scope6_id = scope6_ifattach(ifp);
return ext;
}
void
in6_domifdetach(struct ifnet *ifp, void *aux)
{
struct in6_ifextra *ext = (struct in6_ifextra *)aux;
scope6_ifdetach(ext->scope6_id);
nd6_ifdetach(ext->nd_ifinfo);
free(ext->in6_ifstat, M_IFADDR);
free(ext->icmp6_ifstat, M_IFADDR);
free(ext, M_IFADDR);
}
/*
* Convert sockaddr_in6 to sockaddr_in. The original sockaddr_in6 must
* be a v4-mapped or v4-compatible address.
*/
void
in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
bzero(sin, sizeof(*sin));
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_family = AF_INET;
sin->sin_port = sin6->sin6_port;
sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
}
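/*
* A worked example: the v4-mapped sockaddr_in6 ::ffff:192.0.2.1 with
* port 8080 converts to the sockaddr_in 192.0.2.1, port 8080; only
* s6_addr32[3] (the embedded IPv4 address) and the port carry over.
*/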
/* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
bzero(sin6, sizeof(*sin6));
sin6->sin6_len = sizeof(struct sockaddr_in6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = sin->sin_port;
sin6->sin6_addr.s6_addr32[0] = 0;
sin6->sin6_addr.s6_addr32[1] = 0;
sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr;
}
/* Convert sockaddr_in6 into sockaddr_in. */
void
in6_sin6_2_sin_in_sock(struct sockaddr *nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 sin6;
/*
* Save original sockaddr_in6 addr and convert it
* to sockaddr_in.
*/
sin6 = *(struct sockaddr_in6 *)nam;
sin_p = (struct sockaddr_in *)nam;
in6_sin6_2_sin(sin_p, &sin6);
}
/* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 *sin6_p;
MALLOC(sin6_p, struct sockaddr_in6 *, sizeof *sin6_p, M_SONAME,
M_WAITOK);
sin_p = (struct sockaddr_in *)*nam;
in6_sin_2_v4mapsin6(sin_p, sin6_p);
FREE(*nam, M_SONAME);
*nam = (struct sockaddr *)sin6_p;
}
Index: head/sys/netinet6/in6_gif.c
===================================================================
--- head/sys/netinet6/in6_gif.c (revision 183549)
+++ head/sys/netinet6/in6_gif.c (revision 183550)
@@ -1,425 +1,427 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/malloc.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET
#include <netinet/ip.h>
#endif
#include <netinet/ip_encap.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_gif.h>
#include <netinet6/in6_var.h>
#endif
#include <netinet6/ip6protosw.h>
#include <netinet/ip_ecn.h>
#ifdef INET6
#include <netinet6/ip6_ecn.h>
#endif
#include <net/if_gif.h>
static int gif_validate6(const struct ip6_hdr *, struct gif_softc *,
struct ifnet *);
extern struct domain inet6domain;
struct ip6protosw in6_gif_protosw =
{ SOCK_RAW, &inet6domain, 0/* IPPROTO_IPV[46] */, PR_ATOMIC|PR_ADDR,
in6_gif_input, rip6_output, 0, rip6_ctloutput,
0,
0, 0, 0, 0,
&rip6_usrreqs
};
int
in6_gif_output(struct ifnet *ifp,
int family, /* family of the packet to be encapsulated */
struct mbuf *m)
{
+ INIT_VNET_GIF(ifp->if_vnet);
struct gif_softc *sc = ifp->if_softc;
struct sockaddr_in6 *dst = (struct sockaddr_in6 *)&sc->gif_ro6.ro_dst;
struct sockaddr_in6 *sin6_src = (struct sockaddr_in6 *)sc->gif_psrc;
struct sockaddr_in6 *sin6_dst = (struct sockaddr_in6 *)sc->gif_pdst;
struct ip6_hdr *ip6;
struct etherip_header eiphdr;
int proto, error;
u_int8_t itos, otos;
GIF_LOCK_ASSERT(sc);
if (sin6_src == NULL || sin6_dst == NULL ||
sin6_src->sin6_family != AF_INET6 ||
sin6_dst->sin6_family != AF_INET6) {
m_freem(m);
return EAFNOSUPPORT;
}
switch (family) {
#ifdef INET
case AF_INET:
{
struct ip *ip;
proto = IPPROTO_IPV4;
if (m->m_len < sizeof(*ip)) {
m = m_pullup(m, sizeof(*ip));
if (!m)
return ENOBUFS;
}
ip = mtod(m, struct ip *);
itos = ip->ip_tos;
break;
}
#endif
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
proto = IPPROTO_IPV6;
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (!m)
return ENOBUFS;
}
ip6 = mtod(m, struct ip6_hdr *);
itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
break;
}
#endif
case AF_LINK:
proto = IPPROTO_ETHERIP;
eiphdr.eip_ver = ETHERIP_VERSION & ETHERIP_VER_VERS_MASK;
eiphdr.eip_pad = 0;
/* prepend Ethernet-in-IP header */
M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT);
if (m && m->m_len < sizeof(struct etherip_header))
m = m_pullup(m, sizeof(struct etherip_header));
if (m == NULL)
return ENOBUFS;
bcopy(&eiphdr, mtod(m, struct etherip_header *),
sizeof(struct etherip_header));
break;
default:
#ifdef DEBUG
printf("in6_gif_output: warning: unknown family %d passed\n",
family);
#endif
m_freem(m);
return EAFNOSUPPORT;
}
/* prepend new IP header */
M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT);
if (m && m->m_len < sizeof(struct ip6_hdr))
m = m_pullup(m, sizeof(struct ip6_hdr));
if (m == NULL) {
printf("ENOBUFS in in6_gif_output %d\n", __LINE__);
return ENOBUFS;
}
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons((u_short)m->m_pkthdr.len);
ip6->ip6_nxt = proto;
ip6->ip6_hlim = V_ip6_gif_hlim;
ip6->ip6_src = sin6_src->sin6_addr;
/* bidirectional configured tunnel mode */
if (!IN6_IS_ADDR_UNSPECIFIED(&sin6_dst->sin6_addr))
ip6->ip6_dst = sin6_dst->sin6_addr;
else {
m_freem(m);
return ENETUNREACH;
}
ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE,
&otos, &itos);
ip6->ip6_flow &= ~htonl(0xff << 20);
ip6->ip6_flow |= htonl((u_int32_t)otos << 20);
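/*
* The traffic class occupies bits 20-27 of the version/class/flow
* word, hence the htonl(0xff << 20) masking above when merging the
* computed outer TOS back into ip6_flow.
*/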
if (dst->sin6_family != sin6_dst->sin6_family ||
!IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &sin6_dst->sin6_addr)) {
/* cached route doesn't match */
bzero(dst, sizeof(*dst));
dst->sin6_family = sin6_dst->sin6_family;
dst->sin6_len = sizeof(struct sockaddr_in6);
dst->sin6_addr = sin6_dst->sin6_addr;
if (sc->gif_ro6.ro_rt) {
RTFREE(sc->gif_ro6.ro_rt);
sc->gif_ro6.ro_rt = NULL;
}
#if 0
GIF2IFP(sc)->if_mtu = GIF_MTU;
#endif
}
if (sc->gif_ro6.ro_rt == NULL) {
rtalloc((struct route *)&sc->gif_ro6);
if (sc->gif_ro6.ro_rt == NULL) {
m_freem(m);
return ENETUNREACH;
}
/* if it constitutes infinite encapsulation, punt. */
if (sc->gif_ro6.ro_rt->rt_ifp == ifp) {
m_freem(m);
return ENETUNREACH; /*XXX*/
}
#if 0
ifp->if_mtu = sc->gif_ro6.ro_rt->rt_ifp->if_mtu
- sizeof(struct ip6_hdr);
#endif
}
#ifdef IPV6_MINMTU
/*
* Force fragmentation to the minimum MTU to avoid path MTU discovery;
* it is too painful to ask for a resend of the inner packet to achieve
* path MTU discovery for encapsulated packets.
*/
error = ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, NULL);
#else
error = ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, NULL);
#endif
if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) &&
sc->gif_ro6.ro_rt != NULL) {
RTFREE(sc->gif_ro6.ro_rt);
sc->gif_ro6.ro_rt = NULL;
}
return (error);
}
int
in6_gif_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
struct ifnet *gifp = NULL;
struct gif_softc *sc;
struct ip6_hdr *ip6;
int af = 0;
u_int32_t otos;
ip6 = mtod(m, struct ip6_hdr *);
sc = (struct gif_softc *)encap_getarg(m);
if (sc == NULL) {
m_freem(m);
V_ip6stat.ip6s_nogif++;
return IPPROTO_DONE;
}
gifp = GIF2IFP(sc);
if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) {
m_freem(m);
V_ip6stat.ip6s_nogif++;
return IPPROTO_DONE;
}
otos = ip6->ip6_flow;
m_adj(m, *offp);
switch (proto) {
#ifdef INET
case IPPROTO_IPV4:
{
struct ip *ip;
u_int8_t otos8;
af = AF_INET;
otos8 = (ntohl(otos) >> 20) & 0xff;
if (m->m_len < sizeof(*ip)) {
m = m_pullup(m, sizeof(*ip));
if (!m)
return IPPROTO_DONE;
}
ip = mtod(m, struct ip *);
if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ?
ECN_ALLOWED : ECN_NOCARE,
&otos8, &ip->ip_tos) == 0) {
m_freem(m);
return IPPROTO_DONE;
}
break;
}
#endif /* INET */
#ifdef INET6
case IPPROTO_IPV6:
{
struct ip6_hdr *ip6;
af = AF_INET6;
if (m->m_len < sizeof(*ip6)) {
m = m_pullup(m, sizeof(*ip6));
if (!m)
return IPPROTO_DONE;
}
ip6 = mtod(m, struct ip6_hdr *);
if (ip6_ecn_egress((gifp->if_flags & IFF_LINK1) ?
ECN_ALLOWED : ECN_NOCARE,
&otos, &ip6->ip6_flow) == 0) {
m_freem(m);
return IPPROTO_DONE;
}
break;
}
#endif
case IPPROTO_ETHERIP:
af = AF_LINK;
break;
default:
V_ip6stat.ip6s_nogif++;
m_freem(m);
return IPPROTO_DONE;
}
gif_input(m, af, gifp);
return IPPROTO_DONE;
}
/*
* validate outer address.
*/
static int
gif_validate6(const struct ip6_hdr *ip6, struct gif_softc *sc,
struct ifnet *ifp)
{
struct sockaddr_in6 *src, *dst;
src = (struct sockaddr_in6 *)sc->gif_psrc;
dst = (struct sockaddr_in6 *)sc->gif_pdst;
/*
* Check for address match. Note that the check is for an incoming
* packet. We should compare the *source* address in our configuration
* and the *destination* address of the packet, and vice versa.
*/
if (!IN6_ARE_ADDR_EQUAL(&src->sin6_addr, &ip6->ip6_dst) ||
!IN6_ARE_ADDR_EQUAL(&dst->sin6_addr, &ip6->ip6_src))
return 0;
/* martian filters on outer source - done in ip6_input */
/* ingress filters on outer source */
if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) {
struct sockaddr_in6 sin6;
struct rtentry *rt;
bzero(&sin6, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = ip6->ip6_src;
sin6.sin6_scope_id = 0; /* XXX */
rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL);
if (!rt || rt->rt_ifp != ifp) {
#if 0
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_WARNING, "%s: packet from %s dropped "
"due to ingress filter\n", if_name(GIF2IFP(sc)),
ip6_sprintf(ip6buf, &sin6.sin6_addr));
#endif
if (rt)
rtfree(rt);
return 0;
}
rtfree(rt);
}
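/*
* 128 * 2 reflects a full match on both the 128-bit source and
* destination addresses; the encap framework treats the return value
* as a match length and prefers the checker reporting the largest.
*/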
return 128 * 2;
}
/*
* We know that the interface is IFF_UP, the outer address is available,
* and the outer family matches the physical address family; see
* gif_encapcheck(). The sanity check for arg should have been done in
* the caller.
*/
int
gif_encapcheck6(const struct mbuf *m, int off, int proto, void *arg)
{
struct ip6_hdr ip6;
struct gif_softc *sc;
struct ifnet *ifp;
/* sanity check done in caller */
sc = (struct gif_softc *)arg;
/* LINTED const cast */
m_copydata(m, 0, sizeof(ip6), (caddr_t)&ip6);
ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL;
return gif_validate6(&ip6, sc, ifp);
}
int
in6_gif_attach(struct gif_softc *sc)
{
sc->encap_cookie6 = encap_attach_func(AF_INET6, -1, gif_encapcheck,
(void *)&in6_gif_protosw, sc);
if (sc->encap_cookie6 == NULL)
return EEXIST;
return 0;
}
int
in6_gif_detach(struct gif_softc *sc)
{
int error;
error = encap_detach(sc->encap_cookie6);
if (error == 0)
sc->encap_cookie6 = NULL;
return error;
}
Index: head/sys/netinet6/in6_ifattach.c
===================================================================
--- head/sys/netinet6/in6_ifattach.c (revision 183549)
+++ head/sys/netinet6/in6_ifattach.c (revision 183550)
@@ -1,908 +1,920 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/md5.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/if_ether.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/scope6_var.h>
unsigned long in6_maxmtu = 0;
#ifdef IP6_AUTO_LINKLOCAL
int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL;
#else
int ip6_auto_linklocal = 1; /* enable by default */
#endif
struct callout in6_tmpaddrtimer_ch;
extern struct inpcbinfo udbinfo;
extern struct inpcbinfo ripcbinfo;
static int get_rand_ifid(struct ifnet *, struct in6_addr *);
static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *);
static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *);
static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *);
static int in6_ifattach_loopback(struct ifnet *);
static void in6_purgemaddrs(struct ifnet *);
#define EUI64_GBIT 0x01
#define EUI64_UBIT 0x02
#define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0)
#define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT)
#define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6))
#define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT)
#define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6))
#define IFID_LOCAL(in6) (!EUI64_LOCAL(in6))
#define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6))
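/*
* The two low-order bits of the first identifier octet are the EUI-64
* group/individual (0x01) and universal/local (0x02) flags.  IPv6
* interface identifiers (RFC 2464) use the opposite sense for the u
* bit, which is why EUI64_TO_IFID() flips it and the IFID_* macros
* invert their EUI64_* counterparts.
*/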
/*
* Generate a last-resort interface identifier, when the machine has no
* IEEE802/EUI64 address sources.
* The goal here is to get an interface identifier that is
* (1) random enough and (2) does not change across reboot.
* We currently use MD5(hostname) for it.
*
* in6 - upper 64bits are preserved
*/
static int
get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6)
{
+ INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */
MD5_CTX ctxt;
u_int8_t digest[16];
int hostnamelen;
mtx_lock(&hostname_mtx);
hostnamelen = strlen(V_hostname);
#if 0
/* we need at least several letters as seed for ifid */
if (hostnamelen < 3)
return -1;
#endif
/* generate 8 bytes of pseudo-random value. */
bzero(&ctxt, sizeof(ctxt));
MD5Init(&ctxt);
MD5Update(&ctxt, V_hostname, hostnamelen);
mtx_unlock(&hostname_mtx);
MD5Final(digest, &ctxt);
/* assumes sizeof(digest) > sizeof(ifid) */
bcopy(digest, &in6->s6_addr[8], 8);
/* make sure to set "u" bit to local, and "g" bit to individual. */
in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */
in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */
/* convert EUI64 into IPv6 interface identifier */
EUI64_TO_IFID(in6);
return 0;
}
static int
generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret)
{
+ INIT_VNET_INET6(curvnet);
MD5_CTX ctxt;
u_int8_t seed[16], digest[16], nullbuf[8];
u_int32_t val32;
/* If there's no history, start with a random seed. */
bzero(nullbuf, sizeof(nullbuf));
if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) {
int i;
for (i = 0; i < 2; i++) {
val32 = arc4random();
bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32));
}
} else
bcopy(seed0, seed, 8);
/* copy the right-most 64-bits of the given address */
/* XXX assumption on the size of IFID */
bcopy(seed1, &seed[8], 8);
if (0) { /* for debugging purposes only */
int i;
printf("generate_tmp_ifid: new randomized ID from: ");
for (i = 0; i < 16; i++)
printf("%02x", seed[i]);
printf(" ");
}
/* generate 16 bytes of pseudo-random value. */
bzero(&ctxt, sizeof(ctxt));
MD5Init(&ctxt);
MD5Update(&ctxt, seed, sizeof(seed));
MD5Final(digest, &ctxt);
/*
* RFC 3041 3.2.1. (3)
* Take the left-most 64-bits of the MD5 digest and set bit 6 (the
* left-most bit is numbered 0) to zero.
*/
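/*
* Bit 6 is the universal/local bit of the identifier (EUI64_UBIT,
* mask 0x02 of the first octet); clearing it marks the identifier as
* locally generated rather than derived from a globally unique EUI-64.
*/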
bcopy(digest, ret, 8);
ret[0] &= ~EUI64_UBIT;
/*
* XXX: we'd like to ensure that the generated value is not zero
* for simplicity.  If the calculated digest happens to be zero,
* use a random non-zero value as the last resort.
*/
if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) {
nd6log((LOG_INFO,
"generate_tmp_ifid: computed MD5 value is zero.\n"));
val32 = arc4random();
val32 = 1 + (val32 % (0xffffffff - 1));
/*
* XXX: the computed value was never stored; copy the non-zero
* replacement into the result as the comment above intends.
*/
bcopy(&val32, ret, sizeof(val32));
}
/*
* RFC 3041 3.2.1. (4)
* Take the rightmost 64-bits of the MD5 digest and save them in
* stable storage as the history value to be used in the next
* iteration of the algorithm.
*/
bcopy(&digest[8], seed0, 8);
if (0) { /* for debugging purposes only */
int i;
printf("to: ");
for (i = 0; i < 16; i++)
printf("%02x", digest[i]);
printf("\n");
}
return 0;
}
/*
* Get interface identifier for the specified interface.
* XXX assumes a single sockaddr_dl (AF_LINK address) per interface
*
* in6 - upper 64bits are preserved
*/
int
in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
u_int8_t *addr;
size_t addrlen;
static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static u_int8_t allone[8] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
for (ifa = ifp->if_addrlist.tqh_first;
ifa;
ifa = ifa->ifa_list.tqe_next) {
if (ifa->ifa_addr->sa_family != AF_LINK)
continue;
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if (sdl == NULL)
continue;
if (sdl->sdl_alen == 0)
continue;
goto found;
}
return -1;
found:
addr = LLADDR(sdl);
addrlen = sdl->sdl_alen;
/* get EUI64 */
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_ISO88025:
case IFT_ATM:
case IFT_IEEE1394:
#ifdef IFT_IEEE80211
case IFT_IEEE80211:
#endif
/* IEEE802/EUI64 cases - what others? */
/* IEEE1394 uses a 16-byte address that starts with an EUI64 */
if (addrlen > 8)
addrlen = 8;
/* look at IEEE802/EUI64 only */
if (addrlen != 8 && addrlen != 6)
return -1;
/*
* check for invalid MAC address - on bsdi, we see it a lot
* since wildboar configures all-zero MAC on pccard before
* card insertion.
*/
if (bcmp(addr, allzero, addrlen) == 0)
return -1;
if (bcmp(addr, allone, addrlen) == 0)
return -1;
/* make EUI64 address */
if (addrlen == 8)
bcopy(addr, &in6->s6_addr[8], 8);
else if (addrlen == 6) {
in6->s6_addr[8] = addr[0];
in6->s6_addr[9] = addr[1];
in6->s6_addr[10] = addr[2];
in6->s6_addr[11] = 0xff;
in6->s6_addr[12] = 0xfe;
in6->s6_addr[13] = addr[3];
in6->s6_addr[14] = addr[4];
in6->s6_addr[15] = addr[5];
}
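/*
* Example with a hypothetical MAC address: 00:11:22:33:44:55 expands
* to 00:11:22:ff:fe:33:44:55; EUI64_TO_IFID() below then flips the
* u bit, yielding the interface identifier 0211:22ff:fe33:4455.
*/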
break;
case IFT_ARCNET:
if (addrlen != 1)
return -1;
if (!addr[0])
return -1;
bzero(&in6->s6_addr[8], 8);
in6->s6_addr[15] = addr[0];
/*
* due to insufficient bitwidth, we mark it local.
*/
in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */
in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */
break;
case IFT_GIF:
#ifdef IFT_STF
case IFT_STF:
#endif
/*
* RFC2893 says: "SHOULD use IPv4 address as ifid source".
* however, an IPv4 address is not very suitable as a unique
* identifier source (it can be renumbered).
* we don't do this.
*/
return -1;
default:
return -1;
}
/* sanity check: g bit must not indicate "group" */
if (EUI64_GROUP(in6))
return -1;
/* convert EUI64 into IPv6 interface identifier */
EUI64_TO_IFID(in6);
/*
* sanity check: ifid must not be all zero, avoid conflict with
* subnet router anycast
*/
if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 &&
bcmp(&in6->s6_addr[9], allzero, 7) == 0) {
return -1;
}
return 0;
}
/*
* Get interface identifier for the specified interface. If it is not
* available on ifp0, borrow interface identifier from other information
* sources.
*
* altifp - secondary EUI64 source
*/
static int
get_ifid(struct ifnet *ifp0, struct ifnet *altifp,
struct in6_addr *in6)
{
+ INIT_VNET_NET(ifp0->if_vnet);
+ INIT_VNET_INET6(ifp0->if_vnet);
struct ifnet *ifp;
/* first, try to get it from the interface itself */
if (in6_get_hw_ifid(ifp0, in6) == 0) {
nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n",
if_name(ifp0)));
goto success;
}
/* try the secondary EUI64 source; this is basically for ATM PVC */
if (altifp && in6_get_hw_ifid(altifp, in6) == 0) {
nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n",
if_name(ifp0), if_name(altifp)));
goto success;
}
/* next, try to get it from some other hardware interface */
IFNET_RLOCK();
for (ifp = V_ifnet.tqh_first; ifp; ifp = ifp->if_list.tqe_next) {
if (ifp == ifp0)
continue;
if (in6_get_hw_ifid(ifp, in6) != 0)
continue;
/*
* to borrow an ifid from another interface, the ifid needs to be
* globally unique
*/
if (IFID_UNIVERSAL(in6)) {
nd6log((LOG_DEBUG,
"%s: borrow interface identifier from %s\n",
if_name(ifp0), if_name(ifp)));
IFNET_RUNLOCK();
goto success;
}
}
IFNET_RUNLOCK();
/* last resort: get from the random number source */
if (get_rand_ifid(ifp0, in6) == 0) { /* ifp is NULL after the loop */
nd6log((LOG_DEBUG,
"%s: interface identifier generated by random number\n",
if_name(ifp0)));
goto success;
}
printf("%s: failed to get interface identifier\n", if_name(ifp0));
return -1;
success:
nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10],
in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13],
in6->s6_addr[14], in6->s6_addr[15]));
return 0;
}
/*
* altifp - secondary EUI64 source
*/
static int
in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia;
struct in6_aliasreq ifra;
struct nd_prefixctl pr0;
int i, error;
/*
* configure link-local address.
*/
bzero(&ifra, sizeof(ifra));
/*
* in6_update_ifa() does not use ifra_name, but we set it anyway for
* safety.
*/
strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name));
ifra.ifra_addr.sin6_family = AF_INET6;
ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000);
ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0;
if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0;
ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1);
} else {
if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) {
nd6log((LOG_ERR,
"%s: no ifid available\n", if_name(ifp)));
return (-1);
}
}
if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL))
return (-1);
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_prefixmask.sin6_family = AF_INET6;
ifra.ifra_prefixmask.sin6_addr = in6mask64;
/* link-local addresses should NEVER expire. */
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
/*
* Now call in6_update_ifa() to do a bunch of procedures to configure
* a link-local address. We can set the 3rd argument to NULL, because
* we know there's no other link-local address on the interface
* and therefore we are adding one (instead of updating one).
*/
if ((error = in6_update_ifa(ifp, &ifra, NULL,
IN6_IFAUPDATE_DADDELAY)) != 0) {
/*
* XXX: When the interface does not support IPv6, this call
* would fail in the SIOCSIFADDR ioctl. I believe the
* notification is rather confusing in this case, so just
* suppress it. (jinmei@kame.net 20010130)
*/
if (error != EAFNOSUPPORT)
nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to "
"configure a link-local address on %s "
"(errno=%d)\n",
if_name(ifp), error));
return (-1);
}
ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */
#ifdef DIAGNOSTIC
if (!ia) {
panic("ia == NULL in in6_ifattach_linklocal");
/* NOTREACHED */
}
#endif
/*
* Mark the link-local prefix (fe80::%link/64) as on-link.
* Since we'd like to manage prefixes separately from addresses,
* we make an ND6 prefix structure for the link-local prefix,
* and add it to the prefix list as a never-expire prefix.
* XXX: this change might affect some existing code base...
*/
bzero(&pr0, sizeof(pr0));
pr0.ndpr_ifp = ifp;
/* this should be 64 at this moment. */
pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL);
pr0.ndpr_prefix = ifra.ifra_addr;
/* apply the mask for safety. (nd6_prelist_add will apply it again) */
for (i = 0; i < 4; i++) {
pr0.ndpr_prefix.sin6_addr.s6_addr32[i] &=
in6mask64.s6_addr32[i];
}
/*
* Initialize parameters. The link-local prefix must always be
* on-link, and its lifetimes never expire.
*/
pr0.ndpr_raf_onlink = 1;
pr0.ndpr_raf_auto = 1; /* probably meaningless */
pr0.ndpr_vltime = ND6_INFINITE_LIFETIME;
pr0.ndpr_pltime = ND6_INFINITE_LIFETIME;
/*
* Since there are no other link-local addresses, nd6_prefix_lookup()
* probably returns NULL.  However, we cannot always rely on that.
* For example, if we first remove the (only) existing link-local
* address and then reconfigure another one, the prefix is still
* valid, referring to the old link-local address.
*/
if (nd6_prefix_lookup(&pr0) == NULL) {
if ((error = nd6_prelist_add(&pr0, NULL, NULL)) != 0)
return (error);
}
return 0;
}
/*
* ifp - must be IFT_LOOP
*/
static int
in6_ifattach_loopback(struct ifnet *ifp)
{
+ INIT_VNET_INET6(curvnet);
struct in6_aliasreq ifra;
int error;
bzero(&ifra, sizeof(ifra));
/*
* in6_update_ifa() does not use ifra_name, but we set it anyway for
* safety.
*/
strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name));
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_prefixmask.sin6_family = AF_INET6;
ifra.ifra_prefixmask.sin6_addr = in6mask128;
/*
* Always initialize ia_dstaddr (= broadcast address) to loopback
* address. Follows IPv4 practice - see in_ifinit().
*/
ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_dstaddr.sin6_family = AF_INET6;
ifra.ifra_dstaddr.sin6_addr = in6addr_loopback;
ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_addr.sin6_family = AF_INET6;
ifra.ifra_addr.sin6_addr = in6addr_loopback;
/* the loopback address should NEVER expire. */
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
/* we don't need to perform DAD on loopback interfaces. */
ifra.ifra_flags |= IN6_IFF_NODAD;
/* skip registration to the prefix list. XXX should be temporary. */
ifra.ifra_flags |= IN6_IFF_NOPFX;
/*
* We are sure that this is a newly assigned address, so we can pass
* NULL as the 3rd arg.
*/
if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) {
nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure "
"the loopback address on %s (errno=%d)\n",
if_name(ifp), error));
return (-1);
}
return 0;
}
/*
* compute NI group address, based on the current hostname setting.
* see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later).
*
* when ifp == NULL, the caller is responsible for filling scopeid.
*/
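/*
* The digest input is the first DNS label of the name in wire format,
* a one-octet length followed by the lowercased label (e.g. "foo" is
* hashed as "\3foo"); the leftmost 32 bits of the MD5 digest become
* the low-order words of the ff02::2:xxxx:xxxx group address.
*/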
int
in6_nigroup(struct ifnet *ifp, const char *name, int namelen,
struct in6_addr *in6)
{
const char *p;
u_char *q;
MD5_CTX ctxt;
u_int8_t digest[16];
char l;
char n[64]; /* a single label must not exceed 63 chars */
if (!namelen || !name)
return -1;
p = name;
while (p && *p && *p != '.' && p - name < namelen)
p++;
if (p - name > sizeof(n) - 1)
return -1; /* label too long */
l = p - name;
strncpy(n, name, l);
n[(int)l] = '\0';
for (q = n; *q; q++) {
if ('A' <= *q && *q <= 'Z')
*q = *q - 'A' + 'a';
}
/* generate 8 bytes of pseudo-random value. */
bzero(&ctxt, sizeof(ctxt));
MD5Init(&ctxt);
MD5Update(&ctxt, &l, sizeof(l));
MD5Update(&ctxt, n, l);
MD5Final(digest, &ctxt);
bzero(in6, sizeof(*in6));
in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL;
in6->s6_addr8[11] = 2;
bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3]));
if (in6_setscope(in6, ifp, NULL))
return (-1); /* XXX: should not fail */
return 0;
}
/*
* XXX multiple loopback interfaces need more care.  for instance,
* a node-local address needs to be configured on only one of them.
* XXX multiple link-local address case
*
* altifp - secondary EUI64 source
*/
void
in6_ifattach(struct ifnet *ifp, struct ifnet *altifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct in6_ifaddr *ia;
struct in6_addr in6;
/* some of the interfaces are inherently not IPv6 capable */
switch (ifp->if_type) {
case IFT_PFLOG:
case IFT_PFSYNC:
case IFT_CARP:
return;
}
/*
* quirks based on interface type
*/
switch (ifp->if_type) {
#ifdef IFT_STF
case IFT_STF:
/*
* A 6to4 interface is a very special kind of beast.
* No multicast, no link-local.  RFC2529 specifies how to make
* link-locals for a 6to4 interface, but there is no use for one
* and it is rather harmful to have one.
*/
goto statinit;
#endif
default:
break;
}
/*
* usually, we require multicast capability on the interface
*/
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
nd6log((LOG_INFO, "in6_ifattach: "
"%s is not multicast capable, IPv6 not enabled\n",
if_name(ifp)));
return;
}
/*
* assign loopback address for loopback interface.
* XXX multiple loopback interface case.
*/
if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
in6 = in6addr_loopback;
if (in6ifa_ifpwithaddr(ifp, &in6) == NULL) {
if (in6_ifattach_loopback(ifp) != 0)
return;
}
}
/*
* assign a link-local address, if there's none.
*/
if (V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) {
ia = in6ifa_ifpforlinklocal(ifp, 0);
if (ia == NULL) {
if (in6_ifattach_linklocal(ifp, altifp) == 0) {
/* linklocal address assigned */
} else {
/* failed to assign linklocal address. bark? */
}
}
}
#ifdef IFT_STF /* XXX */
statinit:
#endif
/* update dynamically. */
if (V_in6_maxmtu < ifp->if_mtu)
V_in6_maxmtu = ifp->if_mtu;
}
/*
* NOTE: in6_ifdetach() does not support loopback interfaces at this moment.
* We don't need this function in bsdi, because interfaces are never removed
* from the ifnet list in bsdi.
*/
void
in6_ifdetach(struct ifnet *ifp)
{
+ INIT_VNET_NET(ifp->if_vnet);
+ INIT_VNET_INET(ifp->if_vnet);
+ INIT_VNET_INET6(ifp->if_vnet);
struct in6_ifaddr *ia, *oia;
struct ifaddr *ifa, *next;
struct rtentry *rt;
short rtflags;
struct sockaddr_in6 sin6;
struct in6_multi_mship *imm;
/* remove neighbor management table */
nd6_purge(ifp);
/* nuke any IPv6 addresses we have */
for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) {
next = ifa->ifa_list.tqe_next;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
in6_purgeaddr(ifa);
}
/* undo everything done by in6_ifattach(), just in case */
for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) {
next = ifa->ifa_list.tqe_next;
if (ifa->ifa_addr->sa_family != AF_INET6
|| !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)->sin6_addr)) {
continue;
}
ia = (struct in6_ifaddr *)ifa;
/*
* leave from multicast groups we have joined for the interface
*/
while ((imm = ia->ia6_memberships.lh_first) != NULL) {
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
/* remove from the routing table */
if ((ia->ia_flags & IFA_ROUTE) &&
(rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0UL))) {
rtflags = rt->rt_flags;
rtfree(rt);
rtrequest(RTM_DELETE, (struct sockaddr *)&ia->ia_addr,
(struct sockaddr *)&ia->ia_addr,
(struct sockaddr *)&ia->ia_prefixmask,
rtflags, (struct rtentry **)0);
}
/* remove from the linked list */
TAILQ_REMOVE(&ifp->if_addrlist, (struct ifaddr *)ia, ifa_list);
IFAFREE(&ia->ia_ifa);
/* also remove from the IPv6 address chain (itojun&jinmei) */
oia = ia;
if (oia == (ia = V_in6_ifaddr))
V_in6_ifaddr = ia->ia_next;
else {
while (ia->ia_next && (ia->ia_next != oia))
ia = ia->ia_next;
if (ia->ia_next)
ia->ia_next = oia->ia_next;
else {
nd6log((LOG_ERR,
"%s: didn't unlink in6ifaddr from list\n",
if_name(ifp)));
}
}
IFAFREE(&oia->ia_ifa);
}
in6_pcbpurgeif0(&V_udbinfo, ifp);
in6_pcbpurgeif0(&V_ripcbinfo, ifp);
/* leave from all multicast groups joined */
in6_purgemaddrs(ifp);
/*
* remove neighbor management table. we call it twice just to make
* sure we nuke everything. maybe we need just one call.
* XXX: since the first call did not release addresses, some prefixes
* might remain. We should call nd6_purge() again to release the
* prefixes after removing all addresses above.
* (Or can we just delay calling nd6_purge() until this point?)
*/
nd6_purge(ifp);
/* remove route to link-local allnodes multicast (ff02::1) */
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = in6addr_linklocal_allnodes;
if (in6_setscope(&sin6.sin6_addr, ifp, NULL))
/* XXX: should not fail */
return;
/* XXX grab lock first to avoid LOR */
if (V_rt_tables[0][AF_INET6] != NULL) {
RADIX_NODE_HEAD_LOCK(V_rt_tables[0][AF_INET6]);
rt = rtalloc1((struct sockaddr *)&sin6, 0, 0UL);
if (rt) {
if (rt->rt_ifp == ifp)
rtexpunge(rt);
RTFREE_LOCKED(rt);
}
RADIX_NODE_HEAD_UNLOCK(V_rt_tables[0][AF_INET6]);
}
}
int
in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf,
const u_int8_t *baseid, int generate)
{
u_int8_t nullbuf[8];
struct nd_ifinfo *ndi = ND_IFINFO(ifp);
bzero(nullbuf, sizeof(nullbuf));
if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) == 0) {
/* we've never created a random ID. Create a new one. */
generate = 1;
}
if (generate) {
bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1));
/* generate_tmp_ifid will update seedn and buf */
(void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1,
ndi->randomid);
}
bcopy(ndi->randomid, retbuf, 8);
return (0);
}
void
in6_tmpaddrtimer(void *ignored_arg)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct nd_ifinfo *ndi;
u_int8_t nullbuf[8];
struct ifnet *ifp;
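/*
* Re-arm the timer so that the next regeneration happens
* ip6_temp_regen_advance seconds (plus the per-host desync factor)
* before the preferred lifetime of the current temporary addresses
* expires, per the RFC 3041 regeneration schedule.
*/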
callout_reset(&V_in6_tmpaddrtimer_ch,
(V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, NULL);
bzero(nullbuf, sizeof(nullbuf));
for (ifp = TAILQ_FIRST(&V_ifnet); ifp;
ifp = TAILQ_NEXT(ifp, if_list)) {
ndi = ND_IFINFO(ifp);
if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) {
/*
* We've been generating a random ID on this interface.
* Create a new one.
*/
(void)generate_tmp_ifid(ndi->randomseed0,
ndi->randomseed1, ndi->randomid);
}
}
}
static void
in6_purgemaddrs(struct ifnet *ifp)
{
struct in6_multi *in6m;
struct in6_multi *oin6m;
#ifdef DIAGNOSTIC
printf("%s: purging ifp %p\n", __func__, ifp);
#endif
IFF_LOCKGIANT(ifp);
LIST_FOREACH_SAFE(in6m, &in6_multihead, in6m_entry, oin6m) {
if (in6m->in6m_ifp == ifp)
in6_delmulti(in6m);
}
IFF_UNLOCKGIANT(ifp);
}
Index: head/sys/netinet6/in6_pcb.c
===================================================================
--- head/sys/netinet6/in6_pcb.c (revision 183549)
+++ head/sys/netinet6/in6_pcb.c (revision 183550)
@@ -1,912 +1,915 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/vimage.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <security/mac/mac_framework.h>
struct in6_addr zeroin6_addr;
int
in6_pcbbind(register struct inpcb *inp, struct sockaddr *nam,
struct ucred *cred)
{
+ INIT_VNET_INET6(inp->inp_vnet);
+ INIT_VNET_INET(inp->inp_vnet);
struct socket *so = inp->inp_socket;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
u_short lport = 0;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
if (!V_in6_ifaddr) /* XXX broken! */
return (EADDRNOTAVAIL);
if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
return (EINVAL);
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = INPLOOKUP_WILDCARD;
if (nam) {
int error;
sin6 = (struct sockaddr_in6 *)nam;
if (nam->sa_len != sizeof(*sin6))
return (EINVAL);
/*
* family check.
*/
if (nam->sa_family != AF_INET6)
return (EAFNOSUPPORT);
if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
return(error);
lport = sin6->sin6_port;
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & SO_REUSEADDR)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ia = NULL;
sin6->sin6_port = 0; /* yech... */
if ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0)
return (EADDRNOTAVAIL);
/*
* XXX: binding to an anycast address might accidentally
* cause sending a packet with an anycast source address.
* We should allow binding to a deprecated address, since
* the application dares to use it.
*/
if (ia &&
((struct in6_ifaddr *)ia)->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) {
return (EADDRNOTAVAIL);
}
}
if (lport) {
struct inpcb *t;
/* GROSS */
if (ntohs(lport) <= V_ipport_reservedhigh &&
ntohs(lport) >= V_ipport_reservedlow &&
priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
0))
return (EACCES);
if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
priv_check_cred(so->so_cred,
PRIV_NETINET_REUSEPORT, 0) != 0) {
t = in6_pcblookup_local(pcbinfo,
&sin6->sin6_addr, lport,
INPLOOKUP_WILDCARD, cred);
if (t &&
((t->inp_vflag & INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
(!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
!IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
(t->inp_socket->so_options & SO_REUSEPORT)
== 0) && (so->so_cred->cr_uid !=
t->inp_socket->so_cred->cr_uid))
return (EADDRINUSE);
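/*
* A dual-stack socket bound to the unspecified address can
* also receive v4-mapped traffic, so the same ownership
* check must pass against the IPv4 port space as well.
*/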
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(pcbinfo,
sin.sin_addr, lport,
INPLOOKUP_WILDCARD, cred);
if (t &&
((t->inp_vflag &
INP_TIMEWAIT) == 0) &&
(so->so_type != SOCK_STREAM ||
ntohl(t->inp_faddr.s_addr) ==
INADDR_ANY) &&
(so->so_cred->cr_uid !=
t->inp_socket->so_cred->cr_uid))
return (EADDRINUSE);
}
}
t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr,
lport, wild, cred);
if (t && (reuseport & ((t->inp_vflag & INP_TIMEWAIT) ?
intotw(t)->tw_so_options :
t->inp_socket->so_options)) == 0)
return (EADDRINUSE);
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6);
t = in_pcblookup_local(pcbinfo, sin.sin_addr,
lport, wild, cred);
if (t && t->inp_vflag & INP_TIMEWAIT) {
if ((reuseport &
intotw(t)->tw_so_options) == 0 &&
(ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY || ((inp->inp_vflag &
INP_IPV6PROTO) ==
(t->inp_vflag & INP_IPV6PROTO))))
return (EADDRINUSE);
}
else if (t &&
(reuseport & t->inp_socket->so_options)
== 0 && (ntohl(t->inp_laddr.s_addr) !=
INADDR_ANY || INP_SOCKAF(so) ==
INP_SOCKAF(t->inp_socket)))
return (EADDRINUSE);
}
}
inp->in6p_laddr = sin6->sin6_addr;
}
if (lport == 0) {
int e;
if ((e = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0)
return (e);
}
else {
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->in6p_laddr = in6addr_any;
inp->inp_lport = 0;
return (EAGAIN);
}
}
return (0);
}
/*
* Transform old in6_pcbconnect() into an inner subroutine for new
* in6_pcbconnect(): Do some validity-checking on the remote address
* (in 'nam') and then determine the local host address (i.e., which
* interface) to use to access that remote host.
*
* This preserves the definition of in6_pcbconnect(), while supporting
* a slightly different version for T/TCP.  (This is more than a bit
* of a kludge, but cleaning up the internal interfaces would have
* forced minor changes in every protocol).
*/
int
in6_pcbladdr(register struct inpcb *inp, struct sockaddr *nam,
struct in6_addr **plocal_addr6)
{
+ INIT_VNET_INET6(inp->inp_vnet);
register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
int error = 0;
struct ifnet *ifp = NULL;
int scope_ambiguous = 0;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
if (nam->sa_len != sizeof (*sin6))
return (EINVAL);
if (sin6->sin6_family != AF_INET6)
return (EAFNOSUPPORT);
if (sin6->sin6_port == 0)
return (EADDRNOTAVAIL);
if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
return(error);
if (V_in6_ifaddr) {
/*
* If the destination address is the unspecified address,
* use the loopback address, i.e. ::1.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
sin6->sin6_addr = in6addr_loopback;
}
/*
* XXX: in6_selectsrc might replace the bound local address
* with the address specified by setsockopt(IPV6_PKTINFO).
* Is it the intended behavior?
*/
*plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts,
inp, NULL,
inp->inp_socket->so_cred,
&ifp, &error);
if (ifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) {
return(error);
}
if (*plocal_addr6 == 0) {
if (error == 0)
error = EADDRNOTAVAIL;
return (error);
}
/*
* Don't do the pcblookup call here; return the local address in
* plocal_addr6 and exit to the caller, which will do the lookup.
*/
return (0);
}
/*
* Outer subroutine:
* Connect from a socket to a specified address.
* Both address and port must be specified in argument nam.
* If we don't have a local address for this socket yet,
* then pick one.
*/
int
in6_pcbconnect(register struct inpcb *inp, struct sockaddr *nam,
struct ucred *cred)
{
struct in6_addr *addr6;
register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
int error;
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
/*
* Call inner routine, to assign local interface address.
* in6_pcbladdr() may automatically fill in sin6_scope_id.
*/
if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0)
return (error);
if (in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr,
sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
? addr6 : &inp->in6p_laddr,
inp->inp_lport, 0, NULL) != NULL) {
return (EADDRINUSE);
}
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, cred);
if (error)
return (error);
}
inp->in6p_laddr = *addr6;
}
inp->in6p_faddr = sin6->sin6_addr;
inp->inp_fport = sin6->sin6_port;
/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
inp->in6p_flowinfo |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
in_pcbrehash(inp);
return (0);
}
void
in6_pcbdisconnect(struct inpcb *inp)
{
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr));
inp->inp_fport = 0;
/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
in_pcbrehash(inp);
}
void
in6_pcbdetach(struct inpcb *inp)
{
KASSERT(inp->inp_socket != NULL, ("in6_pcbdetach: inp_socket == NULL"));
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
void
in6_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *ipi = inp->inp_pcbinfo;
KASSERT(inp->inp_socket == NULL, ("in6_pcbfree: inp_socket != NULL"));
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
INP_WLOCK_ASSERT(inp);
#ifdef IPSEC
if (inp->in6p_sp != NULL)
ipsec6_delete_pcbpolicy(inp);
#endif /* IPSEC */
inp->inp_gencnt = ++ipi->ipi_gencnt;
in_pcbremlists(inp);
ip6_freepcbopts(inp->in6p_outputopts);
ip6_freemoptions(inp->in6p_moptions);
/* Check and free IPv4 related resources in case of mapped addr */
if (inp->inp_options)
(void)m_free(inp->inp_options);
if (inp->inp_moptions != NULL)
inp_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
INP_WUNLOCK(inp);
uma_zfree(ipi->ipi_zone, inp);
}
struct sockaddr *
in6_sockaddr(in_port_t port, struct in6_addr *addr_p)
{
struct sockaddr_in6 *sin6;
MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_WAITOK);
bzero(sin6, sizeof *sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_port = port;
sin6->sin6_addr = *addr_p;
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
return (struct sockaddr *)sin6;
}
struct sockaddr *
in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p)
{
struct sockaddr_in sin;
struct sockaddr_in6 *sin6_p;
bzero(&sin, sizeof sin);
sin.sin_family = AF_INET;
sin.sin_len = sizeof(sin);
sin.sin_port = port;
sin.sin_addr = *addr_p;
MALLOC(sin6_p, struct sockaddr_in6 *, sizeof *sin6_p, M_SONAME,
M_WAITOK);
in6_sin_2_v4mapsin6(&sin, sin6_p);
return (struct sockaddr *)sin6_p;
}
int
in6_getsockaddr(struct socket *so, struct sockaddr **nam)
{
register struct inpcb *inp;
struct in6_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_lport;
addr = inp->in6p_laddr;
INP_RUNLOCK(inp);
*nam = in6_sockaddr(port, &addr);
return 0;
}
int
in6_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
struct in6_addr addr;
in_port_t port;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL"));
INP_RLOCK(inp);
port = inp->inp_fport;
addr = inp->in6p_faddr;
INP_RUNLOCK(inp);
*nam = in6_sockaddr(port, &addr);
return 0;
}
int
in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL"));
if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
error = in_getsockaddr(so, nam);
if (error == 0)
in6_sin_2_v4mapsin6_in_sock(nam);
} else {
/* scope issues will be handled in in6_getsockaddr(). */
error = in6_getsockaddr(so, nam);
}
return error;
}
int
in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam)
{
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL"));
if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
error = in_getpeeraddr(so, nam);
if (error == 0)
in6_sin_2_v4mapsin6_in_sock(nam);
} else
/* scope issues will be handled in in6_getpeeraddr(). */
error = in6_getpeeraddr(so, nam);
return error;
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The local address and/or port numbers
* may be specified to limit the search. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*/
void
in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst,
u_int fport_arg, const struct sockaddr *src, u_int lport_arg,
int cmd, void *cmdarg,
struct inpcb *(*notify)(struct inpcb *, int))
{
struct inpcb *inp, *inp_temp;
struct sockaddr_in6 sa6_src, *sa6_dst;
u_short fport = fport_arg, lport = lport_arg;
u_int32_t flowinfo;
int errno;
if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6)
return;
sa6_dst = (struct sockaddr_in6 *)dst;
if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
return;
/*
* note that src can be NULL when we get notified by local fragmentation.
*/
sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src;
flowinfo = sa6_src.sin6_flowinfo;
/*
* Redirects go to all references to the destination,
* and use in6_rtchange to invalidate the route cache.
* Dead host indications: also use in6_rtchange to invalidate
* the cache, and deliver the error to all the sockets.
* Otherwise, if we have knowledge of the local port and address,
* deliver only to that socket.
*/
if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
fport = 0;
lport = 0;
bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr));
if (cmd != PRC_HOSTDEAD)
notify = in6_rtchange;
}
errno = inet6ctlerrmap[cmd];
INP_INFO_WLOCK(pcbinfo);
LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
if ((inp->inp_vflag & INP_IPV6) == 0) {
INP_WUNLOCK(inp);
continue;
}
/*
* If the error designates a new path MTU for a destination
* and the application (associated with this socket) wanted to
* know the value, notify.  Note that we notify all
* disconnected sockets if the corresponding application
* wanted this, because some UDP applications keep their
* sockets disconnected while sending.
* XXX: should we avoid notifying TCP sockets of the value?
*/
if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 &&
(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr))) {
ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst,
(u_int32_t *)cmdarg);
}
/*
* Detect whether we should notify the error.  If no source and
* destination ports are specified, but a non-zero flowinfo and
* the local address match, notify the error.  This is the case
* when the error is delivered with an encrypted buffer by ESP.
* Otherwise, just compare addresses and ports as usual.
*/
if (lport == 0 && fport == 0 && flowinfo &&
inp->inp_socket != NULL &&
flowinfo == (inp->in6p_flowinfo & IPV6_FLOWLABEL_MASK) &&
IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr))
goto do_notify;
else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
&sa6_dst->sin6_addr) ||
inp->inp_socket == 0 ||
(lport && inp->inp_lport != lport) ||
(!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
&sa6_src.sin6_addr)) ||
(fport && inp->inp_fport != fport)) {
INP_WUNLOCK(inp);
continue;
}
do_notify:
if (notify) {
if ((*notify)(inp, errno))
INP_WUNLOCK(inp);
} else
INP_WUNLOCK(inp);
}
INP_INFO_WUNLOCK(pcbinfo);
}
/*
* Lookup a PCB based on the local address and port.
*/
struct inpcb *
in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
u_short lport, int wild_okay, struct ucred *cred)
{
register struct inpcb *inp;
int matchwild = 3, wildcard;
INP_INFO_WLOCK_ASSERT(pcbinfo);
if (!wild_okay) {
struct inpcbhead *head;
/*
* Look for an unconnected (wildcard foreign addr) PCB that
* matches the local address and port we're looking for.
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
inp->inp_lport == lport) {
/*
* Found.
*/
return (inp);
}
}
/*
* Not found.
*/
return (NULL);
} else {
struct inpcbporthead *porthash;
struct inpcbport *phd;
struct inpcb *match = NULL;
/*
* Best fit PCB lookup.
*
* First see if this local port is in use by looking on the
* port hash list.
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
LIST_FOREACH(phd, porthash, phd_hash) {
if (phd->phd_port == lport)
break;
}
if (phd != NULL) {
/*
* Port is in use by one or more PCBs. Look for best
* fit.
*/
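/*
* "wildcard" scores how loosely a PCB matches: one point if
* the PCB is connected (foreign address specified), and one
* point if exactly one of the PCB's local address and the
* requested laddr is unspecified.  The lowest score wins;
* 0 is an exact match.
*/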
LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
wildcard = 0;
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
wildcard++;
if (!IN6_IS_ADDR_UNSPECIFIED(
&inp->in6p_laddr)) {
if (IN6_IS_ADDR_UNSPECIFIED(laddr))
wildcard++;
else if (!IN6_ARE_ADDR_EQUAL(
&inp->in6p_laddr, laddr))
continue;
} else {
if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
wildcard++;
}
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0) {
break;
}
}
}
}
return (match);
}
}
void
in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
struct in6pcb *in6p;
struct ip6_moptions *im6o;
struct in6_multi_mship *imm, *nimm;
INP_INFO_RLOCK(pcbinfo);
LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(in6p);
im6o = in6p->in6p_moptions;
if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (im6o->im6o_multicast_ifp == ifp)
im6o->im6o_multicast_ifp = NULL;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
* XXX controversial - is it really legal for kernel
* to force this?
*/
for (imm = im6o->im6o_memberships.lh_first;
imm != NULL; imm = nimm) {
nimm = imm->i6mm_chain.le_next;
if (imm->i6mm_maddr->in6m_ifp == ifp) {
LIST_REMOVE(imm, i6mm_chain);
in6_delmulti(imm->i6mm_maddr);
free(imm, M_IP6MADDR);
}
}
}
INP_WUNLOCK(in6p);
}
INP_INFO_RUNLOCK(pcbinfo);
}
/*
* Check for alternatives when higher level complains
* about service problems. For now, invalidate cached
* routing information. If the route was created dynamically
* (by a redirect), time to try a default gateway again.
*/
void
in6_losing(struct inpcb *in6p)
{
/*
* We no longer cache route pointers in the inpcb, so there is
* nothing to invalidate here.
*/
return;
}
/*
* After a routing change, flush old routing
* and allocate a (hopefully) better one.
*/
struct inpcb *
in6_rtchange(struct inpcb *inp, int errno)
{
/*
* We no longer cache route pointers in the inpcb, so there is
* nothing to reallocate; return the inp unchanged.
*/
return inp;
}
/*
* Lookup PCB in hash list.
*/
struct inpcb *
in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
u_int fport_arg, struct in6_addr *laddr, u_int lport_arg,
int wildcard, struct ifnet *ifp)
{
struct inpcbhead *head;
register struct inpcb *inp;
u_short fport = fport_arg, lport = lport_arg;
int faith;
INP_INFO_LOCK_ASSERT(pcbinfo);
if (faithprefix_p != NULL)
faith = (*faithprefix_p)(laddr);
else
faith = 0;
/*
* First look for an exact match.
*/
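/*
* Note that only the low-order 32 bits of the foreign address feed
* the hash (the XXX below); this is harmless for correctness since
* the full addresses are compared on every chain entry.
*/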
head = &pcbinfo->ipi_hashbase[
INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, lport, fport,
pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
inp->inp_fport == fport &&
inp->inp_lport == lport) {
/*
* Found.
*/
return (inp);
}
}
if (wildcard) {
struct inpcb *local_wild = NULL;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
LIST_FOREACH(inp, head, inp_hash) {
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
inp->inp_lport == lport) {
if (faith && (inp->inp_flags & INP_FAITH) == 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
laddr))
return (inp);
else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
local_wild = inp;
}
}
return (local_wild);
}
/*
* Not found.
*/
return (NULL);
}
void
init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m)
{
struct ip6_hdr *ip;
ip = mtod(m, struct ip6_hdr *);
bzero(sin6, sizeof(*sin6));
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = ip->ip6_src;
(void)sa6_recoverscope(sin6); /* XXX: should catch errors... */
return;
}
Index: head/sys/netinet6/in6_proto.c
===================================================================
--- head/sys/netinet6/in6_proto.c (revision 183549)
+++ head/sys/netinet6/in6_proto.c (revision 183550)
@@ -1,582 +1,585 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_proto.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ipstealth.h"
#include "opt_carp.h"
#include "opt_sctp.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/radix.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip_encap.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet6/tcp6_var.h>
#include <netinet6/raw_ip6.h>
#include <netinet6/udp6_var.h>
#include <netinet6/pim6_var.h>
#include <netinet6/nd6.h>
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#ifdef SCTP
#include <netinet/in_pcb.h>
#include <netinet/sctp_pcb.h>
#include <netinet/sctp.h>
#include <netinet/sctp_var.h>
#include <netinet6/sctp6_var.h>
#endif /* SCTP */
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
/*
* TCP/IP protocol family: IP6, ICMP6, UDP, TCP.
*/
extern struct domain inet6domain;
static struct pr_usrreqs nousrreqs;
#define PR_LISTEN 0
#define PR_ABRTACPTDIS 0
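/* XXX KAME/NetBSD compatibility stubs: FreeBSD's pr_flags has no such bits. */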
struct ip6protosw inet6sw[] = {
{
.pr_type = 0,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_IPV6,
.pr_init = ip6_init,
.pr_slowtimo = frag6_slowtimo,
.pr_drain = frag6_drain,
.pr_usrreqs = &nousrreqs,
},
{
.pr_type = SOCK_DGRAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_UDP,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = udp6_input,
.pr_ctlinput = udp6_ctlinput,
.pr_ctloutput = ip6_ctloutput,
.pr_usrreqs = &udp6_usrreqs,
},
{
.pr_type = SOCK_STREAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_TCP,
.pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN,
.pr_input = tcp6_input,
.pr_ctlinput = tcp6_ctlinput,
.pr_ctloutput = tcp_ctloutput,
#ifndef INET /* don't call initialization and timeout routines twice */
.pr_init = tcp_init,
.pr_fasttimo = tcp_fasttimo,
.pr_slowtimo = tcp_slowtimo,
#endif
.pr_drain = tcp_drain,
.pr_usrreqs = &tcp6_usrreqs,
},
#ifdef SCTP
{
.pr_type = SOCK_DGRAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_SCTP,
.pr_flags = PR_WANTRCVD,
.pr_input = sctp6_input,
.pr_ctlinput = sctp6_ctlinput,
.pr_ctloutput = sctp_ctloutput,
.pr_drain = sctp_drain,
.pr_usrreqs = &sctp6_usrreqs
},
{
.pr_type = SOCK_SEQPACKET,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_SCTP,
.pr_flags = PR_WANTRCVD,
.pr_input = sctp6_input,
.pr_ctlinput = sctp6_ctlinput,
.pr_ctloutput = sctp_ctloutput,
.pr_drain = sctp_drain,
.pr_usrreqs = &sctp6_usrreqs
},
{
.pr_type = SOCK_STREAM,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_SCTP,
.pr_flags = PR_WANTRCVD,
.pr_input = sctp6_input,
.pr_ctlinput = sctp6_ctlinput,
.pr_ctloutput = sctp_ctloutput,
.pr_drain = sctp_drain,
.pr_usrreqs = &sctp6_usrreqs
},
#endif /* SCTP */
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_RAW,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = rip6_input,
.pr_output = rip6_output,
.pr_ctlinput = rip6_ctlinput,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_ICMPV6,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = icmp6_input,
.pr_output = rip6_output,
.pr_ctlinput = rip6_ctlinput,
.pr_ctloutput = rip6_ctloutput,
.pr_init = icmp6_init,
.pr_fasttimo = icmp6_fasttimo,
.pr_usrreqs = &rip6_usrreqs
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_DSTOPTS,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = dest6_input,
.pr_usrreqs = &nousrreqs
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_ROUTING,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = route6_input,
.pr_usrreqs = &nousrreqs
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_FRAGMENT,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = frag6_input,
.pr_usrreqs = &nousrreqs
},
#ifdef IPSEC
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_AH,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = ipsec6_common_input,
.pr_usrreqs = &nousrreqs,
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_ESP,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = ipsec6_common_input,
.pr_ctlinput = esp6_ctlinput,
.pr_usrreqs = &nousrreqs,
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_IPCOMP,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = ipsec6_common_input,
.pr_usrreqs = &nousrreqs,
},
#endif /* IPSEC */
#ifdef INET
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_IPV4,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap6_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_init = encap_init,
.pr_usrreqs = &rip6_usrreqs
},
#endif /* INET */
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_IPV6,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap6_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_init = encap_init,
.pr_usrreqs = &rip6_usrreqs
},
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_PIM,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap6_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
},
#ifdef DEV_CARP
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_CARP,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = carp6_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
},
#endif /* DEV_CARP */
/* raw wildcard */
{
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = rip6_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
},
};
extern int in6_inithead(void **, int);
struct domain inet6domain = {
.dom_family = AF_INET6,
.dom_name = "internet6",
.dom_protosw = (struct protosw *)inet6sw,
.dom_protoswNPROTOSW = (struct protosw *)
&inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])],
#ifdef RADIX_MPATH
.dom_rtattach = rn6_mpath_inithead,
#else
.dom_rtattach = in6_inithead,
#endif
.dom_rtoffset = offsetof(struct sockaddr_in6, sin6_addr) << 3,
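/* dom_rtoffset is measured in bits, hence the << 3 on the byte offset. */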
.dom_maxrtkey = sizeof(struct sockaddr_in6),
.dom_ifattach = in6_domifattach,
.dom_ifdetach = in6_domifdetach
};
DOMAIN_SET(inet6);
/*
* Internet configuration info
*/
#ifndef IPV6FORWARDING
#ifdef GATEWAY6
#define IPV6FORWARDING 1 /* forward IP6 packets not for us */
#else
#define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */
#endif /* GATEWAY6 */
#endif /* !IPV6FORWARDING */
#ifndef IPV6_SENDREDIRECTS
#define IPV6_SENDREDIRECTS 1
#endif
int ip6_forwarding = IPV6FORWARDING; /* act as router? */
int ip6_sendredirects = IPV6_SENDREDIRECTS;
int ip6_defhlim = IPV6_DEFHLIM;
int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS;
int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 0 : 1" is dangerous */
int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */
int ip6_maxfrags; /* initialized in frag6.c:frag6_init() */
int ip6_log_interval = 5;
int ip6_hdrnestlimit = 15; /* How many header options will we process? */
int ip6_dad_count = 1; /* DupAddrDetectionTransmits */
int ip6_auto_flowlabel = 1;
int ip6_gif_hlim = 0;
int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */
int ip6_rr_prune = 5; /* walk router renumbering prefix
 * list every 5 sec. */
int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? */
int ip6_v6only = 1;
int ip6_keepfaith = 0;
time_t ip6_log_time = (time_t)0L;
#ifdef IPSTEALTH
int ip6stealth = 0;
#endif
int nd6_onlink_ns_rfc4861 = 0; /* allow 'on-link' nd6 NS (as in RFC 4861) */
/* icmp6 */
/*
* BSDI4 defines these variables in in_proto.c...
* XXX: what if we don't define INET? Should we define pmtu6_expire
* or so? (jinmei@kame.net 19990310)
*/
int pmtu_expire = 60*10;
int pmtu_probe = 60*2;
/* raw IP6 parameters */
/*
* Nominal space allocated to a raw ip socket.
*/
#define RIPV6SNDQ 8192
#define RIPV6RCVQ 8192
u_long rip6_sendspace = RIPV6SNDQ;
u_long rip6_recvspace = RIPV6RCVQ;
/* ICMPV6 parameters */
int icmp6_rediraccept = 1; /* accept and process redirects */
int icmp6_redirtimeout = 10 * 60; /* 10 minutes */
int icmp6errppslim = 100; /* 100pps */
/* control how to respond to NI queries */
int icmp6_nodeinfo = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK);
/* UDP on IP6 parameters */
int udp6_sendspace = 9216; /* really max datagram size */
int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6));
/* 40 1K datagrams */
/*
* sysctl related items.
*/
SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0,
"Internet6 Family");
/* net.inet6 */
SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW, 0, "IP6");
SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW, 0, "ICMP6");
SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW, 0, "UDP6");
SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW, 0, "TCP6");
#ifdef SCTP
SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW, 0, "SCTP6");
#endif
#ifdef IPSEC
SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW, 0, "IPSEC6");
#endif /* IPSEC */
/* net.inet6.ip6 */
static int
sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET6(curvnet);
int error = 0;
int old;
error = SYSCTL_OUT(req, arg1, sizeof(int));
if (error || !req->newptr)
return (error);
old = V_ip6_temp_preferred_lifetime;
error = SYSCTL_IN(req, arg1, sizeof(int));
if (V_ip6_temp_preferred_lifetime <
V_ip6_desync_factor + V_ip6_temp_regen_advance) {
V_ip6_temp_preferred_lifetime = old;
return (EINVAL);
}
return (error);
}
static int
sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET6(curvnet);
int error = 0;
int old;
error = SYSCTL_OUT(req, arg1, sizeof(int));
if (error || !req->newptr)
return (error);
old = V_ip6_temp_valid_lifetime;
error = SYSCTL_IN(req, arg1, sizeof(int));
if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) {
V_ip6_temp_valid_lifetime = old;
return (EINVAL);
}
return (error);
}
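/*
 * Both handlers above follow the usual sysctl read-copy-validate
 * pattern: SYSCTL_OUT() reports the current value, SYSCTL_IN()
 * tentatively stores the new one, and the saved value is restored if
 * the invariant (preferred lifetime never exceeding valid lifetime)
 * would be violated. For example, from userland (values in seconds,
 * illustrative only):
 *
 * # sysctl net.inet6.ip6.temppltime=86400
 * # sysctl net.inet6.ip6.tempvltime=604800
 */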
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING,
- forwarding, CTLFLAG_RW, &ip6_forwarding, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS,
- redirect, CTLFLAG_RW, &ip6_sendredirects, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM,
- hlim, CTLFLAG_RW, &ip6_defhlim, 0, "");
-SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD,
- &ip6stat, ip6stat, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS,
- maxfragpackets, CTLFLAG_RW, &ip6_maxfragpackets, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV,
- accept_rtadv, CTLFLAG_RW, &ip6_accept_rtadv, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH,
- keepfaith, CTLFLAG_RW, &ip6_keepfaith, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL,
- log_interval, CTLFLAG_RW, &ip6_log_interval, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT,
- hdrnestlimit, CTLFLAG_RW, &ip6_hdrnestlimit, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT,
- dad_count, CTLFLAG_RW, &ip6_dad_count, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL,
- auto_flowlabel, CTLFLAG_RW, &ip6_auto_flowlabel, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM,
- defmcasthlim, CTLFLAG_RW, &ip6_defmcasthlim, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM,
- gifhlim, CTLFLAG_RW, &ip6_gif_hlim, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_FORWARDING,
+ forwarding, CTLFLAG_RW, ip6_forwarding, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_SENDREDIRECTS,
+ redirect, CTLFLAG_RW, ip6_sendredirects, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFHLIM,
+ hlim, CTLFLAG_RW, ip6_defhlim, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STATS, stats,
+ CTLFLAG_RD, ip6stat, ip6stat, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS,
+ maxfragpackets, CTLFLAG_RW, ip6_maxfragpackets, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_ACCEPT_RTADV,
+ accept_rtadv, CTLFLAG_RW, ip6_accept_rtadv, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_KEEPFAITH,
+ keepfaith, CTLFLAG_RW, ip6_keepfaith, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_LOG_INTERVAL,
+ log_interval, CTLFLAG_RW, ip6_log_interval, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_HDRNESTLIMIT,
+ hdrnestlimit, CTLFLAG_RW, ip6_hdrnestlimit, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DAD_COUNT,
+ dad_count, CTLFLAG_RW, ip6_dad_count, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL,
+ auto_flowlabel, CTLFLAG_RW, ip6_auto_flowlabel, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_DEFMCASTHLIM,
+ defmcasthlim, CTLFLAG_RW, ip6_defmcasthlim, 0, "");
SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION,
kame_version, CTLFLAG_RD, __KAME_VERSION, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED,
- use_deprecated, CTLFLAG_RW, &ip6_use_deprecated, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE,
- rr_prune, CTLFLAG_RW, &ip6_rr_prune, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR,
- use_tempaddr, CTLFLAG_RW, &ip6_use_tempaddr, 0, "");
-SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime,
- CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_preferred_lifetime, 0,
- sysctl_ip6_temppltime, "I", "");
-SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime,
- CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_valid_lifetime, 0,
- sysctl_ip6_tempvltime, "I", "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY,
- v6only, CTLFLAG_RW, &ip6_v6only, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEPRECATED,
+ use_deprecated, CTLFLAG_RW, ip6_use_deprecated, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RR_PRUNE,
+ rr_prune, CTLFLAG_RW, ip6_rr_prune, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USETEMPADDR,
+ use_tempaddr, CTLFLAG_RW, ip6_use_tempaddr, 0, "");
+SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime,
+ CTLTYPE_INT|CTLFLAG_RW, ip6_temp_preferred_lifetime, 0,
+ sysctl_ip6_temppltime, "I", "");
+SYSCTL_V_OID(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime,
+ CTLTYPE_INT|CTLFLAG_RW, ip6_temp_valid_lifetime, 0,
+ sysctl_ip6_tempvltime, "I", "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_V6ONLY,
+ v6only, CTLFLAG_RW, ip6_v6only, 0, "");
+#ifndef VIMAGE
TUNABLE_INT("net.inet6.ip6.auto_linklocal", &ip6_auto_linklocal);
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL,
- auto_linklocal, CTLFLAG_RW, &ip6_auto_linklocal, 0, "");
-SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD,
- &rip6stat, rip6stat, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR,
- prefer_tempaddr, CTLFLAG_RW, &ip6_prefer_tempaddr, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE,
- use_defaultzone, CTLFLAG_RW, &ip6_use_defzone, 0,"");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS,
- maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, "");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU,
- mcast_pmtu, CTLFLAG_RW, &ip6_mcast_pmtu, 0, "");
+#endif
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL,
+ auto_linklocal, CTLFLAG_RW, ip6_auto_linklocal, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_RIP6STATS,
+ rip6stats, CTLFLAG_RD, rip6stat, rip6stat, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR,
+ prefer_tempaddr, CTLFLAG_RW, ip6_prefer_tempaddr, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE,
+ use_defaultzone, CTLFLAG_RW, ip6_use_defzone, 0,"");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MAXFRAGS,
+ maxfrags, CTLFLAG_RW, ip6_maxfrags, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_MCAST_PMTU,
+ mcast_pmtu, CTLFLAG_RW, ip6_mcast_pmtu, 0, "");
#ifdef IPSTEALTH
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW,
- &ip6stealth, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_ip6, IPV6CTL_STEALTH,
+ stealth, CTLFLAG_RW, ip6stealth, 0, "");
#endif
/* net.inet6.icmp6 */
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT,
- rediraccept, CTLFLAG_RW, &icmp6_rediraccept, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT,
- redirtimeout, CTLFLAG_RW, &icmp6_redirtimeout, 0, "");
-SYSCTL_STRUCT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD,
- &icmp6stat, icmp6stat, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE,
- nd6_prune, CTLFLAG_RW, &nd6_prune, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY,
- nd6_delay, CTLFLAG_RW, &nd6_delay, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES,
- nd6_umaxtries, CTLFLAG_RW, &nd6_umaxtries, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES,
- nd6_mmaxtries, CTLFLAG_RW, &nd6_mmaxtries, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK,
- nd6_useloopback, CTLFLAG_RW, &nd6_useloopback, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO,
- nodeinfo, CTLFLAG_RW, &icmp6_nodeinfo, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT,
- errppslimit, CTLFLAG_RW, &icmp6errppslim, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT,
- nd6_maxnudhint, CTLFLAG_RW, &nd6_maxnudhint, 0, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG,
- nd6_debug, CTLFLAG_RW, &nd6_debug, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT,
+ rediraccept, CTLFLAG_RW, icmp6_rediraccept, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT,
+ redirtimeout, CTLFLAG_RW, icmp6_redirtimeout, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_STATS,
+ stats, CTLFLAG_RD, icmp6stat, icmp6stat, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE,
+ nd6_prune, CTLFLAG_RW, nd6_prune, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DELAY,
+ nd6_delay, CTLFLAG_RW, nd6_delay, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES,
+ nd6_umaxtries, CTLFLAG_RW, nd6_umaxtries, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES,
+ nd6_mmaxtries, CTLFLAG_RW, nd6_mmaxtries, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK,
+ nd6_useloopback, CTLFLAG_RW, nd6_useloopback, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_NODEINFO,
+ nodeinfo, CTLFLAG_RW, icmp6_nodeinfo, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT,
+ errppslimit, CTLFLAG_RW, icmp6errppslim, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT,
+ nd6_maxnudhint, CTLFLAG_RW, nd6_maxnudhint, 0, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG,
+ nd6_debug, CTLFLAG_RW, nd6_debug, 0, "");
+
SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861,
nd6_onlink_ns_rfc4861, CTLFLAG_RW, &nd6_onlink_ns_rfc4861, 0,
"Accept 'on-link' nd6 NS in compliance with RFC 4861.");
Index: head/sys/netinet6/in6_rmx.c
===================================================================
--- head/sys/netinet6/in6_rmx.c (revision 183549)
+++ head/sys/netinet6/in6_rmx.c (revision 183550)
@@ -1,478 +1,490 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $
*/
/*-
* Copyright 1994, 1995 Massachusetts Institute of Technology
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby
* granted, provided that both the above copyright notice and this
* permission notice appear in all copies, that both the above
* copyright notice and this permission notice appear in all
* supporting documentation, and that the name of M.I.T. not be used
* in advertising or publicity pertaining to distribution of the
* software without specific, written prior permission. M.I.T. makes
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied
* warranty.
*
* THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
* ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
* SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* This code does two things necessary for the enhanced TCP metrics to
* function in a useful manner:
* 1) It marks all non-host routes as `cloning', thus ensuring that
* every actual reference to such a route actually gets turned
* into a reference to a host route to the specific destination
* requested.
* 2) When such routes lose all their references, it arranges for them
* to be deleted in some random collection of circumstances, so that
* a large quantity of stale routing data is not kept in kernel memory
* indefinitely. See in6_rtqtimo() below for the exact mechanism.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/callout.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
extern int in6_inithead(void **head, int off);
#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */
/*
* Do what we need to do when inserting a route.
*/
static struct radix_node *
in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
struct radix_node *treenodes)
{
struct rtentry *rt = (struct rtentry *)treenodes;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
struct radix_node *ret;
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
rt->rt_flags |= RTF_MULTICAST;
/*
* A little bit of help for both IPv6 output and input:
* For local addresses, we make sure that RTF_LOCAL is set,
* with the thought that this might one day be used to speed up
* ip_input().
*
* We also mark routes to multicast addresses as such, because
* it's easy to do and might be useful (but this is much more
* dubious since it's so easy to inspect the address). (This
* is done above.)
*
* XXX
* should elaborate the code.
*/
if (rt->rt_flags & RTF_HOST) {
if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
->sin6_addr,
&sin6->sin6_addr)) {
rt->rt_flags |= RTF_LOCAL;
}
}
if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
ret = rn_addroute(v_arg, n_arg, head, treenodes);
if (ret == NULL && rt->rt_flags & RTF_HOST) {
struct rtentry *rt2;
/*
* We are trying to add a host route, but can't.
* Find out if it is because of an
* ND (neighbor cache) entry and delete it if so.
*/
rt2 = rtalloc1((struct sockaddr *)sin6, 0, RTF_CLONING);
if (rt2) {
if (rt2->rt_flags & RTF_LLINFO &&
rt2->rt_flags & RTF_HOST &&
rt2->rt_gateway &&
rt2->rt_gateway->sa_family == AF_LINK) {
rtexpunge(rt2);
RTFREE_LOCKED(rt2);
ret = rn_addroute(v_arg, n_arg, head,
treenodes);
} else
RTFREE_LOCKED(rt2);
}
} else if (ret == NULL && rt->rt_flags & RTF_CLONING) {
struct rtentry *rt2;
/*
* We are trying to add a net route, but can't.
* The following case should be allowed, so we'll make a
* special check for this:
* Two IPv6 addresses with the same prefix are assigned
* to a single interface.
* # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
* # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
* In this case, (*1) and (*2) want to add the same
* net route entry, 3ffe:0501:: -> if0.
* This case should not raise an error.
*/
rt2 = rtalloc1((struct sockaddr *)sin6, 0, RTF_CLONING);
if (rt2) {
if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY))
== RTF_CLONING
&& rt2->rt_gateway
&& rt2->rt_gateway->sa_family == AF_LINK
&& rt2->rt_ifp == rt->rt_ifp) {
ret = rt2->rt_nodes;
}
RTFREE_LOCKED(rt2);
}
}
return ret;
}
/*
* This code is the inverse of in6_clsroute: on first reference, if we
* were managing the route, stop doing so and set the expiration timer
* back off again.
*/
static struct radix_node *
in6_matroute(void *v_arg, struct radix_node_head *head)
{
struct radix_node *rn = rn_match(v_arg, head);
struct rtentry *rt = (struct rtentry *)rn;
if (rt && rt->rt_refcnt == 0) { /* this is first reference */
if (rt->rt_flags & RTPRF_OURS) {
rt->rt_flags &= ~RTPRF_OURS;
rt->rt_rmx.rmx_expire = 0;
}
}
return rn;
}
SYSCTL_DECL(_net_inet6_ip6);
static int rtq_reallyold6 = 60*60;
/* one hour is ``really old'' */
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire,
CTLFLAG_RW, &rtq_reallyold6 , 0, "");
static int rtq_minreallyold6 = 10;
/* never automatically crank down to less */
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire,
CTLFLAG_RW, &rtq_minreallyold6 , 0, "");
static int rtq_toomany6 = 128;
/* 128 cached routes is ``too many'' */
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache,
CTLFLAG_RW, &rtq_toomany6 , 0, "");
/*
* On last reference drop, mark the route as belonging to us so that it can be
* timed out.
*/
static void
in6_clsroute(struct radix_node *rn, struct radix_node_head *head)
{
+ INIT_VNET_INET6(curvnet);
struct rtentry *rt = (struct rtentry *)rn;
RT_LOCK_ASSERT(rt);
if (!(rt->rt_flags & RTF_UP))
return; /* prophylactic measures */
if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
return;
if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED)
return;
/*
* As requested by David Greenman:
* If rtq_reallyold6 is 0, just delete the route without
* waiting for a timeout cycle to kill it.
*/
if (V_rtq_reallyold6 != 0) {
rt->rt_flags |= RTPRF_OURS;
rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold6;
} else {
rtexpunge(rt);
}
}
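/*
 * In other words (illustrative numbers): with the default
 * rtq_reallyold6 of one hour, a cloned host route whose last
 * reference is dropped at time t is tagged RTPRF_OURS with
 * rmx_expire = t + 3600, and in6_rtqkill() below reclaims it on a
 * later timer pass. Setting
 *
 * # sysctl net.inet6.ip6.rtexpire=0
 *
 * makes such routes be expunged immediately instead.
 */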
struct rtqk_arg {
struct radix_node_head *rnh;
int mode;
int updating;
int draining;
int killed;
int found;
time_t nextstop;
};
/*
* Get rid of old routes. When draining, this deletes everything, even when
* the timeout has not expired yet. When updating, this makes sure that
* nothing has a timeout longer than the current value of rtq_reallyold6.
*/
static int
in6_rtqkill(struct radix_node *rn, void *rock)
{
+ INIT_VNET_INET6(curvnet);
struct rtqk_arg *ap = rock;
struct rtentry *rt = (struct rtentry *)rn;
int err;
if (rt->rt_flags & RTPRF_OURS) {
ap->found++;
if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) {
if (rt->rt_refcnt > 0)
panic("rtqkill route really not free");
err = rtrequest(RTM_DELETE,
(struct sockaddr *)rt_key(rt),
rt->rt_gateway, rt_mask(rt),
rt->rt_flags, 0);
if (err) {
log(LOG_WARNING, "in6_rtqkill: error %d", err);
} else {
ap->killed++;
}
} else {
if (ap->updating
&& (rt->rt_rmx.rmx_expire - time_uptime
> V_rtq_reallyold6)) {
rt->rt_rmx.rmx_expire = time_uptime
+ V_rtq_reallyold6;
}
ap->nextstop = lmin(ap->nextstop,
rt->rt_rmx.rmx_expire);
}
}
return 0;
}
#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */
static int rtq_timeout6 = RTQ_TIMEOUT;
static struct callout rtq_timer6;
static void
in6_rtqtimo(void *rock)
{
+ CURVNET_SET_QUIET((struct vnet *) rock);
+ INIT_VNET_NET((struct vnet *) rock);
+ INIT_VNET_INET6((struct vnet *) rock);
struct radix_node_head *rnh = rock;
struct rtqk_arg arg;
struct timeval atv;
static time_t last_adjusted_timeout = 0;
arg.found = arg.killed = 0;
arg.rnh = rnh;
arg.nextstop = time_uptime + V_rtq_timeout6;
arg.draining = arg.updating = 0;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
/*
* Attempt to be somewhat dynamic about this:
* If there are ``too many'' routes sitting around taking up space,
* then crank down the timeout, and see if we can't make some more
* go away. However, we make sure that we will never adjust more
* than once in rtq_timeout6 seconds, to keep from cranking down too
* hard.
*/
if ((arg.found - arg.killed > V_rtq_toomany6)
&& (time_uptime - last_adjusted_timeout >= V_rtq_timeout6)
&& V_rtq_reallyold6 > V_rtq_minreallyold6) {
V_rtq_reallyold6 = 2*V_rtq_reallyold6 / 3;
if (V_rtq_reallyold6 < V_rtq_minreallyold6) {
V_rtq_reallyold6 = V_rtq_minreallyold6;
}
last_adjusted_timeout = time_uptime;
#ifdef DIAGNOSTIC
log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold6 to %d",
V_rtq_reallyold6);
#endif
arg.found = arg.killed = 0;
arg.updating = 1;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
atv.tv_usec = 0;
atv.tv_sec = arg.nextstop - time_uptime;
callout_reset(&V_rtq_timer6, tvtohz(&atv), in6_rtqtimo, rock);
+ CURVNET_RESTORE();
}
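/*
 * XXX "rock" doubles as the vnet context and the radix head above; in
 * a kernel built without "options VIMAGE" the CURVNET/INIT_VNET macros
 * expand to nothing, so only the radix-head reading is live.
 *
 * Worked example of the back-off (illustrative): with rtq_reallyold6 =
 * 3600 and more than rtq_toomany6 (128) unreferenced cached routes
 * surviving a pass, the lifetime is cut to 2/3 on each qualifying
 * pass, 3600 -> 2400 -> 1600 -> ..., bottoming out at
 * rtq_minreallyold6, and at most once per rtq_timeout6 (600 s)
 * interval.
 */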
/*
* Age old PMTUs.
*/
struct mtuex_arg {
struct radix_node_head *rnh;
time_t nextstop;
};
static struct callout rtq_mtutimer;
static int
in6_mtuexpire(struct radix_node *rn, void *rock)
{
struct rtentry *rt = (struct rtentry *)rn;
struct mtuex_arg *ap = rock;
/* sanity */
if (!rt)
panic("rt == NULL in in6_mtuexpire");
if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
if (rt->rt_rmx.rmx_expire <= time_uptime) {
rt->rt_flags |= RTF_PROBEMTU;
} else {
ap->nextstop = lmin(ap->nextstop,
rt->rt_rmx.rmx_expire);
}
}
return 0;
}
#define MTUTIMO_DEFAULT (60*1)
static void
in6_mtutimo(void *rock)
{
+ CURVNET_SET_QUIET((struct vnet *) rock);
+ INIT_VNET_NET((struct vnet *) rock);
+ INIT_VNET_INET6((struct vnet *) rock);
struct radix_node_head *rnh = rock;
struct mtuex_arg arg;
struct timeval atv;
arg.rnh = rnh;
arg.nextstop = time_uptime + MTUTIMO_DEFAULT;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
atv.tv_usec = 0;
atv.tv_sec = arg.nextstop - time_uptime;
if (atv.tv_sec < 0) {
printf("invalid mtu expiration time on routing table\n");
arg.nextstop = time_uptime + 30; /* last resort */
atv.tv_sec = 30;
}
callout_reset(&V_rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock);
+ CURVNET_RESTORE();
}
#if 0
void
in6_rtqdrain(void)
{
+ INIT_VNET_NET(curvnet);
struct radix_node_head *rnh = V_rt_tables[AF_INET6];
struct rtqk_arg arg;
arg.found = arg.killed = 0;
arg.rnh = rnh;
arg.nextstop = 0;
arg.draining = 1;
arg.updating = 0;
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
RADIX_NODE_HEAD_UNLOCK(rnh);
}
#endif
/*
* Initialize our routing tree.
* XXX MRT When off == 0, we are being called from vfs_export.c
* so just set up their table and leave. (We know what the correct
* value should be, so just use that.) FIX AFTER RELENG_7 is MFC'd;
* see also the comments in in_inithead(), vfs_export.c and domain.h.
*/
int
in6_inithead(void **head, int off)
{
+ INIT_VNET_INET6(curvnet);
struct radix_node_head *rnh;
if (!rn_inithead(head, offsetof(struct sockaddr_in6, sin6_addr) << 3))
return 0; /* See above */
if (off == 0) /* See above */
return 1; /* only do the rest for the real thing */
rnh = *head;
rnh->rnh_addaddr = in6_addroute;
rnh->rnh_matchaddr = in6_matroute;
rnh->rnh_close = in6_clsroute;
callout_init(&V_rtq_timer6, CALLOUT_MPSAFE);
in6_rtqtimo(rnh); /* kick off timeout first time */
callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE);
in6_mtutimo(rnh); /* kick off timeout first time */
return 1;
}
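/*
 * At this stage of the virtualization work the V_ prefixed names used
 * above (V_rtq_timer6, V_rtq_mtutimer, ...) still resolve to the
 * file-scope globals; the INIT_VNET_*() and CURVNET_*() calls are
 * no-ops without "options VIMAGE" and merely stage the code for
 * per-vnet instances.
 */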
Index: head/sys/netinet6/in6_src.c
===================================================================
--- head/sys/netinet6/in6_src.c (revision 183549)
+++ head/sys/netinet6/in6_src.c (revision 183550)
@@ -1,1108 +1,1120 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/sx.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
static struct mtx addrsel_lock;
#define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF)
#define ADDRSEL_LOCK() mtx_lock(&addrsel_lock)
#define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock)
#define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED)
static struct sx addrsel_sxlock;
#define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock")
#define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock)
#define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock)
#define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock)
#define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock)
#define ADDR_LABEL_NOTAPP (-1)
struct in6_addrpolicy defaultaddrpolicy;
int ip6_prefer_tempaddr = 0;
static int selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *, struct ifnet **,
struct rtentry **, int, int));
static int in6_selectif __P((struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route_in6 *ro, struct ifnet **));
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy __P((int (*)(struct in6_addrpolicy *, void *),
void *));
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
#define REPLACE(r) do {\
if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \
sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
V_ip6stat.ip6s_sources_rule[(r)]++; \
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto replace; \
} while(0)
#define NEXT(r) do {\
if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \
sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
V_ip6stat.ip6s_sources_rule[(r)]++; \
/* { \
char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
} */ \
goto next; /* XXX: we can't use 'continue' here */ \
} while(0)
#define BREAK(r) do { \
if ((r) < sizeof(V_ip6stat.ip6s_sources_rule) / \
sizeof(V_ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
V_ip6stat.ip6s_sources_rule[(r)]++; \
goto out; /* XXX: we can't use 'break' here */ \
} while(0)
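/*
 * REPLACE(r), NEXT(r) and BREAK(r) implement one step of the RFC
 * 3484-style source selection loop below: each bumps the per-rule
 * counter ip6s_sources_rule[r] and then jumps to the "replace" (adopt
 * the candidate), "next" (keep the incumbent) or "out" (stop scanning)
 * label. Plain continue/break cannot be used because the bookkeeping
 * sits inside a do { } while(0) macro, where they would bind to the
 * macro's own loop.
 */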
struct in6_addr *
in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct inpcb *inp, struct route_in6 *ro, struct ucred *cred,
struct ifnet **ifpp, int *errorp)
{
+ INIT_VNET_INET6(curvnet);
struct in6_addr dst;
struct ifnet *ifp = NULL;
struct in6_ifaddr *ia = NULL, *ia_best = NULL;
struct in6_pktinfo *pi = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
u_int32_t odstzone;
int prefer_tempaddr;
struct ip6_moptions *mopts;
dst = dstsock->sin6_addr; /* make a copy for local operation */
*errorp = 0;
if (ifpp)
*ifpp = NULL;
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
mopts = inp->in6p_moptions;
} else {
mopts = NULL;
}
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
struct sockaddr_in6 srcsock;
struct in6_ifaddr *ia6;
/* get the outgoing interface */
if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp))
!= 0) {
return (NULL);
}
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
bzero(&srcsock, sizeof(srcsock));
srcsock.sin6_family = AF_INET6;
srcsock.sin6_len = sizeof(srcsock);
srcsock.sin6_addr = pi->ipi6_addr;
if (ifp) {
*errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL);
if (*errorp != 0)
return (NULL);
}
ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock));
if (ia6 == NULL ||
(ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) {
*errorp = EADDRNOTAVAIL;
return (NULL);
}
pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
return (&ia6->ia_addr.sin6_addr);
}
/*
* Otherwise, if the socket has already bound the source, just use it.
*/
if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
return (&inp->in6p_laddr);
}
/*
* If the address is not specified, choose the best one based on
* the outgoing interface and the destination address.
*/
/* get the outgoing interface */
if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, &ifp)) != 0)
return (NULL);
#ifdef DIAGNOSTIC
if (ifp == NULL) /* this should not happen */
panic("in6_selectsrc: NULL ifp");
#endif
*errorp = in6_setscope(&dst, ifp, &odstzone);
if (*errorp != 0)
return (NULL);
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(&dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags &
(IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) {
continue;
}
if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(0);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(&dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
*/
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
/*
* XXX: This is a TODO. We should probably merge the MIP6
* case above.
*/
/* Rule 5: Prefer outgoing interface */
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = V_ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr,
&dst));
next:
continue;
out:
break;
}
if ((ia = ia_best) == NULL) {
*errorp = EADDRNOTAVAIL;
return (NULL);
}
if (ifpp)
*ifpp = ifp;
return (&ia->ia_addr.sin6_addr);
}
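/*
 * Summary of the rule ordering applied above: 1 same address,
 * 2 appropriate scope, 3 avoid deprecated addresses, 5 prefer the
 * outgoing interface, 6 matching policy label, 7 public vs. temporary
 * preference, 8 alive interfaces (a KAME extension), 14 longest
 * matching prefix. Rule 4 (home addresses) is a TODO and rule 15 is
 * reserved, per the comments in the loop.
 */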
/*
* clone - meaningful only for bsdi and freebsd
*/
static int
selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt, int clone,
int norouteok)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
int error = 0;
struct ifnet *ifp = NULL;
struct rtentry *rt = NULL;
struct sockaddr_in6 *sin6_next;
struct in6_pktinfo *pi = NULL;
struct in6_addr *dst = &dstsock->sin6_addr;
#if 0
char ip6buf[INET6_ADDRSTRLEN];
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
printf("in6_selectroute: strange destination %s\n",
ip6_sprintf(ip6buf, &dstsock->sin6_addr));
} else {
printf("in6_selectroute: destination = %s%%%d\n",
ip6_sprintf(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
ifp = ifnet_byindex(pi->ipi6_ifindex);
if (ifp != NULL &&
(norouteok || retrt == NULL ||
IN6_IS_ADDR_MULTICAST(dst))) {
/*
* we do not have to check or get the route for
* multicast.
*/
goto done;
} else
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) &&
mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
goto done; /* we do not need a route for multicast. */
}
getroute:
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route_in6 *ron;
sin6_next = satosin6(opts->ip6po_nexthop);
/* at this moment, we only support AF_INET6 next hops */
if (sin6_next->sin6_family != AF_INET6) {
error = EAFNOSUPPORT; /* or should we proceed? */
goto done;
}
/*
* If the next hop is an IPv6 address, then the node identified
* by that address must be a neighbor of the sending host.
*/
ron = &opts->ip6po_nextroute;
if ((ron->ro_rt &&
(ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
(RTF_UP | RTF_LLINFO)) ||
!IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr,
&sin6_next->sin6_addr)) {
if (ron->ro_rt) {
RTFREE(ron->ro_rt);
ron->ro_rt = NULL;
}
*satosin6(&ron->ro_dst) = *sin6_next;
}
if (ron->ro_rt == NULL) {
rtalloc((struct route *)ron); /* multi path case? */
if (ron->ro_rt == NULL ||
!(ron->ro_rt->rt_flags & RTF_LLINFO)) {
if (ron->ro_rt) {
RTFREE(ron->ro_rt);
ron->ro_rt = NULL;
}
error = EHOSTUNREACH;
goto done;
}
}
rt = ron->ro_rt;
ifp = rt->rt_ifp;
/*
* When cloning is required, try to allocate a route to the
* destination so that the caller can store path MTU
* information.
*/
if (!clone)
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case the cache is shared with IPv4.
*/
if (ro) {
if (ro->ro_rt &&
(!(ro->ro_rt->rt_flags & RTF_UP) ||
((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
!IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
dst))) {
RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)NULL;
}
if (ro->ro_rt == (struct rtentry *)NULL) {
struct sockaddr_in6 *sa6;
/* No route yet, so try to acquire one */
bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
if (clone) {
#ifdef RADIX_MPATH
rtalloc_mpath((struct route *)ro,
ntohl(sa6->sin6_addr.s6_addr32[3]));
#else
rtalloc((struct route *)ro);
#endif
} else {
ro->ro_rt = rtalloc1(&((struct route *)ro)
->ro_dst, 0, 0UL);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
}
/*
* do not care about the result if we have the nexthop
* explicitly specified.
*/
if (opts && opts->ip6po_nexthop)
goto done;
if (ro->ro_rt) {
ifp = ro->ro_rt->rt_ifp;
if (ifp == NULL) { /* can this really happen? */
RTFREE(ro->ro_rt);
ro->ro_rt = NULL;
}
}
if (ro->ro_rt == NULL)
error = EHOSTUNREACH;
rt = ro->ro_rt;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that the loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (ifp && opts && opts->ip6po_pktinfo &&
opts->ip6po_pktinfo->ipi6_ifindex) {
if (!(ifp->if_flags & IFF_LOOPBACK) &&
ifp->if_index !=
opts->ip6po_pktinfo->ipi6_ifindex) {
error = EHOSTUNREACH;
goto done;
}
}
}
done:
if (ifp == NULL && rt == NULL) {
/*
* This can happen if the caller did not pass a cached route
* or any other hints. We treat this case as an error.
*/
error = EHOSTUNREACH;
}
if (error == EHOSTUNREACH)
V_ip6stat.ip6s_noroute++;
if (retifp != NULL)
*retifp = ifp;
if (retrt != NULL)
*retrt = rt; /* rt may be NULL */
return (error);
}
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp)
{
int error;
struct route_in6 sro;
struct rtentry *rt = NULL;
if (ro == NULL) {
bzero(&sro, sizeof(sro));
ro = &sro;
}
if ((error = selectroute(dstsock, opts, mopts, ro, retifp,
&rt, 0, 1)) != 0) {
if (ro == &sro && rt && rt == sro.ro_rt)
RTFREE(rt);
return (error);
}
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it is still confusing.
* We thus reject the case here.
*/
if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
if (ro == &sro && rt && rt == sro.ro_rt)
RTFREE(rt);
return (flags);
}
/*
* Adjust the "outgoing" interface. If we're going to loop the packet
* back to ourselves, the ifp would be the loopback interface.
* However, we'd rather know the interface associated to the
* destination address (which should probably be one of our own
* addresses.)
*/
if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp)
*retifp = rt->rt_ifa->ifa_ifp;
if (ro == &sro && rt && rt == sro.ro_rt)
RTFREE(rt);
return (0);
}
/*
* clone - meaningful only for bsdi and freebsd
*/
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route_in6 *ro,
struct ifnet **retifp, struct rtentry **retrt, int clone)
{
return (selectroute(dstsock, opts, mopts, ro, retifp,
retrt, clone, 0));
}
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6_selecthlim(struct in6pcb *in6p, struct ifnet *ifp)
{
+ INIT_VNET_INET6(curvnet);
if (in6p && in6p->in6p_hops >= 0)
return (in6p->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
struct route_in6 ro6;
struct ifnet *lifp;
bzero(&ro6, sizeof(ro6));
ro6.ro_dst.sin6_family = AF_INET6;
ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
ro6.ro_dst.sin6_addr = in6p->in6p_faddr;
rtalloc((struct route *)&ro6);
if (ro6.ro_rt) {
lifp = ro6.ro_rt->rt_ifp;
RTFREE(ro6.ro_rt);
if (lifp)
return (ND_IFINFO(lifp)->chlim);
} else
return (V_ip6_defhlim);
}
return (V_ip6_defhlim);
}
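/*
 * Example (illustrative): a TCP connection over an interface whose
 * router advertisements carry CurHopLimit = 64 gets hop limit 64 from
 * ND_IFINFO(ifp)->chlim, unless the application set one via the
 * IPV6_UNICAST_HOPS socket option, in which case in6p_hops >= 0 takes
 * precedence; with neither available, V_ip6_defhlim applies.
 */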
/*
* XXX: this is borrowed from in6_pcbbind(). If possible, we should
* share this function by all *bsd*...
*/
int
in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
{
+ INIT_VNET_INET(curvnet);
struct socket *so = inp->inp_socket;
u_int16_t lport = 0, first, last, *lastport;
int count, error = 0, wild = 0;
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
/* XXX: this is redundant when called from in6_pcbbind */
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = INPLOOKUP_WILDCARD;
inp->inp_flags |= INP_ANONPORT;
if (inp->inp_flags & INP_HIGHPORT) {
first = V_ipport_hifirstauto; /* sysctl */
last = V_ipport_hilastauto;
lastport = &pcbinfo->ipi_lasthi;
} else if (inp->inp_flags & INP_LOWPORT) {
error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
if (error)
return error;
first = V_ipport_lowfirstauto; /* 1023 */
last = V_ipport_lowlastauto; /* 600 */
lastport = &pcbinfo->ipi_lastlow;
} else {
first = V_ipport_firstauto; /* sysctl */
last = V_ipport_lastauto;
lastport = &pcbinfo->ipi_lastport;
}
/*
* Simple check to ensure that not all ports are used up, which
* would cause a deadlock here.
*
* We split the two cases (up and down) so that the direction
* is not being tested on each round of the loop.
*/
if (first > last) {
/*
* counting down
*/
count = first - last;
do {
if (count-- < 0) { /* completely used? */
/*
* Undo any address bind that may have
* occurred above.
*/
inp->in6p_laddr = in6addr_any;
return (EAGAIN);
}
--*lastport;
if (*lastport > first || *lastport < last)
*lastport = first;
lport = htons(*lastport);
} while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr,
lport, wild, cred));
} else {
/*
* counting up
*/
count = last - first;
do {
if (count-- < 0) { /* completely used? */
/*
* Undo any address bind that may have
* occurred above.
*/
inp->in6p_laddr = in6addr_any;
return (EAGAIN);
}
++*lastport;
if (*lastport < first || *lastport > last)
*lastport = first;
lport = htons(*lastport);
} while (in6_pcblookup_local(pcbinfo, &inp->in6p_laddr,
lport, wild, cred));
}
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->in6p_laddr = in6addr_any;
inp->inp_lport = 0;
return (EAGAIN);
}
return (0);
}
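/*
 * Worked example of the upward walk (illustrative values): with
 * first = 1024 and last = 65535, *lastport is advanced and wrapped
 * back to first whenever it leaves [first, last]; after at most
 * last - first + 1 probes without finding a free port the loop gives
 * up with EAGAIN and undoes any address bind.
 */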
void
addrsel_policy_init(void)
{
ADDRSEL_LOCK_INIT();
ADDRSEL_SXLOCK_INIT();
+ INIT_VNET_INET6(curvnet);
init_policy_queue();
/* initialize the "last resort" policy */
bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy));
V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
}
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
+ INIT_VNET_INET6(curvnet);
struct in6_addrpolicy *match = NULL;
ADDRSEL_LOCK();
match = match_addrsel_policy(key);
if (match == NULL)
match = &V_defaultaddrpolicy;
else
match->use++;
ADDRSEL_UNLOCK();
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct walkarg {
struct sysctl_req *w_req;
};
static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
CTLFLAG_RD, in6_src_sysctl, "");
static int
in6_src_sysctl(SYSCTL_HANDLER_ARGS)
{
struct walkarg w;
if (req->newptr)
return EPERM;
bzero(&w, sizeof(w));
w.w_req = req;
return (walk_addrsel_policy(dump_addrsel_policyent, &w));
}
int
in6_src_ioctl(u_long cmd, caddr_t data)
{
int i;
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check if the prefix mask is consecutive. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear any trailing garbage in the prefix address. */
for (i = 0; i < 4; i++) {
ent0.addr.sin6_addr.s6_addr32[i] &=
ent0.addrmask.sin6_addr.s6_addr32[i];
}
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: pacify compilers */
}
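/*
 * These ioctls back the ip6addrctl(8) utility; e.g. (illustrative)
 *
 * # ip6addrctl add 2001:db8::/32 45 13
 *
 * arrives here as SIOCAADDRCTL_POLICY with the prefix address already
 * canonicalized against its mask by the loop above.
 */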
/*
* The following is an implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX implementation using binary tree should be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
struct addrsel_policyhead addrsel_policytab;
static void
init_policy_queue(void)
{
+ INIT_VNET_INET6(curvnet);
TAILQ_INIT(&V_addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
+ INIT_VNET_INET6(curvnet);
struct addrsel_policyent *new, *pol;
MALLOC(new, struct addrsel_policyent *, sizeof(*new), M_IFADDR,
M_WAITOK);
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* duplication check */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
FREE(new, M_IFADDR);
return (EEXIST); /* or override it? */
}
}
bzero(new, sizeof(*new));
/* XXX: should validate entry */
new->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
+ INIT_VNET_INET6(curvnet);
struct addrsel_policyent *pol;
ADDRSEL_XLOCK();
ADDRSEL_LOCK();
/* search for the entry in the table */
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (ESRCH);
}
TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry);
ADDRSEL_UNLOCK();
ADDRSEL_XUNLOCK();
return (0);
}
static int
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *),
void *w)
{
+ INIT_VNET_INET6(curvnet);
struct addrsel_policyent *pol;
int error = 0;
ADDRSEL_SLOCK();
TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0) {
ADDRSEL_SUNLOCK();
return (error);
}
}
ADDRSEL_SUNLOCK();
return (error);
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct walkarg *w = arg;
error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol));
return (error);
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
+ INIT_VNET_INET6(curvnet);
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* no match */
if (m == 0xff) /* short cut for a typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}
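/*
 * Worked example (illustrative): with policies ::1/128 and
 * 2001:db8::/32 installed, a key of 2001:db8:1::1 fails the ::1/128
 * comparison on the first byte and skips it, while the 2001:db8::/32
 * entry matches its four 0xff mask bytes (matchlen += 8 each) and
 * stops at the first zero mask byte, so it wins with matchlen = 32.
 */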
Index: head/sys/netinet6/ip6_forward.c
===================================================================
--- head/sys/netinet6/ip6_forward.c (revision 183549)
+++ head/sys/netinet6/ip6_forward.c (revision 183550)
@@ -1,657 +1,659 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: ip6_forward.c,v 1.69 2001/05/17 03:48:30 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ipstealth.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet/in_pcb.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
struct route_in6 ip6_forward_rt;
/*
* Forward a packet.  If some error occurs, return an icmp packet
* to the sender.  Note we can't always generate a meaningful
* icmp message because icmp doesn't have a large enough repertoire
* of codes and types.
*
* If not forwarding, just drop the packet. This could be confusing
* if ipforwarding was zero but some routing protocol was advancing
* us as a gateway to somewhere. However, we must let the routing
* protocol deal with that.
*
*/
void
ip6_forward(struct mbuf *m, int srcrt)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct sockaddr_in6 *dst = NULL;
struct rtentry *rt = NULL;
int error, type = 0, code = 0;
struct mbuf *mcopy = NULL;
struct ifnet *origifp; /* maybe unnecessary */
u_int32_t inzone, outzone;
struct in6_addr src_in6, dst_in6;
#ifdef IPSEC
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp = NULL;
int ipsecrt = 0;
#endif
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/* GIANT_REQUIRED; */ /* XXX bz: ip6_forward_rt */
#ifdef IPSEC
/*
* Check AH/ESP integrity.
*/
/*
* Don't increment ip6s_cantforward because this check takes place
* before the packet is actually forwarded.
*/
if (ipsec6_in_reject(m, NULL)) {
V_ipsec6stat.in_polvio++;
m_freem(m);
return;
}
#endif /* IPSEC */
/*
* Do not forward packets to a multicast destination (those should be
* handled by ip6_mforward()).
* Do not forward packets with an unspecified source.  This was
* discussed in July 2000, on the ipngwg mailing list.
*/
if ((m->m_flags & (M_BCAST|M_MCAST)) != 0 ||
IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
V_ip6stat.ip6s_cantforward++;
/* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */
if (V_ip6_log_time + V_ip6_log_interval < time_second) {
V_ip6_log_time = time_second;
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on %s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
if_name(m->m_pkthdr.rcvif));
}
m_freem(m);
return;
}
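/*
 * Decrement the hop limit; if it would be consumed entirely,
 * discard the packet and return an ICMPv6 time-exceeded error
 * instead.  With IPSTEALTH enabled the decrement is skipped, so
 * this forwarding hop does not reveal itself (e.g., to traceroute).
 */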
#ifdef IPSTEALTH
if (!V_ip6stealth) {
#endif
if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
/* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */
icmp6_error(m, ICMP6_TIME_EXCEEDED,
ICMP6_TIME_EXCEED_TRANSIT, 0);
return;
}
ip6->ip6_hlim -= IPV6_HLIMDEC;
#ifdef IPSTEALTH
}
#endif
/*
* Save at most ICMPV6_PLD_MAXLEN (= the min IPv6 MTU -
* size of IPv6 + ICMPv6 headers) bytes of the packet in case
* we need to generate an ICMP6 message to the src.
* Thanks to M_EXT, in most cases copy will not occur.
*
* It is important to save it before IPsec processing as IPsec
* processing may modify the mbuf.
*/
mcopy = m_copy(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN));
#ifdef IPSEC
/* get a security policy for this packet */
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
IP_FORWARDING, &error);
if (sp == NULL) {
V_ipsec6stat.out_inval++;
V_ip6stat.ip6s_cantforward++;
if (mcopy) {
#if 0
/* XXX: what icmp ? */
#else
m_freem(mcopy);
#endif
}
m_freem(m);
return;
}
error = 0;
/* check policy */
switch (sp->policy) {
case IPSEC_POLICY_DISCARD:
/*
* This packet is just discarded.
*/
V_ipsec6stat.out_polvio++;
V_ip6stat.ip6s_cantforward++;
KEY_FREESP(&sp);
if (mcopy) {
#if 0
/* XXX: what icmp ? */
#else
m_freem(mcopy);
#endif
}
m_freem(m);
return;
case IPSEC_POLICY_BYPASS:
case IPSEC_POLICY_NONE:
/* no need to do IPsec. */
KEY_FREESP(&sp);
goto skip_ipsec;
case IPSEC_POLICY_IPSEC:
if (sp->req == NULL) {
/* XXX should be panic ? */
printf("ip6_forward: No IPsec request specified.\n");
V_ip6stat.ip6s_cantforward++;
KEY_FREESP(&sp);
if (mcopy) {
#if 0
/* XXX: what icmp ? */
#else
m_freem(mcopy);
#endif
}
m_freem(m);
return;
}
/* do IPsec */
break;
case IPSEC_POLICY_ENTRUST:
default:
/* should be panic ?? */
printf("ip6_forward: Invalid policy found. %d\n", sp->policy);
KEY_FREESP(&sp);
goto skip_ipsec;
}
{
struct ipsecrequest *isr = NULL;
struct ipsec_output_state state;
/*
* When the kernel forwards a packet, it is not proper to apply
* IPsec transport mode to it; this check avoids that.
* At present, if there is even one transport mode SA request in the
* security policy, the kernel does not apply IPsec to the packet.
* This check is not sufficient, because the following case is still
* valid:
* ipsec esp/tunnel/xxx-xxx/require esp/transport//require;
*/
for (isr = sp->req; isr; isr = isr->next) {
if (isr->saidx.mode == IPSEC_MODE_ANY)
goto doipsectunnel;
if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
goto doipsectunnel;
}
/*
* if there's no need for tunnel mode IPsec, skip.
*/
if (!isr)
goto skip_ipsec;
doipsectunnel:
/*
* All the extension headers will become inaccessible
* (since they can be encrypted).
* Don't panic, we need no more updates to extension headers
* on inner IPv6 packet (since they are now encapsulated).
*
* IPv6 [ESP|AH] IPv6 [extension headers] payload
*/
bzero(&state, sizeof(state));
state.m = m;
state.ro = NULL; /* update at ipsec6_output_tunnel() */
state.dst = NULL; /* update at ipsec6_output_tunnel() */
error = ipsec6_output_tunnel(&state, sp, 0);
m = state.m;
KEY_FREESP(&sp);
if (error) {
/* mbuf is already reclaimed in ipsec6_output_tunnel. */
switch (error) {
case EHOSTUNREACH:
case ENETUNREACH:
case EMSGSIZE:
case ENOBUFS:
case ENOMEM:
break;
default:
printf("ip6_output (ipsec): error code %d\n", error);
/* FALLTHROUGH */
case ENOENT:
/* don't show these error codes to the user */
break;
}
V_ip6stat.ip6s_cantforward++;
if (mcopy) {
#if 0
/* XXX: what icmp ? */
#else
m_freem(mcopy);
#endif
}
m_freem(m);
return;
} else {
/*
* In the FAST IPSec case we have already
* re-injected the packet and it has been freed
* by the ipsec_done() function. So, just clean
* up after ourselves.
*/
m = NULL;
goto freecopy;
}
if ((m != NULL) && (ip6 != mtod(m, struct ip6_hdr *))) {
/*
* Now the tunnel mode headers have been added; we are
* originating the packet instead of forwarding it.
*/
ip6_output(m, NULL, NULL, IPV6_FORWARDING/*XXX*/, NULL, NULL,
NULL);
goto freecopy;
}
/* adjust pointer */
dst = (struct sockaddr_in6 *)state.dst;
rt = state.ro ? state.ro->ro_rt : NULL;
if (dst != NULL && rt != NULL)
ipsecrt = 1;
}
skip_ipsec:
#endif /* IPSEC */
#ifdef IPSEC
if (ipsecrt)
goto skip_routing;
#endif
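/*
 * Look up the forwarding route, reusing the cached entry in
 * ip6_forward_rt where possible.  When the packet was not source
 * routed, the cached key is assumed to match the destination
 * already; otherwise the key is compared and the route re-resolved
 * on a mismatch.
 */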
dst = (struct sockaddr_in6 *)&V_ip6_forward_rt.ro_dst;
if (!srcrt) {
/* ip6_forward_rt.ro_dst.sin6_addr is equal to ip6->ip6_dst */
if (V_ip6_forward_rt.ro_rt == 0 ||
(V_ip6_forward_rt.ro_rt->rt_flags & RTF_UP) == 0) {
if (V_ip6_forward_rt.ro_rt) {
RTFREE(V_ip6_forward_rt.ro_rt);
V_ip6_forward_rt.ro_rt = 0;
}
/* this probably fails, but give it another try */
rtalloc((struct route *)&V_ip6_forward_rt);
}
if (V_ip6_forward_rt.ro_rt == 0) {
V_ip6stat.ip6s_noroute++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
if (mcopy) {
icmp6_error(mcopy, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_NOROUTE, 0);
}
m_freem(m);
return;
}
} else if ((rt = V_ip6_forward_rt.ro_rt) == 0 ||
!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &dst->sin6_addr)) {
if (V_ip6_forward_rt.ro_rt) {
RTFREE(V_ip6_forward_rt.ro_rt);
V_ip6_forward_rt.ro_rt = 0;
}
bzero(dst, sizeof(*dst));
dst->sin6_len = sizeof(struct sockaddr_in6);
dst->sin6_family = AF_INET6;
dst->sin6_addr = ip6->ip6_dst;
rtalloc((struct route *)&V_ip6_forward_rt);
if (V_ip6_forward_rt.ro_rt == 0) {
V_ip6stat.ip6s_noroute++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
if (mcopy) {
icmp6_error(mcopy, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_NOROUTE, 0);
}
m_freem(m);
return;
}
}
rt = V_ip6_forward_rt.ro_rt;
#ifdef IPSEC
skip_routing:;
#endif
/*
* Source scope check: if a packet can't be delivered to its
* destination for the reason that the destination is beyond the scope
* of the source address, discard the packet and return an icmp6
* destination unreachable error with Code 2 (beyond scope of source
* address). We use a local copy of ip6_src, since in6_setscope()
* will possibly modify its first argument.
* [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1]
*/
src_in6 = ip6->ip6_src;
if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) {
/* XXX: this should not happen */
V_ip6stat.ip6s_cantforward++;
V_ip6stat.ip6s_badscope++;
m_freem(m);
return;
}
if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) {
V_ip6stat.ip6s_cantforward++;
V_ip6stat.ip6s_badscope++;
m_freem(m);
return;
}
if (inzone != outzone
#ifdef IPSEC
&& !ipsecrt
#endif
) {
V_ip6stat.ip6s_cantforward++;
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard);
if (V_ip6_log_time + V_ip6_log_interval < time_second) {
V_ip6_log_time = time_second;
log(LOG_DEBUG,
"cannot forward "
"src %s, dst %s, nxt %d, rcvif %s, outif %s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
if_name(m->m_pkthdr.rcvif), if_name(rt->rt_ifp));
}
if (mcopy)
icmp6_error(mcopy, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_BEYONDSCOPE, 0);
m_freem(m);
return;
}
/*
* Destination scope check: if a packet is going to break the scope
* zone of packet's destination address, discard it. This case should
* usually be prevented by appropriately-configured routing table, but
* we need an explicit check because we may mistakenly forward the
* packet to a different zone by (e.g.) a default route.
*/
dst_in6 = ip6->ip6_dst;
if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 ||
in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 ||
inzone != outzone) {
V_ip6stat.ip6s_cantforward++;
V_ip6stat.ip6s_badscope++;
m_freem(m);
return;
}
if (m->m_pkthdr.len > IN6_LINKMTU(rt->rt_ifp)) {
in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig);
if (mcopy) {
u_long mtu;
#ifdef IPSEC
struct secpolicy *sp;
int ipsecerror;
size_t ipsechdrsiz;
#endif /* IPSEC */
mtu = IN6_LINKMTU(rt->rt_ifp);
#ifdef IPSEC
/*
* When we do IPsec tunnel ingress, we need to play
* with the link MTU value (subtract the IPsec header size
* from the mtu value).  The code is much simpler than the v4
* case, as we have the outgoing interface for the
* encapsulated packet as "rt->rt_ifp".
*/
sp = ipsec_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
IP_FORWARDING, &ipsecerror);
if (sp) {
ipsechdrsiz = ipsec6_hdrsiz(mcopy,
IPSEC_DIR_OUTBOUND, NULL);
if (ipsechdrsiz < mtu)
mtu -= ipsechdrsiz;
}
/*
* If the mtu becomes less than the minimum MTU,
* report the minimum MTU (the sender will then need to fragment).
*/
if (mtu < IPV6_MMTU)
mtu = IPV6_MMTU;
#endif /* IPSEC */
icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu);
}
m_freem(m);
return;
}
if (rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in6 *)rt->rt_gateway;
/*
* If we are to forward the packet using the same interface
* as one we got the packet from, perhaps we should send a redirect
* to sender to shortcut a hop.
* Only send redirect if source is sending directly to us,
* and if packet was not source routed (or has any options).
* Also, don't send redirect if forwarding using a route
* modified by a redirect.
*/
if (V_ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt &&
#ifdef IPSEC
!ipsecrt &&
#endif /* IPSEC */
(rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) {
if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) {
/*
* If the incoming interface is equal to the outgoing
* one, and the link attached to the interface is
* point-to-point, then it will be highly probable
* that a routing loop occurs. Thus, we immediately
* drop the packet and send an ICMPv6 error message.
*
* type/code is based on suggestion by Rich Draves.
* not sure if it is the best pick.
*/
icmp6_error(mcopy, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADDR, 0);
m_freem(m);
return;
}
type = ND_REDIRECT;
}
/*
* Fake scoped addresses. Note that even link-local source or
* destination can appear, if the originating node just sends the
* packet to us (without address resolution for the destination).
* Since both icmp6_error and icmp6_redirect_output fill the embedded
* link identifiers, we can do this stuff after making a copy for
* returning an error.
*/
if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
/*
* See corresponding comments in ip6_output.
* XXX: but is it possible that ip6_forward() sends a packet
* to a loopback interface? I don't think so, and thus
* I bark here. (jinmei@kame.net)
* XXX: it is common to route invalid packets to loopback.
* also, the codepath will be visited on use of ::1 in
* rthdr. (itojun)
*/
#if 1
if (0)
#else
if ((rt->rt_flags & (RTF_BLACKHOLE|RTF_REJECT)) == 0)
#endif
{
printf("ip6_forward: outgoing interface is loopback. "
"src %s, dst %s, nxt %d, rcvif %s, outif %s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif),
if_name(rt->rt_ifp));
}
/* we can just use rcvif in forwarding. */
origifp = m->m_pkthdr.rcvif;
}
else
origifp = rt->rt_ifp;
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&inet6_pfil_hook))
goto pass;
/* Run through list of hooks for output packets. */
error = pfil_run_hooks(&inet6_pfil_hook, &m, rt->rt_ifp, PFIL_OUT, NULL);
if (error != 0)
goto senderr;
if (m == NULL)
goto freecopy;
ip6 = mtod(m, struct ip6_hdr *);
pass:
error = nd6_output(rt->rt_ifp, origifp, m, dst, rt);
if (error) {
in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard);
V_ip6stat.ip6s_cantforward++;
} else {
V_ip6stat.ip6s_forward++;
in6_ifstat_inc(rt->rt_ifp, ifs6_out_forward);
if (type)
V_ip6stat.ip6s_redirectsent++;
else {
if (mcopy)
goto freecopy;
}
}
senderr:
if (mcopy == NULL)
return;
switch (error) {
case 0:
if (type == ND_REDIRECT) {
icmp6_redirect_output(mcopy, rt);
return;
}
goto freecopy;
case EMSGSIZE:
/* xxx MTU is constant in PPP? */
goto freecopy;
case ENOBUFS:
/* Tell source to slow down like source quench in IP? */
goto freecopy;
case ENETUNREACH: /* shouldn't happen, checked above */
case EHOSTUNREACH:
case ENETDOWN:
case EHOSTDOWN:
default:
type = ICMP6_DST_UNREACH;
code = ICMP6_DST_UNREACH_ADDR;
break;
}
icmp6_error(mcopy, type, code, 0);
return;
freecopy:
m_freem(mcopy);
return;
}
Index: head/sys/netinet6/ip6_input.c
===================================================================
--- head/sys/netinet6/ip6_input.c (revision 183549)
+++ head/sys/netinet6/ip6_input.c (revision 183550)
@@ -1,1586 +1,1593 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/netisr.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#endif /* INET */
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
#include <netinet/icmp6.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netinet6/ip6_ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
extern struct domain inet6domain;
u_char ip6_protox[IPPROTO_MAX];
static struct ifqueue ip6intrq;
static int ip6qmaxlen = IFQ_MAXLEN;
struct in6_ifaddr *in6_ifaddr;
extern struct callout in6_tmpaddrtimer_ch;
int ip6_forward_srcrt; /* XXX */
int ip6_sourcecheck; /* XXX */
int ip6_sourcecheck_interval; /* XXX */
int ip6_ours_check_algorithm;
struct pfil_head inet6_pfil_hook;
struct ip6stat ip6stat;
static void ip6_init2(void *);
static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *);
static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
#ifdef PULLDOWN_TEST
static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
#endif
/*
* IP6 initialization: fill in IP6 protocol switch table.
* All protocols not implemented in kernel go to raw IP6 protocol handler.
*/
void
ip6_init(void)
{
+ INIT_VNET_INET6(curvnet);
struct ip6protosw *pr;
int i;
#ifdef DIAGNOSTIC
if (sizeof(struct protosw) != sizeof(struct ip6protosw))
panic("sizeof(protosw) != sizeof(ip6protosw)");
#endif
pr = (struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
panic("ip6_init");
/* Initialize the entire ip_protox[] array to IPPROTO_RAW. */
for (i = 0; i < IPPROTO_MAX; i++)
ip6_protox[i] = pr - inet6sw;
/*
* Cycle through IP protocols and put them into the appropriate place
* in ip6_protox[].
*/
for (pr = (struct ip6protosw *)inet6domain.dom_protosw;
pr < (struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET6 &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) {
/* Be careful to only index valid IP protocols. */
if (pr->pr_protocol < IPPROTO_MAX)
ip6_protox[pr->pr_protocol] = pr - inet6sw;
}
/* Initialize packet filter hooks. */
inet6_pfil_hook.ph_type = PFIL_TYPE_AF;
inet6_pfil_hook.ph_af = AF_INET6;
if ((i = pfil_head_register(&inet6_pfil_hook)) != 0)
printf("%s: WARNING: unable to register pfil hook, "
"error %d\n", __func__, i);
ip6intrq.ifq_maxlen = V_ip6qmaxlen;
mtx_init(&ip6intrq.ifq_mtx, "ip6_inq", NULL, MTX_DEF);
netisr_register(NETISR_IPV6, ip6_input, &ip6intrq, 0);
scope6_init();
addrsel_policy_init();
nd6_init();
frag6_init();
V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR;
}
static void
ip6_init2(void *dummy)
{
+ INIT_VNET_INET6(curvnet);
/* nd6_timer_init */
callout_init(&V_nd6_timer_ch, 0);
callout_reset(&V_nd6_timer_ch, hz, nd6_timer, NULL);
/* timer for regeneration of temporary addresses' randomized ID */
callout_init(&V_in6_tmpaddrtimer_ch, 0);
callout_reset(&V_in6_tmpaddrtimer_ch,
(V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
V_ip6_temp_regen_advance) * hz,
in6_tmpaddrtimer, NULL);
}
/* cheat */
/* This must be after route_init(), which is now SI_ORDER_THIRD */
SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL);
extern struct route_in6 ip6_forward_rt;
void
ip6_input(struct mbuf *m)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6;
int off = sizeof(struct ip6_hdr), nest;
u_int32_t plen;
u_int32_t rtalert = ~0;
int nxt, ours = 0;
struct ifnet *deliverifp = NULL;
struct in6_addr odst;
int srcrt = 0;
#ifdef IPSEC
/*
* should the inner packet be considered authentic?
* see comment in ah4_input().
* NB: m cannot be NULL when passed to the input routine
*/
m->m_flags &= ~M_AUTHIPHDR;
m->m_flags &= ~M_AUTHIPDGM;
#endif /* IPSEC */
/*
* Make sure we don't carry stale "onion peering" information in m_tag.
*/
ip6_delaux(m);
/*
* mbuf statistics
*/
if (m->m_flags & M_EXT) {
if (m->m_next)
V_ip6stat.ip6s_mext2m++;
else
V_ip6stat.ip6s_mext1++;
} else {
#define M2MMAX (sizeof(V_ip6stat.ip6s_m2m)/sizeof(V_ip6stat.ip6s_m2m[0]))
if (m->m_next) {
if (m->m_flags & M_LOOP) {
V_ip6stat.ip6s_m2m[V_loif[0].if_index]++; /* XXX */
} else if (m->m_pkthdr.rcvif->if_index < M2MMAX)
V_ip6stat.ip6s_m2m[m->m_pkthdr.rcvif->if_index]++;
else
V_ip6stat.ip6s_m2m[0]++;
} else
V_ip6stat.ip6s_m1++;
#undef M2MMAX
}
/* drop the packet if IPv6 operation is disabled on the IF */
if ((ND_IFINFO(m->m_pkthdr.rcvif)->flags & ND6_IFF_IFDISABLED)) {
m_freem(m);
return;
}
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive);
V_ip6stat.ip6s_total++;
#ifndef PULLDOWN_TEST
/*
* The L2 bridge code and some other code can return an mbuf chain
* that does not conform to the KAME requirement.  Too bad.
* XXX: fails to join if interface MTU > MCLBYTES. jumbogram?
*/
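/*
 * Linearize: copy the whole chain into a single header mbuf
 * (grabbing a cluster if the packet does not fit in an ordinary
 * mbuf) so the header-parsing code below can assume contiguity.
 */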
if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) {
struct mbuf *n;
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n)
M_MOVE_PKTHDR(n, m);
if (n && n->m_pkthdr.len > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (n == NULL) {
m_freem(m);
return; /* ENOBUFS */
}
m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t));
n->m_len = n->m_pkthdr.len;
m_freem(m);
m = n;
}
IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */);
#endif
if (m->m_len < sizeof(struct ip6_hdr)) {
struct ifnet *inifp;
inifp = m->m_pkthdr.rcvif;
if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
V_ip6stat.ip6s_toosmall++;
in6_ifstat_inc(inifp, ifs6_in_hdrerr);
return;
}
}
ip6 = mtod(m, struct ip6_hdr *);
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
V_ip6stat.ip6s_badvers++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
goto bad;
}
V_ip6stat.ip6s_nxthist[ip6->ip6_nxt]++;
/*
* Check against address spoofing/corruption.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
/*
* XXX: "badscope" is not very suitable for a multicast source.
*/
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
goto bad;
}
if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) &&
!(m->m_flags & M_LOOP)) {
/*
* In this case, the packet should come from the loopback
* interface. However, we cannot just check the if_flags,
* because ip6_mloopback() passes the "actual" interface
* as the outgoing/incoming interface.
*/
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
goto bad;
}
#ifdef ALTQ
if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) {
/* packet is dropped by traffic conditioner */
return;
}
#endif
/*
* The following check is not documented in specs. A malicious
* party may be able to use IPv4 mapped addr to confuse tcp/udp stack
* and bypass security checks (act as if it was from 127.0.0.1 by using
* IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* This check chokes if we are in an SIIT cloud.  As none of the BSDs
* support IPv4-less kernel compilation, we cannot support the SIIT
* environment at all.  So, it makes more sense for us to reject any
* malicious packets in a non-SIIT environment than to attempt
* partial support for the SIIT environment.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
goto bad;
}
#if 0
/*
* Reject packets with IPv4 compatible addresses (auto tunnel).
*
* The code forbids auto tunnel relay case in RFC1933 (the check is
* stronger than RFC1933). We may want to re-enable it if mech-xx
* is revised to forbid relaying case.
*/
if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
goto bad;
}
#endif
/*
* Run through list of hooks for input packets.
*
* NB: Beware of the destination address changing
* (e.g. by NAT rewriting). When this happens,
* tell ip6_forward to do the right thing.
*/
odst = ip6->ip6_dst;
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&inet6_pfil_hook))
goto passin;
if (pfil_run_hooks(&inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL))
return;
if (m == NULL) /* consumed by filter */
return;
ip6 = mtod(m, struct ip6_hdr *);
srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
passin:
/*
* Disambiguate address scope zones (if there is ambiguity).
* We first make sure that the original source or destination address
* is not in our internal form for scoped addresses. Such addresses
* are not necessarily invalid spec-wise, but we cannot accept them due
* to the usage conflict.
* in6_setscope() then also checks and rejects the cases where src or
* dst are the loopback address and the receiving interface
* is not loopback.
*/
if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
V_ip6stat.ip6s_badscope++; /* XXX */
goto bad;
}
if (in6_setscope(&ip6->ip6_src, m->m_pkthdr.rcvif, NULL) ||
in6_setscope(&ip6->ip6_dst, m->m_pkthdr.rcvif, NULL)) {
V_ip6stat.ip6s_badscope++;
goto bad;
}
/*
* Multicast check
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct in6_multi *in6m = 0;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mcast);
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
IN6_LOOKUP_MULTI(ip6->ip6_dst, m->m_pkthdr.rcvif, in6m);
if (in6m)
ours = 1;
else if (!ip6_mrouter) {
V_ip6stat.ip6s_notmember++;
V_ip6stat.ip6s_cantforward++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
goto bad;
}
deliverifp = m->m_pkthdr.rcvif;
goto hbhcheck;
}
/*
* Unicast check: try the cached forwarding route first; on a miss,
* look the destination up in the routing table.
*/
if (V_ip6_forward_rt.ro_rt != NULL &&
(V_ip6_forward_rt.ro_rt->rt_flags & RTF_UP) != 0 &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&((struct sockaddr_in6 *)(&V_ip6_forward_rt.ro_dst))->sin6_addr))
V_ip6stat.ip6s_forward_cachehit++;
else {
struct sockaddr_in6 *dst6;
if (V_ip6_forward_rt.ro_rt) {
/* route is down or destination is different */
V_ip6stat.ip6s_forward_cachemiss++;
RTFREE(V_ip6_forward_rt.ro_rt);
V_ip6_forward_rt.ro_rt = 0;
}
bzero(&V_ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6));
dst6 = (struct sockaddr_in6 *)&V_ip6_forward_rt.ro_dst;
dst6->sin6_len = sizeof(struct sockaddr_in6);
dst6->sin6_family = AF_INET6;
dst6->sin6_addr = ip6->ip6_dst;
rtalloc((struct route *)&V_ip6_forward_rt);
}
#define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key))
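/* rt6_key() extracts the sockaddr_in6 radix-tree key of a route entry. */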
/*
* Accept the packet if the forwarding interface to the destination
* according to the routing table is the loopback interface,
* unless the associated route has a gateway.
* Note that this approach causes us to accept a packet if there is a
* route to the loopback interface for the destination of the packet.
* But we think it's even useful in some situations, e.g. when using
* a special daemon which wants to intercept the packet.
*
* XXX: some OSes automatically make a cloned route for the destination
* of an outgoing packet. If the outgoing interface of the packet
* is a loopback one, the kernel would consider the packet to be
* accepted, even if we have no such address assigned on the interface.
* We check the cloned flag of the route entry to reject such cases,
* assuming that route entries for our own addresses are not made by
* cloning (it should be true because in6_addloop explicitly installs
* the host route). However, we might have to do an explicit check
* though it would be less efficient.  Or, should we rather install a
* reject route for such a case?
*/
if (V_ip6_forward_rt.ro_rt &&
(V_ip6_forward_rt.ro_rt->rt_flags &
(RTF_HOST|RTF_GATEWAY)) == RTF_HOST &&
#ifdef RTF_WASCLONED
!(V_ip6_forward_rt.ro_rt->rt_flags & RTF_WASCLONED) &&
#endif
#ifdef RTF_CLONED
!(V_ip6_forward_rt.ro_rt->rt_flags & RTF_CLONED) &&
#endif
#if 0
/*
* The check below is redundant since the comparison of
* the destination and the key of the rtentry has
* already done through looking up the routing table.
*/
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&rt6_key(V_ip6_forward_rt.ro_rt)->sin6_addr)
#endif
V_ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) {
struct in6_ifaddr *ia6 =
(struct in6_ifaddr *)V_ip6_forward_rt.ro_rt->rt_ifa;
/*
* record address information into m_tag.
*/
(void)ip6_setdstifaddr(m, ia6);
/*
* packets to a tentative, duplicated, or somehow invalid
* address must not be accepted.
*/
if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) {
/* this address is ready */
ours = 1;
deliverifp = ia6->ia_ifp; /* correct? */
/* Count the packet in the ip address stats */
ia6->ia_ifa.if_ipackets++;
ia6->ia_ifa.if_ibytes += m->m_pkthdr.len;
goto hbhcheck;
} else {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
/* address is not ready, so discard the packet. */
nd6log((LOG_INFO,
"ip6_input: packet to an unready address %s->%s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst)));
goto bad;
}
}
/*
* FAITH (Firewall Aided Internet Translator)
*/
if (V_ip6_keepfaith) {
if (V_ip6_forward_rt.ro_rt && V_ip6_forward_rt.ro_rt->rt_ifp
&& V_ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_FAITH) {
/* XXX do we need more sanity checks? */
ours = 1;
deliverifp = V_ip6_forward_rt.ro_rt->rt_ifp; /* faith */
goto hbhcheck;
}
}
/*
* Now there is no reason to process the packet if it's not our own
* and we're not a router.
*/
if (!V_ip6_forwarding) {
V_ip6stat.ip6s_cantforward++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
goto bad;
}
hbhcheck:
/*
* record address information into m_tag, if we don't have one yet.
* note that we are unable to record it, if the address is not listed
* as our interface address (e.g. multicast addresses, addresses
* within FAITH prefixes and such).
*/
if (deliverifp && !ip6_getdstifaddr(m)) {
struct in6_ifaddr *ia6;
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
if (ia6) {
if (!ip6_setdstifaddr(m, ia6)) {
/*
* XXX maybe we should drop the packet here,
* as we could not provide enough information
* to the upper layers.
*/
}
}
}
/*
* Process Hop-by-Hop options header if it's contained.
* m may be modified in ip6_hopopts_input().
* If a JumboPayload option is included, plen will also be modified.
*/
plen = (u_int32_t)ntohs(ip6->ip6_plen);
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) {
#if 0 /*touches NULL pointer*/
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
#endif
return; /* m has already been freed */
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/*
* if the payload length field is 0 and the next header field
* indicates Hop-by-Hop Options header, then a Jumbo Payload
* option MUST be included.
*/
if (ip6->ip6_plen == 0 && plen == 0) {
/*
* Note that if a valid jumbo payload option is
* contained, ip6_hopopts_input() must set a valid
* (non-zero) payload length to the variable plen.
*/
V_ip6stat.ip6s_badoptions++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
(caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
return;
}
#ifndef PULLDOWN_TEST
/* ip6_hopopts_input() ensures that mbuf is contiguous */
hbh = (struct ip6_hbh *)(ip6 + 1);
#else
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
sizeof(struct ip6_hbh));
if (hbh == NULL) {
V_ip6stat.ip6s_tooshort++;
return;
}
#endif
nxt = hbh->ip6h_nxt;
/*
* If we are acting as a router and the packet contains a
* router alert option, see if we know the option value.
* Currently, we only support the option value for MLD, in which
* case we should pass the packet to the multicast routing
* daemon.
*/
if (rtalert != ~0 && V_ip6_forwarding) {
switch (rtalert) {
case IP6OPT_RTALERT_MLD:
ours = 1;
break;
default:
/*
* RFC2711 requires unrecognized values must be
* silently ignored.
*/
break;
}
}
} else
nxt = ip6->ip6_nxt;
/*
* Check that the amount of data in the buffers
* is at least as much as the IPv6 header would have us expect.
* Trim mbufs if longer than we expect.
* Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
V_ip6stat.ip6s_tooshort++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
goto bad;
}
if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = sizeof(struct ip6_hdr) + plen;
m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
} else
m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
}
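/*
 * Worked example: with ip6_plen of 40, the chain must carry at
 * least 40 bytes beyond the fixed 40-byte IPv6 header; any excess
 * (e.g. link-layer padding) is trimmed above with m_adj().
 */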
/*
* Forward if desirable.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip6_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
if (ip6_mrouter && ip6_mforward &&
ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) {
V_ip6stat.ip6s_cantforward++;
m_freem(m);
return;
}
if (!ours) {
m_freem(m);
return;
}
} else if (!ours) {
ip6_forward(m, srcrt);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Malicious party may be able to use IPv4 mapped addr to confuse
* tcp/udp stack and bypass security checks (act as if it was from
* 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* For SIIT end node behavior, you may want to disable the check.
* However, you will become vulnerable to attacks using IPv4 mapped
* source.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr);
goto bad;
}
/*
* Tell launch routine the next header
*/
V_ip6stat.ip6s_delivered++;
in6_ifstat_inc(deliverifp, ifs6_in_deliver);
nest = 0;
while (nxt != IPPROTO_DONE) {
if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
V_ip6stat.ip6s_toomanyhdr++;
goto bad;
}
/*
* protection against faulty packet - there should be
* more sanity checks in header chain processing.
*/
if (m->m_pkthdr.len < off) {
V_ip6stat.ip6s_tooshort++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
goto bad;
}
#ifdef IPSEC
/*
* enforce IPsec policy checking if we are seeing last header.
* note that we do not visit this with protocols with pcb layer
* code - like udp/tcp/raw ip.
*/
if (ip6_ipsec_input(m, nxt))
goto bad;
#endif /* IPSEC */
nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
}
return;
bad:
m_freem(m);
}
/*
* set/grab in6_ifaddr correspond to IPv6 destination address.
* XXX backward compatibility wrapper
*/
static struct ip6aux *
ip6_setdstifaddr(struct mbuf *m, struct in6_ifaddr *ia6)
{
struct ip6aux *ip6a;
ip6a = ip6_addaux(m);
if (ip6a)
ip6a->ip6a_dstia6 = ia6;
return ip6a; /* NULL if failed to set */
}
struct in6_ifaddr *
ip6_getdstifaddr(struct mbuf *m)
{
struct ip6aux *ip6a;
ip6a = ip6_findaux(m);
if (ip6a)
return ip6a->ip6a_dstia6;
else
return NULL;
}
/*
* Hop-by-Hop options header processing. If a valid jumbo payload option is
* included, the real payload length will be stored in plenp.
*
* rtalertp - XXX: should be stored in a smarter way
*/
static int
ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
struct mbuf **mp, int *offp)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
int off = *offp, hbhlen;
struct ip6_hbh *hbh;
u_int8_t *opt;
/* validation of the length of the header */
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1);
hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
hbhlen = (hbh->ip6h_len + 1) << 3;
IP6_EXTHDR_CHECK(m, off, hbhlen, -1);
hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
if (hbh == NULL) {
V_ip6stat.ip6s_tooshort++;
return -1;
}
hbhlen = (hbh->ip6h_len + 1) << 3;
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
hbhlen);
if (hbh == NULL) {
V_ip6stat.ip6s_tooshort++;
return -1;
}
#endif
off += hbhlen;
hbhlen -= sizeof(struct ip6_hbh);
opt = (u_int8_t *)hbh + sizeof(struct ip6_hbh);
if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
hbhlen, rtalertp, plenp) < 0)
return (-1);
*offp = off;
*mp = m;
return (0);
}
/*
* Search header for all Hop-by-hop options and process each option.
* This function is separate from ip6_hopopts_input() in order to
* handle the case where the sending node itself processes its hop-by-hop
* options header. In such a case, the function is called from ip6_output().
*
* The function assumes that the hbh header is located right after the IPv6
* header (RFC2460 p7), that opthead points into the data content of m, and
* that the region from opthead to opthead + hbhlen is in contiguous memory.
*/
int
ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
u_int32_t *rtalertp, u_int32_t *plenp)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6;
int optlen = 0;
u_int8_t *opt = opthead;
u_int16_t rtalert_val;
u_int32_t jumboplen;
const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
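/*
 * Walk the option TLVs: except for Pad1 (a single zero byte), each
 * option is encoded as type, length, value and thus occupies
 * *(opt + 1) + 2 bytes.  Router alert and jumbo payload carry
 * fixed-length values, which is checked below.
 */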
for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
switch (*opt) {
case IP6OPT_PAD1:
optlen = 1;
break;
case IP6OPT_PADN:
if (hbhlen < IP6OPT_MINLEN) {
V_ip6stat.ip6s_toosmall++;
goto bad;
}
optlen = *(opt + 1) + 2;
break;
case IP6OPT_ROUTER_ALERT:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_RTALERT_LEN) {
V_ip6stat.ip6s_toosmall++;
goto bad;
}
if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
/* XXX stat */
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_RTALERT_LEN;
bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2);
*rtalertp = ntohs(rtalert_val);
break;
case IP6OPT_JUMBO:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_JUMBO_LEN) {
V_ip6stat.ip6s_toosmall++;
goto bad;
}
if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
/* XXX stat */
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_JUMBO_LEN;
/*
* IPv6 packets that have a non-zero payload length
* must not contain a jumbo payload option.
*/
ip6 = mtod(m, struct ip6_hdr *);
if (ip6->ip6_plen) {
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt - opthead);
return (-1);
}
/*
* We may see jumbolen in unaligned location, so
* we'd need to perform bcopy().
*/
bcopy(opt + 2, &jumboplen, sizeof(jumboplen));
jumboplen = (u_int32_t)htonl(jumboplen);
#if 1
/*
* if there are multiple jumbo payload options,
* *plenp will be non-zero and the packet will be
* rejected.
* the behavior may need some debate in ipngwg -
* multiple options do not make sense; however,
* there's no explicit mention in the specification.
*/
if (*plenp != 0) {
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
#endif
/*
* jumbo payload length must be larger than 65535.
*/
if (jumboplen <= IPV6_MAXPACKET) {
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
*plenp = jumboplen;
break;
default: /* unknown option */
if (hbhlen < IP6OPT_MINLEN) {
V_ip6stat.ip6s_toosmall++;
goto bad;
}
optlen = ip6_unknown_opt(opt, m,
erroff + opt - opthead);
if (optlen == -1)
return (-1);
optlen += 2;
break;
}
}
return (0);
bad:
m_freem(m);
return (-1);
}
/*
* Unknown option processing.
* The third argument `off' is the offset from the IPv6 header to the option,
* which is necessary to return an ICMPv6 error when the IPv6 header and the
* option header are not contiguous.
*/
int
ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6;
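/*
 * Per RFC 2460 section 4.2, the two high-order bits of the option
 * type tell a node how to handle an option it does not recognize:
 * 00 skip it, 01 discard the packet silently, 10 discard and send
 * an ICMPv6 parameter problem, 11 do the same but only if the
 * destination was not a multicast address.
 */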
switch (IP6OPT_TYPE(*optp)) {
case IP6OPT_TYPE_SKIP: /* ignore the option */
return ((int)*(optp + 1));
case IP6OPT_TYPE_DISCARD: /* silently discard */
m_freem(m);
return (-1);
case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
return (-1);
case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
V_ip6stat.ip6s_badoptions++;
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
(m->m_flags & (M_BCAST|M_MCAST)))
m_freem(m);
else
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_OPTION, off);
return (-1);
}
m_freem(m); /* XXX: NOTREACHED */
return (-1);
}
/*
* Create the "control" list for this pcb.
* These functions will not modify mbuf chain at all.
*
* With KAME mbuf chain restriction:
* The routine will be called from upper layer handlers like tcp6_input().
* Thus the routine assumes that the caller (tcp6_input) have already
* called IP6_EXTHDR_CHECK() and all the extension headers are located in the
* very first mbuf on the mbuf chain.
*
* ip6_savecontrol_v4 will handle those options that are possible to be
* set on a v4-mapped socket.
* ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those
* options and handle the v6-only ones itself.
*/
struct mbuf **
ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
int *v4only)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
#ifdef SO_TIMESTAMP
if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) {
struct timeval tv;
microtime(&tv);
*mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
#endif
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
if (v4only != NULL)
*v4only = 1;
return (mp);
}
#define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
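/*
 * IS2292() picks the RFC 2292 or the RFC 3542 variant of a cmsg type,
 * depending on whether the socket asked for the old API (IN6P_RFC2292).
 */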
/* RFC 2292 sec. 5 */
if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
struct in6_pktinfo pi6;
bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr));
in6_clearscope(&pi6.ipi6_addr); /* XXX */
pi6.ipi6_ifindex =
(m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0;
*mp = sbcreatecontrol((caddr_t) &pi6,
sizeof(struct in6_pktinfo),
IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) {
int hlim = ip6->ip6_hlim & 0xff;
*mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int),
IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if (v4only != NULL)
*v4only = 0;
return (mp);
}
void
ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
int v4only = 0;
mp = ip6_savecontrol_v4(in6p, m, mp, &v4only);
if (v4only)
return;
if ((in6p->in6p_flags & IN6P_TCLASS) != 0) {
u_int32_t flowinfo;
int tclass;
flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
flowinfo >>= 20;
tclass = flowinfo & 0xff;
*mp = sbcreatecontrol((caddr_t) &tclass, sizeof(tclass),
IPV6_TCLASS, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
/*
* IPV6_HOPOPTS socket option. Recall that we required super-user
* privilege for the option (see ip6_ctloutput), but it might be too
* strict, since there might be some hop-by-hop options which can be
* returned to a normal user.
* See also RFC 2292 section 6 (or RFC 3542 section 8).
*/
if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0) {
/*
* Check if a hop-by-hop options header is contained in the
* received packet, and if so, store the options as ancillary
* data. Note that a hop-by-hop options header must be
* just after the IPv6 header, which is assured through the
* IPv6 input processing.
*/
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
int hbhlen = 0;
#ifdef PULLDOWN_TEST
struct mbuf *ext;
#endif
#ifndef PULLDOWN_TEST
hbh = (struct ip6_hbh *)(ip6 + 1);
hbhlen = (hbh->ip6h_len + 1) << 3;
#else
ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
ip6->ip6_nxt);
if (ext == NULL) {
V_ip6stat.ip6s_tooshort++;
return;
}
hbh = mtod(ext, struct ip6_hbh *);
hbhlen = (hbh->ip6h_len + 1) << 3;
if (hbhlen != ext->m_len) {
m_freem(ext);
V_ip6stat.ip6s_tooshort++;
return;
}
#endif
/*
* XXX: We copy the whole header even if a
* jumbo payload option is included, the option which
* is to be removed before returning according to
* RFC2292.
* Note: this constraint is removed in RFC3542
*/
*mp = sbcreatecontrol((caddr_t)hbh, hbhlen,
IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
#ifdef PULLDOWN_TEST
m_freem(ext);
#endif
}
}
if ((in6p->in6p_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) {
int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr);
/*
* Search for destination options headers or routing
* header(s) through the header chain, and store each
* header as ancillary data.
* Note that the order of the headers remains in
* the chain of ancillary data.
*/
while (1) { /* is explicit loop prevention necessary? */
struct ip6_ext *ip6e = NULL;
int elen;
#ifdef PULLDOWN_TEST
struct mbuf *ext = NULL;
#endif
/*
* if it is not an extension header, don't try to
* pull it from the chain.
*/
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
goto loopend;
}
#ifndef PULLDOWN_TEST
if (off + sizeof(*ip6e) > m->m_len)
goto loopend;
ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off);
if (nxt == IPPROTO_AH)
elen = (ip6e->ip6e_len + 2) << 2;
else
elen = (ip6e->ip6e_len + 1) << 3;
if (off + elen > m->m_len)
goto loopend;
#else
ext = ip6_pullexthdr(m, off, nxt);
if (ext == NULL) {
V_ip6stat.ip6s_tooshort++;
return;
}
ip6e = mtod(ext, struct ip6_ext *);
if (nxt == IPPROTO_AH)
elen = (ip6e->ip6e_len + 2) << 2;
else
elen = (ip6e->ip6e_len + 1) << 3;
if (elen != ext->m_len) {
m_freem(ext);
V_ip6stat.ip6s_tooshort++;
return;
}
#endif
switch (nxt) {
case IPPROTO_DSTOPTS:
if (!(in6p->in6p_flags & IN6P_DSTOPTS))
break;
*mp = sbcreatecontrol((caddr_t)ip6e, elen,
IS2292(in6p,
IPV6_2292DSTOPTS, IPV6_DSTOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_ROUTING:
if (!(in6p->in6p_flags & IN6P_RTHDR))
break;
*mp = sbcreatecontrol((caddr_t)ip6e, elen,
IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
/*
* Other cases have been filtered out above, so none
* should reach this point; the code is supplied just
* in case (e.g., nxt overwritten).
*/
#ifdef PULLDOWN_TEST
m_freem(ext);
#endif
goto loopend;
}
/* proceed with the next header. */
off += elen;
nxt = ip6e->ip6e_nxt;
ip6e = NULL;
#ifdef PULLDOWN_TEST
m_freem(ext);
ext = NULL;
#endif
}
loopend:
;
}
}
#undef IS2292
void
ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu)
{
struct socket *so;
struct mbuf *m_mtu;
struct ip6_mtuinfo mtuctl;
so = in6p->inp_socket;
if (mtu == NULL)
return;
#ifdef DIAGNOSTIC
if (so == NULL) /* I believe this is impossible */
panic("ip6_notify_pmtu: socket is NULL");
#endif
bzero(&mtuctl, sizeof(mtuctl)); /* zero-clear for safety */
mtuctl.ip6m_mtu = *mtu;
mtuctl.ip6m_addr = *dst;
if (sa6_recoverscope(&mtuctl.ip6m_addr))
return;
if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl),
IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
return;
if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu)
== 0) {
m_freem(m_mtu);
/* XXX: should count statistics */
} else
sorwakeup(so);
return;
}
#ifdef PULLDOWN_TEST
/*
* pull single extension header from mbuf chain. returns single mbuf that
* contains the result, or NULL on error.
*/
static struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
struct ip6_ext ip6e;
size_t elen;
struct mbuf *n;
#ifdef DIAGNOSTIC
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
printf("ip6_pullexthdr: invalid nxt=%d\n", nxt);
}
#endif
m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
if (nxt == IPPROTO_AH)
elen = (ip6e.ip6e_len + 2) << 2;
else
elen = (ip6e.ip6e_len + 1) << 3;
MGET(n, M_DONTWAIT, MT_DATA);
if (n && elen >= MLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (!n)
return NULL;
n->m_len = 0;
if (elen >= M_TRAILINGSPACE(n)) {
m_free(n);
return NULL;
}
m_copydata(m, off, elen, mtod(n, caddr_t));
n->m_len = elen;
return n;
}
#endif
/*
* Get a pointer to the next-header field of the header that precedes
* the header currently being processed.
* XXX: This function supposes that
* M includes all headers,
* the next header field and the header length field of each header
* are valid, and
* the sum of each header length equals OFF.
* Because of these assumptions, this function must be called very
* carefully.  Moreover, it will not be used in the near future when
* we develop a `neater' mechanism to process extension headers.
*/
char *
ip6_get_prevhdr(struct mbuf *m, int off)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (off == sizeof(struct ip6_hdr))
return (&ip6->ip6_nxt);
else {
int len, nxt;
struct ip6_ext *ip6e = NULL;
nxt = ip6->ip6_nxt;
len = sizeof(struct ip6_hdr);
while (len < off) {
ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + len);
switch (nxt) {
case IPPROTO_FRAGMENT:
len += sizeof(struct ip6_frag);
break;
case IPPROTO_AH:
len += (ip6e->ip6e_len + 2) << 2;
break;
default:
len += (ip6e->ip6e_len + 1) << 3;
break;
}
nxt = ip6e->ip6e_nxt;
}
if (ip6e)
return (&ip6e->ip6e_nxt);
else
return NULL;
}
}
/*
* get next header offset. m will be retained.
*/
int
ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
struct ip6_hdr ip6;
struct ip6_ext ip6e;
struct ip6_frag fh;
/* just in case */
if (m == NULL)
panic("ip6_nexthdr: m == NULL");
if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
return -1;
switch (proto) {
case IPPROTO_IPV6:
if (m->m_pkthdr.len < off + sizeof(ip6))
return -1;
m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6);
if (nxtp)
*nxtp = ip6.ip6_nxt;
off += sizeof(ip6);
return off;
case IPPROTO_FRAGMENT:
/*
* Terminate parsing if this is not the first fragment;
* it does not make sense to parse past it.
*/
if (m->m_pkthdr.len < off + sizeof(fh))
return -1;
m_copydata(m, off, sizeof(fh), (caddr_t)&fh);
/* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */
if (fh.ip6f_offlg & IP6F_OFF_MASK)
return -1;
if (nxtp)
*nxtp = fh.ip6f_nxt;
off += sizeof(struct ip6_frag);
return off;
case IPPROTO_AH:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
if (nxtp)
*nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 2) << 2;
return off;
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING:
case IPPROTO_DSTOPTS:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e);
if (nxtp)
*nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 1) << 3;
return off;
case IPPROTO_NONE:
case IPPROTO_ESP:
case IPPROTO_IPCOMP:
/* give up */
return -1;
default:
return -1;
}
return -1;
}
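/*
 * ip6_nexthdr() returns the offset of the header that follows the one
 * at `off', storing its protocol number in *nxtp; -1 means parsing
 * cannot continue (truncated packet, non-first fragment, ESP/IPCOMP,
 * or no next header).
 */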
/*
* get offset for the last header in the chain. m will be kept untainted.
*/
int
ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
int newoff;
int nxt;
if (!nxtp) {
nxt = -1;
nxtp = &nxt;
}
while (1) {
newoff = ip6_nexthdr(m, off, proto, nxtp);
if (newoff < 0)
return off;
else if (newoff < off)
return -1; /* invalid */
else if (newoff == off)
return newoff;
off = newoff;
proto = *nxtp;
}
}
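/*
 * Usage sketch (not taken from this file): to locate the upper-layer
 * header of a fully reassembled packet, one could do
 *
 *	int nxt, off;
 *
 *	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 *	if (off >= 0)
 *		nxt now holds the transport protocol, off its offset.
 */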
struct ip6aux *
ip6_addaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL);
if (!mtag) {
mtag = m_tag_get(PACKET_TAG_IPV6_INPUT, sizeof(struct ip6aux),
M_NOWAIT);
if (mtag) {
m_tag_prepend(m, mtag);
bzero(mtag + 1, sizeof(struct ip6aux));
}
}
return mtag ? (struct ip6aux *)(mtag + 1) : NULL;
}
struct ip6aux *
ip6_findaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL);
return mtag ? (struct ip6aux *)(mtag + 1) : NULL;
}
void
ip6_delaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_IPV6_INPUT, NULL);
if (mtag)
m_tag_delete(m, mtag);
}
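/*
 * The three helpers above keep per-packet IPv6 metadata (struct ip6aux,
 * currently just the destination ifaddr) in an m_tag attached to the
 * mbuf, so the information travels with the packet.
 */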
/*
* System control for IP6: map PRC_xxx control-input codes to errno
* values; a zero entry means the event is not reported as an error.
*/
u_char inet6ctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, 0, 0,
ENOPROTOOPT
};
Index: head/sys/netinet6/ip6_ipsec.c
===================================================================
--- head/sys/netinet6/ip6_ipsec.c (revision 183549)
+++ head/sys/netinet6/ip6_ipsec.c (revision 183550)
@@ -1,366 +1,369 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <machine/in_cksum.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/xform.h>
#include <netipsec/key.h>
#ifdef IPSEC_DEBUG
#include <netipsec/key_debug.h>
#else
#define KEYDEBUG(lev,arg)
#endif
#endif /*IPSEC*/
#include <netinet6/ip6_ipsec.h>
#include <netinet6/ip6_var.h>
extern struct protosw inet6sw[];
/*
* Check if we have to jump over firewall processing for this packet.
* Called from ip6_input().
* 1 = jump over firewall, 0 = packet goes through firewall.
*/
int
ip6_ipsec_filtertunnel(struct mbuf *m)
{
#if defined(IPSEC) && !defined(IPSEC_FILTERTUNNEL)
/*
* Bypass packet filtering for packets from a tunnel.
*/
if (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL)
return 1;
#endif
return 0;
}
/*
* Check if this packet has an active SA and needs to be dropped instead
* of forwarded.
* Called from ip6_forward().
* 1 = drop packet, 0 = forward packet.
*/
int
ip6_ipsec_fwd(struct mbuf *m)
{
#ifdef IPSEC
+ INIT_VNET_INET6(curvnet);
+ INIT_VNET_IPSEC(curvnet);
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct secpolicy *sp;
int s, error;
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
s = splnet();
if (mtag != NULL) {
tdbi = (struct tdb_ident *)(mtag + 1);
sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
} else {
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
IP_FORWARDING, &error);
}
if (sp == NULL) { /* NB: can happen if error */
splx(s);
/*XXX error stat???*/
DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/
return 1;
}
/*
* Check security policy against packet attributes.
*/
error = ipsec_in_reject(sp, m);
KEY_FREESP(&sp);
splx(s);
if (error) {
V_ip6stat.ip6s_cantforward++;
return 1;
}
#endif /* IPSEC */
return 0;
}
/*
* Check whether the protocol type has no further header, and if so do
* IPsec decryption or rejection right now. Protocols with further
* headers get their IPsec treatment within the protocol-specific
* processing.
* Called from ip6_input().
* 1 = drop packet, 0 = continue processing packet.
*/
int
ip6_ipsec_input(struct mbuf *m, int nxt)
{
#ifdef IPSEC
+ INIT_VNET_IPSEC(curvnet);
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct secpolicy *sp;
int s, error;
/*
* Enforce IPsec policy checking if we are seeing the last header.
* Note that we do not visit this for protocols with pcb-layer
* code, such as UDP, TCP, and raw IP.
*/
if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 &&
ipsec6_in_reject(m, NULL)) {
/*
* Check if the packet has already had IPsec processing
* done. If so, then just pass it along. This tag gets
* set during AH, ESP, etc. input handling, before the
* packet is returned to the ip input queue for delivery.
*/
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
s = splnet();
if (mtag != NULL) {
tdbi = (struct tdb_ident *)(mtag + 1);
sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND);
} else {
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND,
IP_FORWARDING, &error);
}
if (sp != NULL) {
/*
* Check security policy against packet attributes.
*/
error = ipsec_in_reject(sp, m);
KEY_FREESP(&sp);
} else {
/* XXX error stat??? */
error = EINVAL;
DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/
return 1;
}
splx(s);
if (error)
return 1;
}
#endif /* IPSEC */
return 0;
}
/*
* Called from ip6_output().
* 1 = drop packet, 0 = continue processing packet,
* -1 = packet was reinjected; stop processing it.
*/
int
ip6_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error,
struct ifnet **ifp, struct secpolicy **sp)
{
#ifdef IPSEC
struct tdb_ident *tdbi;
struct m_tag *mtag;
/* XXX int s; */
if (sp == NULL)
return 1;
mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL);
if (mtag != NULL) {
tdbi = (struct tdb_ident *)(mtag + 1);
*sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND);
if (*sp == NULL)
*error = -EINVAL; /* force silent drop */
m_tag_delete(*m, mtag);
} else {
*sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags,
error, inp);
}
/*
* There are four return cases:
* sp != NULL apply IPsec policy
* sp == NULL, error == 0 no IPsec handling needed
* sp == NULL, error == -EINVAL discard packet w/o error
* sp == NULL, error != 0 discard packet, report error
*/
if (*sp != NULL) {
/* Loop detection, check if ipsec processing already done */
KASSERT((*sp)->req != NULL, ("ip_output: no ipsec request"));
for (mtag = m_tag_first(*m); mtag != NULL;
mtag = m_tag_next(*m, mtag)) {
if (mtag->m_tag_cookie != MTAG_ABI_COMPAT)
continue;
if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE &&
mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED)
continue;
/*
* Check if policy has an SA associated with it.
* This can happen when an SP has yet to acquire
* an SA; e.g. on first reference. If it occurs,
* then we let ipsec4_process_packet do its thing.
*/
if ((*sp)->req->sav == NULL)
break;
tdbi = (struct tdb_ident *)(mtag + 1);
if (tdbi->spi == (*sp)->req->sav->spi &&
tdbi->proto == (*sp)->req->sav->sah->saidx.proto &&
bcmp(&tdbi->dst, &(*sp)->req->sav->sah->saidx.dst,
sizeof (union sockaddr_union)) == 0) {
/*
* No IPsec processing is needed, free
* reference to SP.
*
* NB: null pointer to avoid free at
* done: below.
*/
KEY_FREESP(sp), *sp = NULL;
/* XXX splx(s); */
goto done;
}
}
/*
* Do delayed checksums now because we send before
* this is done in the normal processing path.
*/
if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
in_delayed_cksum(*m);
(*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
/*
* Preserve KAME behaviour: ENOENT can be returned
* when an SA acquire is in progress. Don't propagate
* this to user-level; it confuses applications.
*
* XXX this will go away when the SADB is redone.
*/
if (*error == ENOENT)
*error = 0;
goto do_ipsec;
} else { /* sp == NULL */
if (*error != 0) {
/*
* Hack: -EINVAL is used to signal that a packet
* should be silently discarded. This is typically
* because we asked key management for an SA and
* it was delayed (e.g. kicked up to IKE).
*/
if (*error == -EINVAL)
*error = 0;
goto bad;
} else {
/* No IPsec processing for this packet. */
}
}
done:
return 0;
do_ipsec:
return -1;
bad:
return 1;
#endif /* IPSEC */
return 0;
}
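/*
* Caller sketch (illustrative; assumes only the return convention
* documented above ip6_ipsec_output()): a caller such as ip6_output()
* would dispatch roughly as
*
*	switch (ip6_ipsec_output(&m, inp, &flags, &error, &ifp, &sp)) {
*	case 1:			// drop packet, *error is set
*		goto bad;
*	case -1:		// packet was reinjected, stop processing
*		goto done;
*	default:		// 0: continue normal processing
*		break;
*	}
*/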
#if 0
/*
* Compute the MTU for a forwarded packet that gets IPSEC encapsulated.
* Called from ip_forward().
* Returns MTU suggestion for ICMP needfrag reply.
*/
int
ip6_ipsec_mtu(struct mbuf *m)
{
int mtu = 0;
/*
* If the packet is routed over IPsec tunnel, tell the
* originator the tunnel MTU.
* tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
* XXX quickhack!!!
*/
#ifdef IPSEC
struct secpolicy *sp = NULL;
int ipsecerror;
int ipsechdr;
struct route *ro;
sp = ipsec_getpolicybyaddr(m,
IPSEC_DIR_OUTBOUND,
IP_FORWARDING,
&ipsecerror);
if (sp != NULL) {
/* count IPsec header size */
ipsechdr = ipsec4_hdrsiz(m,
IPSEC_DIR_OUTBOUND,
NULL);
/*
* find the correct route for outer IPv4
* header, compute tunnel MTU.
*/
if (sp->req != NULL &&
sp->req->sav != NULL &&
sp->req->sav->sah != NULL) {
ro = &sp->req->sav->sah->sa_route;
if (ro->ro_rt && ro->ro_rt->rt_ifp) {
mtu =
ro->ro_rt->rt_rmx.rmx_mtu ?
ro->ro_rt->rt_rmx.rmx_mtu :
ro->ro_rt->rt_ifp->if_mtu;
mtu -= ipsechdr;
}
}
KEY_FREESP(&sp);
}
#endif /* IPSEC */
/* XXX else case missing. */
return mtu;
}
#endif
Index: head/sys/netinet6/ip6_mroute.c
===================================================================
--- head/sys/netinet6/ip6_mroute.c (revision 183549)
+++ head/sys/netinet6/ip6_mroute.c (revision 183550)
@@ -1,1920 +1,1930 @@
/*-
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $
*/
/*-
* Copyright (c) 1989 Stephen Deering
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp
*/
/*
* IP multicast forwarding procedures
*
* Written by David Waitzman, BBN Labs, August 1988.
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Mark J. Steiglitz, Stanford, May, 1991
* Modified by Van Jacobson, LBL, January 1993
* Modified by Ajit Thyagarajan, PARC, August 1993
* Modified by Bill Fenner, PARC, April 1994
*
* MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/raw_cb.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/pim6.h>
#include <netinet6/pim6_var.h>
static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry");
/* XXX: this is a very common idiom; move to <sys/mbuf.h> ? */
#define M_HASCL(m) ((m)->m_flags & M_EXT)
static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *);
static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
static int set_pim6(int *);
static int socket_send __P((struct socket *, struct mbuf *,
struct sockaddr_in6 *));
static int register_send __P((struct ip6_hdr *, struct mif6 *,
struct mbuf *));
extern struct domain inet6domain;
/* XXX: referenced from ip_mroute.c for dynamically loading this code. */
struct ip6protosw in6_pim_protosw = {
.pr_type = SOCK_RAW,
.pr_domain = &inet6domain,
.pr_protocol = IPPROTO_PIM,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = pim6_input,
.pr_output = rip6_output,
.pr_ctloutput = rip6_ctloutput,
.pr_usrreqs = &rip6_usrreqs
};
static int ip6_mrouter_ver = 0;
SYSCTL_DECL(_net_inet6);
SYSCTL_DECL(_net_inet6_ip6);
SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
static struct mrt6stat mrt6stat;
SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW,
&mrt6stat, mrt6stat,
"Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)");
#define NO_RTE_FOUND 0x1
#define RTE_FOUND 0x2
static struct mf6c *mf6ctable[MF6CTBLSIZ];
SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD,
&mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]",
"Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], "
"netinet6/ip6_mroute.h)");
static u_char n6expire[MF6CTBLSIZ];
static struct mif6 mif6table[MAXMIFS];
SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mif6table, CTLFLAG_RD,
&mif6table, sizeof(mif6table), "S,vif[MAXMIFS]",
"Multicast Interfaces (struct mif[MAXMIFS], netinet6/ip6_mroute.h)");
#ifdef MRT6DEBUG
static u_int mrt6debug = 0; /* debug level */
#define DEBUG_MFC 0x02
#define DEBUG_FORWARD 0x04
#define DEBUG_EXPIRE 0x08
#define DEBUG_XMIT 0x10
#define DEBUG_REG 0x20
#define DEBUG_PIM 0x40
#endif
static void expire_upcalls(void *);
#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
#define UPCALL_EXPIRE 6 /* number of timeouts */
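/*
* Note: an unresolved upcall entry therefore lives for at most
* UPCALL_EXPIRE * EXPIRE_TIMEOUT ticks, i.e. 6 * hz / 4 = 1.5
* seconds, before expire_upcalls() drops its queued packets.
*/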
#ifdef INET
#ifdef MROUTING
extern struct socket *ip_mrouter;
#endif
#endif
/*
* 'Interfaces' associated with the decapsulator (so we can tell
* packets that went through it from ones that get reflected
* by a broken gateway). Unlike the IPv4 register_if, these
* interfaces are linked into the system ifnet list, because
* per-interface IPv6 statistics are maintained in
* ifp->if_afdata. However, no routes point to them, i.e.,
* packets cannot be sent this way. They exist only as a
* placeholder for multicast source verification.
*/
static struct ifnet *multicast_register_if6;
#define ENCAP_HOPS 64
/*
* Private variables.
*/
static mifi_t nummifs = 0;
static mifi_t reg_mif_num = (mifi_t)-1;
static struct pim6stat pim6stat;
SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RD,
&pim6stat, pim6stat,
"PIM Statistics (struct pim6stat, netinet6/pim_var.h)");
static int pim6;
/*
* Hash function for a source, group entry
*/
#define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \
(a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \
(g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \
(g).s6_addr32[2] ^ (g).s6_addr32[3])
/*
* Find a route for a given origin IPv6 address and Multicast group address.
*/
#define MF6CFIND(o, g, rt) do { \
struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \
rt = NULL; \
mrt6stat.mrt6s_mfc_lookups++; \
while (_rt) { \
if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \
IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \
(_rt->mf6c_stall == NULL)) { \
rt = _rt; \
break; \
} \
_rt = _rt->mf6c_next; \
} \
if (rt == NULL) { \
mrt6stat.mrt6s_mfc_misses++; \
} \
} while (/*CONSTCOND*/ 0)
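/*
* Note: MF6CHASH xor-folds all eight 32-bit words of the source and
* group addresses and reduces the result into the table range via
* MF6CHASHMOD, so MF6CFIND is one hash plus a walk of a single
* chain; entries with a pending upcall (mf6c_stall != NULL) are
* deliberately skipped and the lookup counts as a miss.
*/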
/*
* Macros to compute elapsed time efficiently
* Borrowed from Van Jacobson's scheduling code
* XXX: replace with timersub() ?
*/
#define TV_DELTA(a, b, delta) do { \
int xxs; \
\
delta = (a).tv_usec - (b).tv_usec; \
if ((xxs = (a).tv_sec - (b).tv_sec)) { \
switch (xxs) { \
case 2: \
delta += 1000000; \
/* FALLTHROUGH */ \
case 1: \
delta += 1000000; \
break; \
default: \
delta += (1000000 * xxs); \
} \
} \
} while (/*CONSTCOND*/ 0)
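/*
* Worked example: for a = {2, 100} and b = {1, 999900}, delta starts
* at 100 - 999900 = -999800 usec; xxs == 1 adds one million, giving
* the correct 200 usec. The switch merely unrolls the common one- and
* two-second cases to avoid a multiply.
*/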
/* XXX: replace with timercmp(a, b, <) ? */
#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
#ifdef UPCALL_TIMING
#define UPCALL_MAX 50
static u_long upcall_data[UPCALL_MAX + 1];
static void collate();
#endif /* UPCALL_TIMING */
static int get_sg_cnt(struct sioc_sg_req6 *);
static int get_mif6_cnt(struct sioc_mif_req6 *);
static int ip6_mrouter_init(struct socket *, int, int);
static int add_m6if(struct mif6ctl *);
static int del_m6if(mifi_t *);
static int add_m6fc(struct mf6cctl *);
static int del_m6fc(struct mf6cctl *);
static struct callout expire_upcalls_ch;
int X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m);
int X_ip6_mrouter_done(void);
int X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt);
int X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt);
int X_mrt6_ioctl(int cmd, caddr_t data);
/*
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
int
X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt)
{
int error = 0;
int optval;
struct mif6ctl mifc;
struct mf6cctl mfcc;
mifi_t mifi;
if (so != ip6_mrouter && sopt->sopt_name != MRT6_INIT)
return (EACCES);
switch (sopt->sopt_name) {
case MRT6_INIT:
#ifdef MRT6_OINIT
case MRT6_OINIT:
#endif
error = sooptcopyin(sopt, &optval, sizeof(optval),
sizeof(optval));
if (error)
break;
error = ip6_mrouter_init(so, optval, sopt->sopt_name);
break;
case MRT6_DONE:
error = X_ip6_mrouter_done();
break;
case MRT6_ADD_MIF:
error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc));
if (error)
break;
error = add_m6if(&mifc);
break;
case MRT6_ADD_MFC:
error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc));
if (error)
break;
error = add_m6fc(&mfcc);
break;
case MRT6_DEL_MFC:
error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc));
if (error)
break;
error = del_m6fc(&mfcc);
break;
case MRT6_DEL_MIF:
error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi));
if (error)
break;
error = del_m6if(&mifi);
break;
case MRT6_PIM:
error = sooptcopyin(sopt, &optval, sizeof(optval),
sizeof(optval));
if (error)
break;
error = set_pim6(&optval);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
/*
* Handle MRT getsockopt commands
*/
int
X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt)
{
+ INIT_VNET_INET6(curvnet);
int error = 0;
if (so != ip6_mrouter)
return (EACCES);
switch (sopt->sopt_name) {
case MRT6_PIM:
error = sooptcopyout(sopt, &V_pim6, sizeof(V_pim6));
break;
}
return (error);
}
/*
* Handle ioctl commands to obtain information from the cache
*/
int
X_mrt6_ioctl(int cmd, caddr_t data)
{
switch (cmd) {
case SIOCGETSGCNT_IN6:
return (get_sg_cnt((struct sioc_sg_req6 *)data));
case SIOCGETMIFCNT_IN6:
return (get_mif6_cnt((struct sioc_mif_req6 *)data));
default:
return (EINVAL);
}
}
/*
* Returns the packet, byte, and rpf-failure counts for the source/group
* pair provided.
*/
static int
get_sg_cnt(struct sioc_sg_req6 *req)
{
struct mf6c *rt;
int s;
s = splnet();
MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt);
splx(s);
if (rt != NULL) {
req->pktcnt = rt->mf6c_pkt_cnt;
req->bytecnt = rt->mf6c_byte_cnt;
req->wrong_if = rt->mf6c_wrong_if;
} else
return (ESRCH);
#if 0
req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
#endif
return (0);
}
/*
* Returns the input and output packet and byte counts for the mif provided.
*/
static int
get_mif6_cnt(struct sioc_mif_req6 *req)
{
mifi_t mifi = req->mifi;
if (mifi >= nummifs)
return (EINVAL);
req->icount = mif6table[mifi].m6_pkt_in;
req->ocount = mif6table[mifi].m6_pkt_out;
req->ibytes = mif6table[mifi].m6_bytes_in;
req->obytes = mif6table[mifi].m6_bytes_out;
return (0);
}
static int
set_pim6(int *i)
{
+ INIT_VNET_INET6(curvnet);
if ((*i != 1) && (*i != 0))
return (EINVAL);
V_pim6 = *i;
return (0);
}
/*
* Enable multicast routing
*/
static int
ip6_mrouter_init(struct socket *so, int v, int cmd)
{
+ INIT_VNET_INET6(curvnet);
+
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_DEBUG,
"ip6_mrouter_init: so_type = %d, pr_protocol = %d\n",
so->so_type, so->so_proto->pr_protocol);
#endif
if (so->so_type != SOCK_RAW ||
so->so_proto->pr_protocol != IPPROTO_ICMPV6)
return (EOPNOTSUPP);
if (v != 1)
return (ENOPROTOOPT);
if (ip6_mrouter != NULL)
return (EADDRINUSE);
ip6_mrouter = so;
V_ip6_mrouter_ver = cmd;
bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
bzero((caddr_t)n6expire, sizeof(n6expire));
V_pim6 = 0;/* used for stubbing out/in pim stuff */
callout_init(&expire_upcalls_ch, 0);
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_init\n");
#endif
return (0);
}
/*
* Disable multicast routing
*/
int
X_ip6_mrouter_done(void)
{
+ INIT_VNET_INET6(curvnet);
mifi_t mifi;
int i;
struct mf6c *rt;
struct rtdetq *rte;
int s;
s = splnet();
/*
* For each phyint in use, disable promiscuous reception of all IPv6
* multicasts.
*/
#ifdef INET
#ifdef MROUTING
/*
* If an IPv4 multicast routing daemon is still running, leave the
* interfaces in allmulti mode so they keep receiving all multicast
* packets.
* XXX: there may be interfaces the IPv4 multicast daemon is not
* interested in...
*/
if (!V_ip_mrouter)
#endif
#endif
{
for (mifi = 0; mifi < nummifs; mifi++) {
if (mif6table[mifi].m6_ifp &&
!(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
if_allmulti(mif6table[mifi].m6_ifp, 0);
}
}
}
bzero((caddr_t)mif6table, sizeof(mif6table));
nummifs = 0;
V_pim6 = 0; /* used to stub out/in pim specific code */
callout_stop(&expire_upcalls_ch);
/*
* Free all multicast forwarding cache entries.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
rt = mf6ctable[i];
while (rt) {
struct mf6c *frt;
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
m_free(rte->m);
free(rte, M_MRTABLE6);
rte = n;
}
frt = rt;
rt = rt->mf6c_next;
free(frt, M_MRTABLE6);
}
}
bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
/*
* Reset register interface
*/
if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) {
if_detach(multicast_register_if6);
if_free(multicast_register_if6);
reg_mif_num = (mifi_t)-1;
multicast_register_if6 = NULL;
}
ip6_mrouter = NULL;
V_ip6_mrouter_ver = 0;
splx(s);
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_done\n");
#endif
return (0);
}
static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };
/*
* Add a mif to the mif table
*/
static int
add_m6if(struct mif6ctl *mifcp)
{
+ INIT_VNET_NET(curvnet);
struct mif6 *mifp;
struct ifnet *ifp;
int error, s;
if (mifcp->mif6c_mifi >= MAXMIFS)
return (EINVAL);
mifp = mif6table + mifcp->mif6c_mifi;
if (mifp->m6_ifp)
return (EADDRINUSE); /* XXX: is it appropriate? */
if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > V_if_index)
return (ENXIO);
ifp = ifnet_byindex(mifcp->mif6c_pifi);
if (mifcp->mif6c_flags & MIFF_REGISTER) {
if (reg_mif_num == (mifi_t)-1) {
ifp = if_alloc(IFT_OTHER);
if_initname(ifp, "register_mif", 0);
ifp->if_flags |= IFF_LOOPBACK;
if_attach(ifp);
multicast_register_if6 = ifp;
reg_mif_num = mifcp->mif6c_mifi;
/*
* The caller cannot know the ifindex of the register
* interface in advance, so mif6c_pifi is filled in
* automatically.
*/
mifcp->mif6c_pifi = ifp->if_index;
} else {
ifp = multicast_register_if6;
}
} /* if REGISTER */
else {
/* Make sure the interface supports multicast */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return (EOPNOTSUPP);
s = splnet();
error = if_allmulti(ifp, 1);
splx(s);
if (error)
return (error);
}
s = splnet();
mifp->m6_flags = mifcp->mif6c_flags;
mifp->m6_ifp = ifp;
/* initialize per mif pkt counters */
mifp->m6_pkt_in = 0;
mifp->m6_pkt_out = 0;
mifp->m6_bytes_in = 0;
mifp->m6_bytes_out = 0;
splx(s);
/* Adjust nummifs up if the mifi is higher than nummifs */
if (nummifs <= mifcp->mif6c_mifi)
nummifs = mifcp->mif6c_mifi + 1;
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_DEBUG,
"add_mif #%d, phyint %s\n",
mifcp->mif6c_mifi,
ifp->if_xname);
#endif
return (0);
}
/*
* Delete a mif from the mif table
*/
static int
del_m6if(mifi_t *mifip)
{
struct mif6 *mifp = mif6table + *mifip;
mifi_t mifi;
struct ifnet *ifp;
int s;
if (*mifip >= nummifs)
return (EINVAL);
if (mifp->m6_ifp == NULL)
return (EINVAL);
s = splnet();
if (!(mifp->m6_flags & MIFF_REGISTER)) {
/*
* XXX: what if an IPv4 multicast daemon is still
* using the interface?
*/
ifp = mifp->m6_ifp;
if_allmulti(ifp, 0);
} else {
if (reg_mif_num != (mifi_t)-1 &&
multicast_register_if6 != NULL) {
if_detach(multicast_register_if6);
if_free(multicast_register_if6);
reg_mif_num = (mifi_t)-1;
multicast_register_if6 = NULL;
}
}
bzero((caddr_t)mifp, sizeof(*mifp));
/* Adjust nummifs down */
for (mifi = nummifs; mifi > 0; mifi--)
if (mif6table[mifi - 1].m6_ifp)
break;
nummifs = mifi;
splx(s);
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs);
#endif
return (0);
}
/*
* Add an mfc entry
*/
static int
add_m6fc(struct mf6cctl *mfccp)
{
struct mf6c *rt;
u_long hash;
struct rtdetq *rte;
u_short nstl;
int s;
char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
MF6CFIND(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr, rt);
/* If an entry already exists, just update the fields */
if (rt) {
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_MFC) {
log(LOG_DEBUG,
"add_m6fc no upcall h %d o %s g %s p %x\n",
ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr),
ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
}
#endif
s = splnet();
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
splx(s);
return (0);
}
/*
* Find the entry for which the upcall was made and update
*/
s = splnet();
hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr);
for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
&mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL)) {
if (nstl++)
log(LOG_ERR,
"add_m6fc: %s o %s g %s p %x dbx %p\n",
"multiple kernel entries",
ip6_sprintf(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
ip6_sprintf(ip6bufg,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_m6fc o %s g %s p %x dbg %x\n",
ip6_sprintf(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
ip6_sprintf(ip6bufg,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#endif
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0; /* Don't clean this guy up */
n6expire[hash]--;
/* free packets Qed at the end of this entry */
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
ip6_mdq(rte->m, rte->ifp, rt);
m_freem(rte->m);
#ifdef UPCALL_TIMING
collate(&(rte->t));
#endif /* UPCALL_TIMING */
free(rte, M_MRTABLE6);
rte = n;
}
rt->mf6c_stall = NULL;
}
}
/*
* It is possible that an entry is being inserted without an upcall
*/
if (nstl == 0) {
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_mfc no upcall h %d o %s g %s p %x\n",
hash,
ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr),
ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
&mfccp->mf6cc_origin.sin6_addr)&&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr)) {
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
if (rt->mf6c_expire)
n6expire[hash]--;
rt->mf6c_expire = 0;
}
}
if (rt == NULL) {
/* no upcall, so make a new entry */
rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6,
M_NOWAIT);
if (rt == NULL) {
splx(s);
return (ENOBUFS);
}
/* insert new entry at head of hash chain */
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0;
rt->mf6c_stall = NULL;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
}
}
splx(s);
return (0);
}
#ifdef UPCALL_TIMING
/*
* collect delay statistics on the upcalls
*/
static void
collate(struct timeval *t)
{
u_long d;
struct timeval tp;
u_long delta;
GET_TIME(tp);
if (TV_LT(*t, tp))
{
TV_DELTA(tp, *t, delta);
d = delta >> 10;
if (d > UPCALL_MAX)
d = UPCALL_MAX;
++upcall_data[d];
}
}
#endif /* UPCALL_TIMING */
/*
* Delete an mfc entry
*/
static int
del_m6fc(struct mf6cctl *mfccp)
{
struct sockaddr_in6 origin;
struct sockaddr_in6 mcastgrp;
struct mf6c *rt;
struct mf6c **nptr;
u_long hash;
int s;
origin = mfccp->mf6cc_origin;
mcastgrp = mfccp->mf6cc_mcastgrp;
hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr);
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_MFC) {
char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n",
ip6_sprintf(ip6bufo, &origin.sin6_addr),
ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr));
}
#endif
s = splnet();
nptr = &mf6ctable[hash];
while ((rt = *nptr) != NULL) {
if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr,
&rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr,
&rt->mf6c_mcastgrp.sin6_addr) &&
rt->mf6c_stall == NULL)
break;
nptr = &rt->mf6c_next;
}
if (rt == NULL) {
splx(s);
return (EADDRNOTAVAIL);
}
*nptr = rt->mf6c_next;
free(rt, M_MRTABLE6);
splx(s);
return (0);
}
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src)
{
if (s) {
if (sbappendaddr(&s->so_rcv,
(struct sockaddr *)src,
mm, (struct mbuf *)0) != 0) {
sorwakeup(s);
return (0);
}
}
m_freem(mm);
return (-1);
}
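/*
* Note: if the receive buffer is full, sbappendaddr() fails, so
* socket_send() frees the mbuf and returns -1; callers treat this
* as an upcall-queue overflow and bump mrt6s_upq_sockfull instead
* of retrying.
*/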
/*
* IPv6 multicast forwarding function. This function assumes that the packet
* pointed to by "ip6" has arrived on (or is about to be sent to) the interface
* pointed to by "ifp", and the packet is to be relayed to other networks
* that have members of the packet's destination IPv6 multicast group.
*
* The packet is returned unscathed to the caller, unless it is
* erroneous, in which case a non-zero return value tells the caller to
* discard it.
*
* NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff
* this function is called in the originating context (i.e., not when
* forwarding a packet from another node). ip6_output(), currently the
* only function that calls this function, is called in the originating
* context and explicitly ensures this condition. It is the caller's
* responsibility to ensure the same if this function is ever called
* from elsewhere in the originating context.
*/
int
X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m)
{
+ INIT_VNET_INET6(curvnet);
struct mf6c *rt;
struct mif6 *mifp;
struct mbuf *mm;
int s;
mifi_t mifi;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ifp->if_index);
#endif
/*
* Don't forward a packet with Hop limit of zero or one,
* or a packet destined to a local-only group.
*/
if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) ||
IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
return (0);
ip6->ip6_hlim--;
/*
* Source address check: do not forward packets with an unspecified
* source. This was discussed in July 2000 on the ipngwg mailing list.
* It is rather more serious than the unicast cases, because some
* MLD packets can be sent with the unspecified source address
* (although such packets must normally set the hop limit field to 1).
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
V_ip6stat.ip6s_cantforward++;
if (V_ip6_log_time + V_ip6_log_interval < time_second) {
V_ip6_log_time = time_second;
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on %s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
if_name(m->m_pkthdr.rcvif));
}
return (0);
}
/*
* Determine forwarding mifs from the forwarding cache table
*/
s = splnet();
MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
/* Entry exists, so forward if necessary */
if (rt) {
splx(s);
return (ip6_mdq(m, ifp, rt));
} else {
/*
* If we don't have a route for the packet's origin,
* make a copy of the packet and send a message to
* the routing daemon.
*/
struct mbuf *mb0;
struct rtdetq *rte;
u_long hash;
/* int i, npkts;*/
#ifdef UPCALL_TIMING
struct timeval tp;
GET_TIME(tp);
#endif /* UPCALL_TIMING */
mrt6stat.mrt6s_no_route++;
#ifdef MRT6DEBUG
if (V_mrt6debug & (DEBUG_FORWARD | DEBUG_MFC))
log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst));
#endif
/*
* Allocate mbufs early so that we don't do extra work if we
* are just going to fail anyway.
*/
rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6,
M_NOWAIT);
if (rte == NULL) {
splx(s);
return (ENOBUFS);
}
mb0 = m_copy(m, 0, M_COPYALL);
/*
* Pullup packet header if needed before storing it,
* as other references may modify it in the meantime.
*/
if (mb0 &&
(M_HASCL(mb0) || mb0->m_len < sizeof(struct ip6_hdr)))
mb0 = m_pullup(mb0, sizeof(struct ip6_hdr));
if (mb0 == NULL) {
free(rte, M_MRTABLE6);
splx(s);
return (ENOBUFS);
}
/* is there an upcall waiting for this packet? */
hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst);
for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
&rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&rt->mf6c_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL))
break;
}
if (rt == NULL) {
struct mrt6msg *im;
#ifdef MRT6_OINIT
struct omrt6msg *oim;
#endif
/* no upcall, so make a new entry */
rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6,
M_NOWAIT);
if (rt == NULL) {
free(rte, M_MRTABLE6);
m_freem(mb0);
splx(s);
return (ENOBUFS);
}
/*
* Make a copy of the header to send to the user
* level process
*/
mm = m_copy(mb0, 0, sizeof(struct ip6_hdr));
if (mm == NULL) {
free(rte, M_MRTABLE6);
m_freem(mb0);
free(rt, M_MRTABLE6);
splx(s);
return (ENOBUFS);
}
/*
* Send message to routing daemon
*/
sin6.sin6_addr = ip6->ip6_src;
im = NULL;
#ifdef MRT6_OINIT
oim = NULL;
#endif
switch (V_ip6_mrouter_ver) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_NOCACHE;
oim->im6_mbz = 0;
break;
#endif
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_NOCACHE;
im->im6_mbz = 0;
break;
default:
free(rte, M_MRTABLE6);
m_freem(mb0);
free(rt, M_MRTABLE6);
splx(s);
return (EINVAL);
}
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"getting the iif info in the kernel\n");
#endif
for (mifp = mif6table, mifi = 0;
mifi < nummifs && mifp->m6_ifp != ifp;
mifp++, mifi++)
;
switch (V_ip6_mrouter_ver) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
oim->im6_mif = mifi;
break;
#endif
case MRT6_INIT:
im->im6_mif = mifi;
break;
}
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
log(LOG_WARNING, "ip6_mforward: ip6_mrouter "
"socket queue full\n");
mrt6stat.mrt6s_upq_sockfull++;
free(rte, M_MRTABLE6);
m_freem(mb0);
free(rt, M_MRTABLE6);
splx(s);
return (ENOBUFS);
}
mrt6stat.mrt6s_upcalls++;
/* insert new entry at head of hash chain */
bzero(rt, sizeof(*rt));
rt->mf6c_origin.sin6_family = AF_INET6;
rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6);
rt->mf6c_origin.sin6_addr = ip6->ip6_src;
rt->mf6c_mcastgrp.sin6_family = AF_INET6;
rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6);
rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst;
rt->mf6c_expire = UPCALL_EXPIRE;
n6expire[hash]++;
rt->mf6c_parent = MF6C_INCOMPLETE_PARENT;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
/* Add this entry to the end of the queue */
rt->mf6c_stall = rte;
} else {
/* determine if q has overflowed */
struct rtdetq **p;
int npkts = 0;
for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next)
if (++npkts > MAX_UPQ6) {
mrt6stat.mrt6s_upq_ovflw++;
free(rte, M_MRTABLE6);
m_freem(mb0);
splx(s);
return (0);
}
/* Add this entry to the end of the queue */
*p = rte;
}
rte->next = NULL;
rte->m = mb0;
rte->ifp = ifp;
#ifdef UPCALL_TIMING
rte->t = tp;
#endif /* UPCALL_TIMING */
splx(s);
return (0);
}
}
/*
* Clean up cache entries if upcalls are not serviced.
* Called from the slow timeout mechanism four times per second
* (every EXPIRE_TIMEOUT ticks).
*/
static void
expire_upcalls(void *unused)
{
struct rtdetq *rte;
struct mf6c *mfc, **nptr;
int i;
int s;
s = splnet();
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
nptr = &mf6ctable[i];
while ((mfc = *nptr) != NULL) {
rte = mfc->mf6c_stall;
/*
* Skip real (resolved) cache entries; make sure the entry
* wasn't marked to never expire (shouldn't happen), and
* handle it if it expires now.
*/
if (rte != NULL &&
mfc->mf6c_expire != 0 &&
--mfc->mf6c_expire == 0) {
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_EXPIRE) {
char ip6bufo[INET6_ADDRSTRLEN];
char ip6bufg[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "expire_upcalls: expiring (%s %s)\n",
ip6_sprintf(ip6bufo, &mfc->mf6c_origin.sin6_addr),
ip6_sprintf(ip6bufg, &mfc->mf6c_mcastgrp.sin6_addr));
}
#endif
/*
* drop all the packets
* free the mbuf with the pkt, if, timing info
*/
do {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE6);
rte = n;
} while (rte != NULL);
mrt6stat.mrt6s_cache_cleanups++;
n6expire[i]--;
*nptr = mfc->mf6c_next;
free(mfc, M_MRTABLE6);
} else {
nptr = &mfc->mf6c_next;
}
}
}
splx(s);
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
}
/*
* Packet forwarding routine once entry in the cache is made
*/
static int
ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
mifi_t mifi, iif;
struct mif6 *mifp;
int plen = m->m_pkthdr.len;
struct in6_addr src0, dst0; /* copies for local work */
u_int32_t iszone, idzone, oszone, odzone;
int error = 0;
/*
* Macro to send packet on mif. Since RSVP packets don't get counted on
* input, they shouldn't get counted on output, so statistics keeping is
* separate.
*/
#define MC6_SEND(ip6, mifp, m) do { \
if ((mifp)->m6_flags & MIFF_REGISTER) \
register_send((ip6), (mifp), (m)); \
else \
phyint_send((ip6), (mifp), (m)); \
} while (/*CONSTCOND*/ 0)
/*
* Don't forward if it didn't arrive from the parent mif
* for its origin.
*/
mifi = rt->mf6c_parent;
if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) {
/* came in the wrong interface */
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"wrong if: ifid %d mifi %d mififid %x\n",
ifp->if_index, mifi,
mif6table[mifi].m6_ifp->if_index);
#endif
mrt6stat.mrt6s_wrong_if++;
rt->mf6c_wrong_if++;
/*
* If we are doing PIM processing, and we are forwarding
* packets on this interface, send a message to the
* routing daemon.
*/
/* have to make sure this is a valid mif */
if (mifi < nummifs && mif6table[mifi].m6_ifp)
if (V_pim6 && (m->m_flags & M_LOOP) == 0) {
/*
* Check the M_LOOP flag to avoid an
* unnecessary PIM assert.
* XXX: M_LOOP is an ad-hoc hack...
*/
static struct sockaddr_in6 sin6 =
{ sizeof(sin6), AF_INET6 };
struct mbuf *mm;
struct mrt6msg *im;
#ifdef MRT6_OINIT
struct omrt6msg *oim;
#endif
mm = m_copy(m, 0, sizeof(struct ip6_hdr));
if (mm &&
(M_HASCL(mm) ||
mm->m_len < sizeof(struct ip6_hdr)))
mm = m_pullup(mm, sizeof(struct ip6_hdr));
if (mm == NULL)
return (ENOBUFS);
#ifdef MRT6_OINIT
oim = NULL;
#endif
im = NULL;
switch (V_ip6_mrouter_ver) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_WRONGMIF;
oim->im6_mbz = 0;
break;
#endif
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_WRONGMIF;
im->im6_mbz = 0;
break;
default:
m_freem(mm);
return (EINVAL);
}
for (mifp = mif6table, iif = 0;
iif < nummifs && mifp &&
mifp->m6_ifp != ifp;
mifp++, iif++)
;
switch (V_ip6_mrouter_ver) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
oim->im6_mif = iif;
sin6.sin6_addr = oim->im6_src;
break;
#endif
case MRT6_INIT:
im->im6_mif = iif;
sin6.sin6_addr = im->im6_src;
break;
}
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return (ENOBUFS);
} /* if socket Q full */
} /* if PIM */
return (0);
} /* if wrong iif */
/* If I sourced this packet, it counts as output, else it was input. */
if (m->m_pkthdr.rcvif == NULL) {
/* XXX: is rcvif really NULL when output?? */
mif6table[mifi].m6_pkt_out++;
mif6table[mifi].m6_bytes_out += plen;
} else {
mif6table[mifi].m6_pkt_in++;
mif6table[mifi].m6_bytes_in += plen;
}
rt->mf6c_pkt_cnt++;
rt->mf6c_byte_cnt += plen;
/*
* For each mif, forward a copy of the packet if there are group
* members downstream on the interface.
*/
src0 = ip6->ip6_src;
dst0 = ip6->ip6_dst;
if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 ||
(error = in6_setscope(&dst0, ifp, &idzone)) != 0) {
V_ip6stat.ip6s_badscope++;
return (error);
}
for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) {
if (IF_ISSET(mifi, &rt->mf6c_ifset)) {
/*
* check if the outgoing packet is going to break
* a scope boundary.
* XXX For packets through PIM register tunnel
* interface, we believe a routing daemon.
*/
if (!(mif6table[rt->mf6c_parent].m6_flags &
MIFF_REGISTER) &&
!(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
if (in6_setscope(&src0, mif6table[mifi].m6_ifp,
&oszone) ||
in6_setscope(&dst0, mif6table[mifi].m6_ifp,
&odzone) ||
iszone != oszone ||
idzone != odzone) {
V_ip6stat.ip6s_badscope++;
continue;
}
}
mifp->m6_pkt_out++;
mifp->m6_bytes_out += plen;
MC6_SEND(ip6, mifp, m);
}
}
return (0);
}
static void
phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *mb_copy;
struct ifnet *ifp = mifp->m6_ifp;
int error = 0;
int s = splnet(); /* needs to protect static "ro" below. */
static struct route_in6 ro;
struct in6_multi *in6m;
struct sockaddr_in6 *dst6;
u_long linkmtu;
/*
* Make a new reference to the packet; make sure that
* the IPv6 header is actually copied, not just referenced,
* so that ip6_output() only scribbles on the copy.
*/
mb_copy = m_copy(m, 0, M_COPYALL);
if (mb_copy &&
(M_HASCL(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr)))
mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr));
if (mb_copy == NULL) {
splx(s);
return;
}
/* set MCAST flag to the outgoing packet */
mb_copy->m_flags |= M_MCAST;
/*
* If we sourced the packet, call ip6_output(), since we may divide
* the packet into fragments when it is too big for the outgoing
* interface.
* Otherwise, we can simply send the packet to the interface's
* sending queue.
*/
if (m->m_pkthdr.rcvif == NULL) {
struct ip6_moptions im6o;
im6o.im6o_multicast_ifp = ifp;
/* XXX: ip6_output will override ip6->ip6_hlim */
im6o.im6o_multicast_hlim = ip6->ip6_hlim;
im6o.im6o_multicast_loop = 1;
error = ip6_output(mb_copy, NULL, &ro,
IPV6_FORWARDING, &im6o, NULL, NULL);
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %d err %d\n",
mifp - mif6table, error);
#endif
splx(s);
return;
}
/*
* If we belong to the destination multicast group
* on the outgoing interface, loop back a copy.
*/
dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m);
if (in6m != NULL) {
dst6->sin6_len = sizeof(struct sockaddr_in6);
dst6->sin6_family = AF_INET6;
dst6->sin6_addr = ip6->ip6_dst;
ip6_mloopback(ifp, m, (struct sockaddr_in6 *)&ro.ro_dst);
}
/*
* Put the packet into the sending queue of the outgoing interface
* if it would fit in the MTU of the interface.
*/
linkmtu = IN6_LINKMTU(ifp);
if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) {
dst6->sin6_len = sizeof(struct sockaddr_in6);
dst6->sin6_family = AF_INET6;
dst6->sin6_addr = ip6->ip6_dst;
/*
* We just call if_output instead of nd6_output here, since
* we need no ND for a multicast forwarded packet...right?
*/
error = (*ifp->if_output)(ifp, mb_copy,
(struct sockaddr *)&ro.ro_dst, NULL);
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %d err %d\n",
mifp - mif6table, error);
#endif
} else {
/*
* Path MTU discovery is intentionally disabled by default, since
* various routers may report the path MTU via multicast, which
* could be used to mount a DoS attack against a router.
*/
if (V_ip6_mcast_pmtu)
icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu);
else {
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_XMIT) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"phyint_send: packet too big on %s o %s "
"g %s size %d(discarded)\n",
if_name(ifp),
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
mb_copy->m_pkthdr.len);
}
#endif /* MRT6DEBUG */
m_freem(mb_copy); /* simply discard the packet */
}
}
splx(s);
}
static int
register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m)
{
struct mbuf *mm;
int i, len = m->m_pkthdr.len;
static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };
struct mrt6msg *im6;
#ifdef MRT6DEBUG
if (V_mrt6debug) {
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst));
}
#endif
++pim6stat.pim6s_snd_registers;
/* Make a copy of the packet to send to the user level process */
MGETHDR(mm, M_DONTWAIT, MT_HEADER);
if (mm == NULL)
return (ENOBUFS);
mm->m_pkthdr.rcvif = NULL;
mm->m_data += max_linkhdr;
mm->m_len = sizeof(struct ip6_hdr);
if ((mm->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
m_freem(mm);
return (ENOBUFS);
}
i = MHLEN - M_LEADINGSPACE(mm);
if (i > len)
i = len;
mm = m_pullup(mm, i);
if (mm == NULL)
return (ENOBUFS);
/* TODO: check it! */
mm->m_pkthdr.len = len + sizeof(struct ip6_hdr);
/*
* Send message to routing daemon
*/
sin6.sin6_addr = ip6->ip6_src;
im6 = mtod(mm, struct mrt6msg *);
im6->im6_msgtype = MRT6MSG_WHOLEPKT;
im6->im6_mbz = 0;
im6->im6_mif = mif - mif6table;
/* iif info is not given for register encapsulation */
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (V_mrt6debug)
log(LOG_WARNING,
"register_send: ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return (ENOBUFS);
}
return (0);
}
/*
* PIM sparse mode hook
* Receives the pim control messages, and passes them up to the listening
* socket, using rip6_input.
* The only message processed is the REGISTER pim message; the pim header
* is stripped off, and the inner packet is passed to register_mforward.
*/
int
pim6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
struct pim *pim; /* pointer to a pim struct */
struct ip6_hdr *ip6;
int pimlen;
struct mbuf *m = *mp;
int minlen;
int off = *offp;
++pim6stat.pim6s_rcv_total;
ip6 = mtod(m, struct ip6_hdr *);
pimlen = m->m_pkthdr.len - *offp;
/*
* Validate lengths
*/
if (pimlen < PIM_MINLEN) {
++pim6stat.pim6s_rcv_tooshort;
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,"pim6_input: PIM packet too short\n");
#endif
m_freem(m);
return (IPPROTO_DONE);
}
/*
* if the packet is at least as big as a REGISTER, go ahead
* and grab the PIM REGISTER header size, to avoid another
* possible m_pullup() later.
*
* PIM_MINLEN == pimhdr + u_int32 == 8
* PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40
*/
minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN;
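/*
* Arithmetic note (values per the comment above): PIM_MINLEN == 8
* (pim header + u_int32) and PIM6_REG_MINLEN == 48 (4 + 4 + 40,
* including the encapsulated IPv6 header), so REGISTER-sized packets
* pull up 48 bytes here while shorter PIM messages pull up only 8.
*/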
/*
* Make sure that the IP6 and PIM headers are in contiguous memory,
* and possibly the PIM REGISTER header as well.
*/
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, minlen, IPPROTO_DONE);
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* adjust mbuf to point to the PIM header */
pim = (struct pim *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen);
if (pim == NULL) {
pim6stat.pim6s_rcv_tooshort++;
return (IPPROTO_DONE);
}
#endif
#define PIM6_CHECKSUM
#ifdef PIM6_CHECKSUM
{
int cksumlen;
/*
* Validate checksum.
* If PIM REGISTER, exclude the data packet
*/
if (pim->pim_type == PIM_REGISTER)
cksumlen = PIM_MINLEN;
else
cksumlen = pimlen;
if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) {
++pim6stat.pim6s_rcv_badsum;
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: invalid checksum\n");
#endif
m_freem(m);
return (IPPROTO_DONE);
}
}
#endif /* PIM6_CHECKSUM */
/* PIM version check */
if (pim->pim_ver != PIM_VERSION) {
++pim6stat.pim6s_rcv_badversion;
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: incorrect version %d, expecting %d\n",
pim->pim_ver, PIM_VERSION);
#endif
m_freem(m);
return (IPPROTO_DONE);
}
if (pim->pim_type == PIM_REGISTER) {
/*
* Since this is a REGISTER, we'll make a copy of the register
* headers (ip6 + pim + u_int32_t + encapsulated ip6), to be passed
* up to the routing daemon.
*/
static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 };
struct mbuf *mcp;
struct ip6_hdr *eip6;
u_int32_t *reghdr;
int rc;
#ifdef MRT6DEBUG
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#endif
++pim6stat.pim6s_rcv_registers;
if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) {
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: register mif not set: %d\n",
reg_mif_num);
#endif
m_freem(m);
return (IPPROTO_DONE);
}
reghdr = (u_int32_t *)(pim + 1);
if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
goto pim6_input_to_daemon;
/*
* Validate length
*/
if (pimlen < PIM6_REG_MINLEN) {
++pim6stat.pim6s_rcv_tooshort;
++pim6stat.pim6s_rcv_badregisters;
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: register packet size too "
"small %d from %s\n",
pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src));
#endif
m_freem(m);
return (IPPROTO_DONE);
}
eip6 = (struct ip6_hdr *) (reghdr + 1);
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input[register], eip6: %s -> %s, "
"eip6 plen %d\n",
ip6_sprintf(ip6bufs, &eip6->ip6_src),
ip6_sprintf(ip6bufd, &eip6->ip6_dst),
ntohs(eip6->ip6_plen));
#endif
/* verify the version number of the inner packet */
if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
++pim6stat.pim6s_rcv_badregisters;
#ifdef MRT6DEBUG
log(LOG_DEBUG, "pim6_input: invalid IP version (%d) "
"of the inner packet\n",
(eip6->ip6_vfc & IPV6_VERSION));
#endif
m_freem(m);
return (IPPROTO_NONE);
}
/* verify the inner packet is destined to a mcast group */
if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) {
++pim6stat.pim6s_rcv_badregisters;
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: inner packet of register "
"is not multicast %s\n",
ip6_sprintf(ip6bufd, &eip6->ip6_dst));
#endif
m_freem(m);
return (IPPROTO_DONE);
}
/*
* make a copy of the whole header to pass to the daemon later.
*/
mcp = m_copy(m, 0, off + PIM6_REG_MINLEN);
if (mcp == NULL) {
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: pim register: "
"could not copy register head\n");
#endif
m_freem(m);
return (IPPROTO_DONE);
}
/*
* forward the inner ip6 packet; point m_data at the inner ip6.
*/
m_adj(m, off + PIM_MINLEN);
#ifdef MRT6DEBUG
if (V_mrt6debug & DEBUG_PIM) {
log(LOG_DEBUG,
"pim6_input: forwarding decapsulated register: "
"src %s, dst %s, mif %d\n",
ip6_sprintf(ip6bufs, &eip6->ip6_src),
ip6_sprintf(ip6bufd, &eip6->ip6_dst),
reg_mif_num);
}
#endif
rc = if_simloop(mif6table[reg_mif_num].m6_ifp, m,
dst.sin6_family, 0);
/* prepare the register head to send to the mrouting daemon */
m = mcp;
}
/*
* Pass the PIM message up to the daemon; if it is a register message,
* pass only the 'head'. This includes the encapsulating ip6 header,
* the pim header, the register header, and the encapsulated ip6
* header.
*/
pim6_input_to_daemon:
rip6_input(&m, offp, proto);
return (IPPROTO_DONE);
}
Index: head/sys/netinet6/ip6_output.c
===================================================================
--- head/sys/netinet6/ip6_output.c (revision 183549)
+++ head/sys/netinet6/ip6_output.c (revision 183550)
@@ -1,3338 +1,3345 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ucred.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#include <netinet6/ip6_ipsec.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "internet multicast options");
struct ip6_exthdrs {
struct mbuf *ip6e_ip6;
struct mbuf *ip6e_hbh;
struct mbuf *ip6e_dest1;
struct mbuf *ip6e_rthdr;
struct mbuf *ip6e_dest2;
};
static int ip6_pcbopt __P((int, u_char *, int, struct ip6_pktopts **,
struct ucred *, int));
static int ip6_pcbopts __P((struct ip6_pktopts **, struct mbuf *,
struct socket *, struct sockopt *));
static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
static int ip6_setpktopt __P((int, u_char *, int, struct ip6_pktopts *,
struct ucred *, int, int, int));
static int ip6_setmoptions(int, struct ip6_moptions **, struct mbuf *);
static int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf **);
static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
static int ip6_insertfraghdr __P((struct mbuf *, struct mbuf *, int,
struct ip6_frag **));
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu __P((struct route_in6 *, struct route_in6 *,
struct ifnet *, struct in6_addr *, u_long *, int *));
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
/*
* Make an extension header from option data. hp is the source, and
* mp is the destination.
*/
#define MAKE_EXTHDR(hp, mp) \
do { \
if (hp) { \
struct ip6_ext *eh = (struct ip6_ext *)(hp); \
error = ip6_copyexthdr((mp), (caddr_t)(hp), \
((eh)->ip6e_len + 1) << 3); \
if (error) \
goto freehdrs; \
} \
} while (/*CONSTCOND*/ 0)
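/*
 * Note: ip6e_len counts 8-octet units not including the first 8
 * octets (RFC 2460), hence the ((eh)->ip6e_len + 1) << 3 above;
 * e.g. ip6e_len == 0 describes an 8-byte header and ip6e_len == 2
 * a 24-byte one.
 */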
/*
* Form a chain of extension headers.
* m is the extension header mbuf
* mp is the previous mbuf in the chain
* p is the next header
* i is the type of option.
*/
#define MAKE_CHAIN(m, mp, p, i)\
do {\
if (m) {\
if (!hdrsplit) \
panic("assumption failed: hdr not split"); \
*mtod((m), u_char *) = *(p);\
*(p) = (i);\
p = mtod((m), u_char *);\
(m)->m_next = (mp)->m_next;\
(mp)->m_next = (m);\
(mp) = (m);\
}\
} while (/*CONSTCOND*/ 0)
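/*
 * Illustrative walk-through: starting from the split chain
 * "IPv6 -> payload" with mprev == m and nexthdrp == &ip6->ip6_nxt,
 * MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS)
 * copies the old next-header value into the first byte of the hbh
 * mbuf, stores IPPROTO_HOPOPTS in ip6->ip6_nxt, links the hbh mbuf
 * right after the IPv6 header, and advances mprev/nexthdrp so the
 * next MAKE_CHAIN call appends after hbh.
 */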
/*
* IP6 output. The packet in mbuf chain m contains a skeletal IP6
* header (with pri, len, nxt, hlim, src, dst).
* This function may modify ver and hlim only.
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
*
* type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
* nd_ifinfo.linkmtu is u_int32_t, so we use u_long to hold the largest one,
* which is rt_rmx.rmx_mtu.
*
* ifpp - XXX: just for statistics
*/
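/*
 * A minimal sketch of a typical call (illustrative, not from this
 * file): a transport protocol hands off a skeletal packet as
 *
 *	error = ip6_output(m, in6p->in6p_outputopts, NULL, 0,
 *	    in6p->in6p_moptions, NULL, in6p);
 *
 * passing a NULL ro so that the local ip6route cache below is used.
 */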
int
ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
struct ifnet **ifpp, struct inpcb *inp)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6, *mhip6;
struct ifnet *ifp, *origifp;
struct mbuf *m = m0;
struct mbuf *mprev = NULL;
int hlen, tlen, len, off;
struct route_in6 ip6route;
struct rtentry *rt = NULL;
struct sockaddr_in6 *dst, src_sa, dst_sa;
struct in6_addr odst;
int error = 0;
struct in6_ifaddr *ia = NULL;
u_long mtu;
int alwaysfrag, dontfrag;
u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
struct ip6_exthdrs exthdrs;
struct in6_addr finaldst, src0, dst0;
u_int32_t zone;
struct route_in6 *ro_pmtu = NULL;
int hdrsplit = 0;
int needipsec = 0;
#ifdef IPSEC
struct ipsec_output_state state;
struct ip6_rthdr *rh = NULL;
int needipsectun = 0;
int segleft_org = 0;
struct secpolicy *sp = NULL;
#endif /* IPSEC */
ip6 = mtod(m, struct ip6_hdr *);
if (ip6 == NULL) {
printf("ip6 is NULL\n");
goto bad;
}
finaldst = ip6->ip6_dst;
bzero(&exthdrs, sizeof(exthdrs));
if (opt) {
/* Hop-by-Hop options header */
MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
/* Destination options header(1st part) */
if (opt->ip6po_rthdr) {
/*
* Destination options header(1st part)
* This only makes sense with a routing header.
* See Section 9.2 of RFC 3542.
* Disabling this part just for MIP6 convenience is
* a bad idea. We need to think carefully about a
* way to make the advanced API coexist with MIP6
* options, which might automatically be inserted in
* the kernel.
*/
MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
}
/* Routing header */
MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
/* Destination options header(2nd part) */
MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
}
/*
* IPSec checking which handles several cases.
* FAST IPSEC: We re-injected the packet.
*/
#ifdef IPSEC
switch(ip6_ipsec_output(&m, inp, &flags, &error, &ifp, &sp))
{
case 1: /* Bad packet */
goto freehdrs;
case -1: /* Do IPSec */
needipsec = 1;
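/* FALLTHROUGH */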
case 0: /* No IPSec */
default:
break;
}
#endif /* IPSEC */
/*
* Calculate the total length of the extension header chain.
* Keep the length of the unfragmentable part for fragmentation.
*/
optlen = 0;
if (exthdrs.ip6e_hbh)
optlen += exthdrs.ip6e_hbh->m_len;
if (exthdrs.ip6e_dest1)
optlen += exthdrs.ip6e_dest1->m_len;
if (exthdrs.ip6e_rthdr)
optlen += exthdrs.ip6e_rthdr->m_len;
unfragpartlen = optlen + sizeof(struct ip6_hdr);
/* NOTE: we don't add AH/ESP length here. do that later. */
if (exthdrs.ip6e_dest2)
optlen += exthdrs.ip6e_dest2->m_len;
/*
* If we need IPsec, or there is at least one extension header,
* separate IP6 header from the payload.
*/
if ((needipsec || optlen) && !hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* adjust mbuf packet header length */
m->m_pkthdr.len += optlen;
plen = m->m_pkthdr.len - sizeof(*ip6);
/* If this is a jumbo payload, insert a jumbo payload option. */
if (plen > IPV6_MAXPACKET) {
if (!hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
goto freehdrs;
ip6->ip6_plen = 0;
} else
ip6->ip6_plen = htons(plen);
/*
* Concatenate headers and fill in next header fields.
* Here we have, on "m"
* IPv6 payload
* and we insert headers accordingly. Finally, we should be getting:
* IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
*
* during the header composing process, "m" points to IPv6 header.
* "mprev" points to an extension header prior to esp.
*/
u_char *nexthdrp = &ip6->ip6_nxt;
mprev = m;
/*
* we treat dest2 specially. this makes IPsec processing
* much easier. the goal here is to make mprev point to the
* mbuf prior to dest2.
*
* result: IPv6 dest2 payload
* m and mprev will point to IPv6 header.
*/
if (exthdrs.ip6e_dest2) {
if (!hdrsplit)
panic("assumption failed: hdr not split");
exthdrs.ip6e_dest2->m_next = m->m_next;
m->m_next = exthdrs.ip6e_dest2;
*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_DSTOPTS;
}
/*
* result: IPv6 hbh dest1 rthdr dest2 payload
* m will point to IPv6 header. mprev will point to the
* extension header prior to dest2 (rthdr in the above case).
*/
MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
IPPROTO_DSTOPTS);
MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
IPPROTO_ROUTING);
#ifdef IPSEC
if (!needipsec)
goto skip_ipsec2;
/*
* pointers after IPsec headers are not valid any more.
* other pointers need great care too.
* (IPsec routines should not mangle mbufs prior to AH/ESP)
*/
exthdrs.ip6e_dest2 = NULL;
if (exthdrs.ip6e_rthdr) {
rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
segleft_org = rh->ip6r_segleft;
rh->ip6r_segleft = 0;
}
bzero(&state, sizeof(state));
state.m = m;
error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags,
&needipsectun);
m = state.m;
if (error == EJUSTRETURN) {
/*
* We had a SP with a level of 'use' and no SA. We
* will just continue to process the packet without
* IPsec processing.
*/
;
} else if (error) {
/* mbuf is already reclaimed in ipsec6_output_trans. */
m = NULL;
switch (error) {
case EHOSTUNREACH:
case ENETUNREACH:
case EMSGSIZE:
case ENOBUFS:
case ENOMEM:
break;
default:
printf("[%s:%d] (ipsec): error code %d\n",
__func__, __LINE__, error);
/* FALLTHROUGH */
case ENOENT:
/* don't show these error codes to the user */
error = 0;
break;
}
goto bad;
} else if (!needipsectun) {
/*
* In the FAST IPSec case we have already
* re-injected the packet and it has been freed
* by the ipsec_done() function. So, just clean
* up after ourselves.
*/
m = NULL;
goto done;
}
if (exthdrs.ip6e_rthdr) {
/* ah6_output doesn't modify mbuf chain */
rh->ip6r_segleft = segleft_org;
}
skip_ipsec2:;
#endif /* IPSEC */
/*
* If there is a routing header, replace the destination address field
* with the first hop of the routing header.
*/
if (exthdrs.ip6e_rthdr) {
struct ip6_rthdr *rh = mtod(exthdrs.ip6e_rthdr,
struct ip6_rthdr *);
struct ip6_rthdr0 *rh0;
struct in6_addr *addr;
struct sockaddr_in6 sa;
switch (rh->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
rh0 = (struct ip6_rthdr0 *)rh;
addr = (struct in6_addr *)(rh0 + 1);
/*
* construct a sockaddr_in6 form of
* the first hop.
*
* XXX: we may not have enough
* information about its scope zone;
* there is no standard API to pass
* the information from the
* application.
*/
bzero(&sa, sizeof(sa));
sa.sin6_family = AF_INET6;
sa.sin6_len = sizeof(sa);
sa.sin6_addr = addr[0];
if ((error = sa6_embedscope(&sa,
V_ip6_use_defzone)) != 0) {
goto bad;
}
ip6->ip6_dst = sa.sin6_addr;
bcopy(&addr[1], &addr[0], sizeof(struct in6_addr)
* (rh0->ip6r0_segleft - 1));
addr[rh0->ip6r0_segleft - 1] = finaldst;
/* XXX */
in6_clearscope(addr + rh0->ip6r0_segleft - 1);
break;
default: /* is it possible? */
error = EINVAL;
goto bad;
}
}
/* Source address validation */
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
(flags & IPV6_UNSPECSRC) == 0) {
error = EOPNOTSUPP;
V_ip6stat.ip6s_badscope++;
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
error = EOPNOTSUPP;
V_ip6stat.ip6s_badscope++;
goto bad;
}
V_ip6stat.ip6s_localout++;
/*
* Route packet.
*/
if (ro == 0) {
ro = &ip6route;
bzero((caddr_t)ro, sizeof(*ro));
}
ro_pmtu = ro;
if (opt && opt->ip6po_rthdr)
ro = &opt->ip6po_route;
dst = (struct sockaddr_in6 *)&ro->ro_dst;
again:
/*
* if specified, try to fill in the traffic class field.
* do not override if a non-zero value is already set.
* we check the diffserv field and the ecn field separately.
*/
if (opt && opt->ip6po_tclass >= 0) {
int mask = 0;
if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
mask |= 0xfc;
if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
mask |= 0x03;
if (mask != 0)
ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
}
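/*
 * Example: with ip6po_tclass == 0xb8 (DSCP EF, ECN clear) and a zero
 * flow word, mask becomes 0xff and the statement above ORs
 * htonl(0xb8 << 20) into ip6_flow, i.e. into the 8-bit traffic class
 * field at bits 20-27 of the version/class/flow word.
 */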
/* fill in or override the hop limit field, if necessary. */
if (opt && opt->ip6po_hlim != -1)
ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (im6o != NULL)
ip6->ip6_hlim = im6o->im6o_multicast_hlim;
else
ip6->ip6_hlim = V_ip6_defmcasthlim;
}
#ifdef IPSEC
/*
* We may re-inject packets into the stack here.
*/
if (needipsec && needipsectun) {
struct ipsec_output_state state;
/*
* All the extension headers will become inaccessible
* (since they can be encrypted).
* Don't panic, we need no more updates to extension headers
* on inner IPv6 packet (since they are now encapsulated).
*
* IPv6 [ESP|AH] IPv6 [extension headers] payload
*/
bzero(&exthdrs, sizeof(exthdrs));
exthdrs.ip6e_ip6 = m;
bzero(&state, sizeof(state));
state.m = m;
state.ro = (struct route *)ro;
state.dst = (struct sockaddr *)dst;
error = ipsec6_output_tunnel(&state, sp, flags);
m = state.m;
ro = (struct route_in6 *)state.ro;
dst = (struct sockaddr_in6 *)state.dst;
if (error == EJUSTRETURN) {
/*
* We had a SP with a level of 'use' and no SA. We
* will just continue to process the packet without
* IPsec processing.
*/
;
} else if (error) {
/* mbuf is already reclaimed in ipsec6_output_tunnel. */
m0 = m = NULL;
switch (error) {
case EHOSTUNREACH:
case ENETUNREACH:
case EMSGSIZE:
case ENOBUFS:
case ENOMEM:
break;
default:
printf("[%s:%d] (ipsec): error code %d\n",
__func__, __LINE__, error);
/* FALLTHROUGH */
case ENOENT:
/* don't show these error codes to the user */
error = 0;
break;
}
goto bad;
} else {
/*
* In the FAST IPSec case we have already
* re-injected the packet and it has been freed
* by the ipsec_done() function. So, just clean
* up after ourselves.
*/
m = NULL;
goto done;
}
exthdrs.ip6e_ip6 = m;
}
#endif /* IPSEC */
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
bzero(&dst_sa, sizeof(dst_sa));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(dst_sa);
dst_sa.sin6_addr = ip6->ip6_dst;
if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
&ifp, &rt, 0)) != 0) {
switch (error) {
case EHOSTUNREACH:
V_ip6stat.ip6s_noroute++;
break;
case EADDRNOTAVAIL:
default:
break; /* XXX statistics? */
}
if (ifp != NULL)
in6_ifstat_inc(ifp, ifs6_out_discard);
goto bad;
}
if (rt == NULL) {
/*
* If in6_selectroute() does not return a route entry,
* dst may not have been updated.
*/
*dst = dst_sa; /* XXX */
}
/*
* rt (for unicast) and ifp must then be non-NULL, valid values.
*/
if ((flags & IPV6_FORWARDING) == 0) {
/* XXX: the FORWARDING flag can be set for mrouting. */
in6_ifstat_inc(ifp, ifs6_out_request);
}
if (rt != NULL) {
ia = (struct in6_ifaddr *)(rt->rt_ifa);
rt->rt_use++;
}
/*
* The outgoing interface must be in the zone of source and
* destination addresses. We should use ia_ifp to support the
* case of sending packets to an address of our own.
*/
if (ia != NULL && ia->ia_ifp)
origifp = ia->ia_ifp;
else
origifp = ifp;
src0 = ip6->ip6_src;
if (in6_setscope(&src0, origifp, &zone))
goto badscope;
bzero(&src_sa, sizeof(src_sa));
src_sa.sin6_family = AF_INET6;
src_sa.sin6_len = sizeof(src_sa);
src_sa.sin6_addr = ip6->ip6_src;
if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
goto badscope;
dst0 = ip6->ip6_dst;
if (in6_setscope(&dst0, origifp, &zone))
goto badscope;
/* re-initialize to be sure */
bzero(&dst_sa, sizeof(dst_sa));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(dst_sa);
dst_sa.sin6_addr = ip6->ip6_dst;
if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) {
goto badscope;
}
/* scope check is done. */
goto routefound;
badscope:
V_ip6stat.ip6s_badscope++;
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 0)
error = EHOSTUNREACH; /* XXX */
goto bad;
routefound:
if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (opt && opt->ip6po_nextroute.ro_rt) {
/*
* The nexthop is explicitly specified by the
* application. We assume the next hop is an IPv6
* address.
*/
dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
}
else if ((rt->rt_flags & RTF_GATEWAY))
dst = (struct sockaddr_in6 *)rt->rt_gateway;
}
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
} else {
struct in6_multi *in6m;
m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
in6_ifstat_inc(ifp, ifs6_out_mcast);
/*
* Confirm that the outgoing interface supports multicast.
*/
if (!(ifp->if_flags & IFF_MULTICAST)) {
V_ip6stat.ip6s_noroute++;
in6_ifstat_inc(ifp, ifs6_out_discard);
error = ENETUNREACH;
goto bad;
}
IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m);
if (in6m != NULL &&
(im6o == NULL || im6o->im6o_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
ip6_mloopback(ifp, m, dst);
} else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IPV6_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip6_mloopback(),
* above, will be forwarded by the ip6_input() routine,
* if necessary.
*/
if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
/*
* XXX: ip6_mforward expects that rcvif is NULL
* when it is called from the originating path.
* However, it is not always the case, since
* some versions of MGETHDR() do not
* initialize the field.
*/
m->m_pkthdr.rcvif = NULL;
if (ip6_mforward(ip6, ifp, m) != 0) {
m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a hoplimit of zero may be looped back,
* above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip6_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
m_freem(m);
goto done;
}
}
/*
* Fill in the outgoing interface to tell the upper layer
* to increment per-interface statistics.
*/
if (ifpp)
*ifpp = ifp;
/* Determine path MTU. */
if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
&alwaysfrag)) != 0)
goto bad;
/*
* The caller of this function may specify to use the minimum MTU
* in some cases.
* An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
* setting. The logic is a bit complicated; by default, unicast
* packets will follow path MTU while multicast packets will be sent at
* the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets
* including unicast ones will be sent at the minimum MTU. Multicast
* packets will always be sent at the minimum MTU unless
* IP6PO_MINMTU_DISABLE is explicitly specified.
* See RFC 3542 for more details.
*/
if (mtu > IPV6_MMTU) {
if ((flags & IPV6_MINMTU))
mtu = IPV6_MMTU;
else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
mtu = IPV6_MMTU;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
(opt == NULL ||
opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
mtu = IPV6_MMTU;
}
}
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
/*
* If the outgoing packet contains a hop-by-hop options header,
* it must be examined and processed even by the source node.
* (RFC 2460, section 4.)
*/
if (exthdrs.ip6e_hbh) {
struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
u_int32_t dummy; /* XXX unused */
u_int32_t plen = 0; /* XXX: ip6_process will check the value */
#ifdef DIAGNOSTIC
if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
panic("ip6e_hbh is not continuous");
#endif
/*
* XXX: if we have to send an ICMPv6 error to the sender,
* we need the M_LOOP flag since icmp6_error() expects
* that the IPv6 header and the hop-by-hop options header
* are contiguous unless the flag is set.
*/
m->m_flags |= M_LOOP;
m->m_pkthdr.rcvif = ifp;
if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
&dummy, &plen) < 0) {
/* m was already freed at this point */
error = EINVAL;/* better error? */
goto done;
}
m->m_flags &= ~M_LOOP; /* XXX */
m->m_pkthdr.rcvif = NULL;
}
/* Jump over all PFIL processing if hooks are not active. */
if (!PFIL_HOOKED(&inet6_pfil_hook))
goto passout;
odst = ip6->ip6_dst;
/* Run through list of hooks for output packets. */
error = pfil_run_hooks(&inet6_pfil_hook, &m, ifp, PFIL_OUT, inp);
if (error != 0 || m == NULL)
goto done;
ip6 = mtod(m, struct ip6_hdr *);
/* See if destination IP address was changed by packet filter. */
if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
m->m_flags |= M_SKIP_FIREWALL;
/* If destination is now ourself drop to ip6_input(). */
if (in6_localaddr(&ip6->ip6_dst)) {
if (m->m_pkthdr.rcvif == NULL)
m->m_pkthdr.rcvif = V_loif;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
m->m_pkthdr.csum_flags |=
CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
m->m_pkthdr.csum_data = 0xffff;
}
m->m_pkthdr.csum_flags |=
CSUM_IP_CHECKED | CSUM_IP_VALID;
error = netisr_queue(NETISR_IPV6, m);
goto done;
} else
goto again; /* Redo the routing table lookup. */
}
/* XXX: IPFIREWALL_FORWARD */
passout:
/*
* Send the packet to the outgoing interface.
* If necessary, do IPv6 fragmentation before sending.
*
* the logic here is rather complex:
* 1: normal case (dontfrag == 0, alwaysfrag == 0)
* 1-a: send as is if tlen <= path mtu
* 1-b: fragment if tlen > path mtu
*
* 2: if user asks us not to fragment (dontfrag == 1)
* 2-a: send as is if tlen <= interface mtu
* 2-b: error if tlen > interface mtu
*
* 3: if we always need to attach fragment header (alwaysfrag == 1)
* always fragment
*
* 4: if dontfrag == 1 && alwaysfrag == 1
* error, as we cannot handle this conflicting request
*/
tlen = m->m_pkthdr.len;
if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
dontfrag = 1;
else
dontfrag = 0;
if (dontfrag && alwaysfrag) { /* case 4 */
/* conflicting request - can't transmit */
error = EMSGSIZE;
goto bad;
}
if (dontfrag && tlen > IN6_LINKMTU(ifp)) { /* case 2-b */
/*
* Even if the DONTFRAG option is specified, we cannot send the
* packet when the data length is larger than the MTU of the
* outgoing interface.
* Notify the error by sending IPV6_PATHMTU ancillary data as
* well as returning an error code (the latter is not described
* in the API spec.)
*/
u_int32_t mtu32;
struct ip6ctlparam ip6cp;
mtu32 = (u_int32_t)mtu;
bzero(&ip6cp, sizeof(ip6cp));
ip6cp.ip6c_cmdarg = (void *)&mtu32;
pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst,
(void *)&ip6cp);
error = EMSGSIZE;
goto bad;
}
/*
* transmit packet without fragmentation
*/
if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */
struct in6_ifaddr *ia6;
ip6 = mtod(m, struct ip6_hdr *);
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/* Record statistics for this interface address. */
ia6->ia_ifa.if_opackets++;
ia6->ia_ifa.if_obytes += m->m_pkthdr.len;
}
error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
goto done;
}
/*
* try to fragment the packet. case 1-b and 3
*/
if (mtu < IPV6_MMTU) {
/* path MTU cannot be less than IPV6_MMTU */
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else if (ip6->ip6_plen == 0) {
/* jumbo payload cannot be fragmented */
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else {
struct mbuf **mnext, *m_frgpart;
struct ip6_frag *ip6f;
u_int32_t id = htonl(ip6_randomid());
u_char nextproto;
int qslots = ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len;
/*
* Too large for the destination or interface;
* fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
hlen = unfragpartlen;
if (mtu > IPV6_MAXPACKET)
mtu = IPV6_MAXPACKET;
len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
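/*
 * Worked example (illustrative): with mtu == IPV6_MMTU (1280) and a
 * bare 40-byte IPv6 header (hlen == 40), len = (1280 - 40 - 8) & ~7
 * = 1232, so every fragment but the last carries 1232 bytes of the
 * fragmentable part and is exactly 1280 bytes on the wire.
 */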
if (len < 8) {
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
}
/*
* Verify that we have any chance at all of being able to queue
* the packet or packet fragments
*/
if (qslots <= 0 || ((u_int)qslots * (mtu - hlen)
< tlen /* - hlen */)) {
error = ENOBUFS;
V_ip6stat.ip6s_odropped++;
goto bad;
}
mnext = &m->m_nextpkt;
/*
* Change the next header field of the last header in the
* unfragmentable part.
*/
if (exthdrs.ip6e_rthdr) {
nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_dest1) {
nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_hbh) {
nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
} else {
nextproto = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_FRAGMENT;
}
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto
* chain.
*/
m0 = m;
for (off = hlen; off < tlen; off += len) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (!m) {
error = ENOBUFS;
V_ip6stat.ip6s_odropped++;
goto sendorfree;
}
m->m_pkthdr.rcvif = NULL;
m->m_flags = m0->m_flags & M_COPYFLAGS;
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip6 = mtod(m, struct ip6_hdr *);
*mhip6 = *ip6;
m->m_len = sizeof(*mhip6);
error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
if (error) {
V_ip6stat.ip6s_odropped++;
goto sendorfree;
}
ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
if (off + len >= tlen)
len = tlen - off;
else
ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
mhip6->ip6_plen = htons((u_short)(len + hlen +
sizeof(*ip6f) - sizeof(struct ip6_hdr)));
if ((m_frgpart = m_copy(m0, off, len)) == 0) {
error = ENOBUFS;
V_ip6stat.ip6s_odropped++;
goto sendorfree;
}
m_cat(m, m_frgpart);
m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
m->m_pkthdr.rcvif = NULL;
ip6f->ip6f_reserved = 0;
ip6f->ip6f_ident = id;
ip6f->ip6f_nxt = nextproto;
V_ip6stat.ip6s_ofragments++;
in6_ifstat_inc(ifp, ifs6_out_fragcreat);
}
in6_ifstat_inc(ifp, ifs6_out_fragok);
}
/*
* Remove leading garbage.
*/
sendorfree:
m = m0->m_nextpkt;
m0->m_nextpkt = 0;
m_freem(m0);
for (m0 = m; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
/* Record statistics for this interface address. */
if (ia) {
ia->ia_ifa.if_opackets++;
ia->ia_ifa.if_obytes += m->m_pkthdr.len;
}
error = nd6_output(ifp, origifp, m, dst, ro->ro_rt);
} else
m_freem(m);
}
if (error == 0)
V_ip6stat.ip6s_fragmented++;
done:
if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
RTFREE(ro->ro_rt);
} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
RTFREE(ro_pmtu->ro_rt);
}
#ifdef IPSEC
if (sp != NULL)
KEY_FREESP(&sp);
#endif
return (error);
freehdrs:
m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */
m_freem(exthdrs.ip6e_dest1);
m_freem(exthdrs.ip6e_rthdr);
m_freem(exthdrs.ip6e_dest2);
/* FALLTHROUGH */
bad:
if (m)
m_freem(m);
goto done;
}
static int
ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
{
struct mbuf *m;
if (hlen > MCLBYTES)
return (ENOBUFS); /* XXX */
MGET(m, M_DONTWAIT, MT_DATA);
if (!m)
return (ENOBUFS);
if (hlen > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return (ENOBUFS);
}
}
m->m_len = hlen;
if (hdr)
bcopy(hdr, mtod(m, caddr_t), hlen);
*mp = m;
return (0);
}
/*
* Insert jumbo payload option.
*/
static int
ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
{
struct mbuf *mopt;
u_char *optbuf;
u_int32_t v;
#define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */
/*
* If there is no hop-by-hop options header, allocate new one.
* If there is one but it doesn't have enough space to store the
* jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options.
*/
if (exthdrs->ip6e_hbh == 0) {
MGET(mopt, M_DONTWAIT, MT_DATA);
if (mopt == 0)
return (ENOBUFS);
mopt->m_len = JUMBOOPTLEN;
optbuf = mtod(mopt, u_char *);
optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */
exthdrs->ip6e_hbh = mopt;
} else {
struct ip6_hbh *hbh;
mopt = exthdrs->ip6e_hbh;
if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
/*
* XXX assumption:
* - exthdrs->ip6e_hbh is not referenced from places
* other than exthdrs.
* - exthdrs->ip6e_hbh is not an mbuf chain.
*/
int oldoptlen = mopt->m_len;
struct mbuf *n;
/*
* XXX: give up if the whole (new) hbh header does
* not fit even in an mbuf cluster.
*/
if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
return (ENOBUFS);
/*
* As a consequence, we must always prepare a cluster
* at this point.
*/
MGET(n, M_DONTWAIT, MT_DATA);
if (n) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return (ENOBUFS);
n->m_len = oldoptlen + JUMBOOPTLEN;
bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
oldoptlen);
optbuf = mtod(n, caddr_t) + oldoptlen;
m_freem(mopt);
mopt = exthdrs->ip6e_hbh = n;
} else {
optbuf = mtod(mopt, u_char *) + mopt->m_len;
mopt->m_len += JUMBOOPTLEN;
}
optbuf[0] = IP6OPT_PADN;
optbuf[1] = 1;
/*
* Adjust the header length according to the pad and
* the jumbo payload option.
*/
hbh = mtod(mopt, struct ip6_hbh *);
hbh->ip6h_len += (JUMBOOPTLEN >> 3);
}
/* fill in the option. */
optbuf[2] = IP6OPT_JUMBO;
optbuf[3] = 4;
v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
bcopy(&v, &optbuf[4], sizeof(u_int32_t));
/* finally, adjust the packet header length */
exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
return (0);
#undef JUMBOOPTLEN
}
/*
* Insert fragment header and copy unfragmentable header portions.
*/
static int
ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
struct ip6_frag **frghdrp)
{
struct mbuf *n, *mlast;
if (hlen > sizeof(struct ip6_hdr)) {
n = m_copym(m0, sizeof(struct ip6_hdr),
hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
if (n == 0)
return (ENOBUFS);
m->m_next = n;
} else
n = m;
/* Search for the last mbuf of unfragmentable part. */
for (mlast = n; mlast->m_next; mlast = mlast->m_next)
;
if ((mlast->m_flags & M_EXT) == 0 &&
M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
/* use the trailing space of the last mbuf for the fragment hdr */
*frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) +
mlast->m_len);
mlast->m_len += sizeof(struct ip6_frag);
m->m_pkthdr.len += sizeof(struct ip6_frag);
} else {
/* allocate a new mbuf for the fragment header */
struct mbuf *mfrg;
MGET(mfrg, M_DONTWAIT, MT_DATA);
if (mfrg == 0)
return (ENOBUFS);
mfrg->m_len = sizeof(struct ip6_frag);
*frghdrp = mtod(mfrg, struct ip6_frag *);
mlast->m_next = mfrg;
}
return (0);
}
static int
ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
int *alwaysfragp)
{
u_int32_t mtu = 0;
int alwaysfrag = 0;
int error = 0;
if (ro_pmtu != ro) {
/* The first hop and the final destination may differ. */
struct sockaddr_in6 *sa6_dst =
(struct sockaddr_in6 *)&ro_pmtu->ro_dst;
if (ro_pmtu->ro_rt &&
((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
RTFREE(ro_pmtu->ro_rt);
ro_pmtu->ro_rt = (struct rtentry *)NULL;
}
if (ro_pmtu->ro_rt == NULL) {
bzero(sa6_dst, sizeof(*sa6_dst));
sa6_dst->sin6_family = AF_INET6;
sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
sa6_dst->sin6_addr = *dst;
rtalloc((struct route *)ro_pmtu);
}
}
if (ro_pmtu->ro_rt) {
u_int32_t ifmtu;
struct in_conninfo inc;
bzero(&inc, sizeof(inc));
inc.inc_flags = 1; /* IPv6 */
inc.inc6_faddr = *dst;
if (ifp == NULL)
ifp = ro_pmtu->ro_rt->rt_ifp;
ifmtu = IN6_LINKMTU(ifp);
mtu = tcp_hc_getmtu(&inc);
if (mtu)
mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
else
mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifmtu;
else if (mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph:
* if we record an ICMPv6 "packet too big" message with
* mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
* or smaller, with a fragment header attached.
* (The fragment header is needed regardless of the
* packet size, so that translators can identify packets.)
*/
alwaysfrag = 1;
mtu = IPV6_MMTU;
} else if (mtu > ifmtu) {
/*
* The MTU on the route is larger than the MTU on
* the interface! This shouldn't happen, unless the
* MTU of the interface has been changed after the
* interface was brought up. Change the MTU in the
* route to match the interface MTU (as long as the
* field isn't locked).
*/
mtu = ifmtu;
ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = IN6_LINKMTU(ifp);
} else
error = EHOSTUNREACH; /* XXX */
*mtup = mtu;
if (alwaysfragp)
*alwaysfragp = alwaysfrag;
return (error);
}
/*
* IP6 socket option processing.
*/
int
ip6_ctloutput(struct socket *so, struct sockopt *sopt)
{
int optdatalen, uproto;
void *optdata;
struct inpcb *in6p = sotoinpcb(so);
int error, optval;
int level, op, optname;
int optlen;
struct thread *td;
level = sopt->sopt_level;
op = sopt->sopt_dir;
optname = sopt->sopt_name;
optlen = sopt->sopt_valsize;
td = sopt->sopt_td;
error = 0;
optval = 0;
uproto = (int)so->so_proto->pr_protocol;
if (level == IPPROTO_IPV6) {
switch (op) {
case SOPT_SET:
switch (optname) {
case IPV6_2292PKTOPTIONS:
#ifdef IPV6_PKTOPTIONS
case IPV6_PKTOPTIONS:
#endif
{
struct mbuf *m;
error = soopt_getm(sopt, &m); /* XXX */
if (error != 0)
break;
error = soopt_mcopyin(sopt, m); /* XXX */
if (error != 0)
break;
error = ip6_pcbopts(&in6p->in6p_outputopts,
m, so, sopt);
m_freem(m); /* XXX */
break;
}
/*
* Use of some Hop-by-Hop options or some
* Destination options, might require special
* privilege. That is, normal applications
* (without special privilege) might be forbidden
* from setting certain options in outgoing packets,
* and might never see certain options in received
* packets. [RFC 2292 Section 6]
* KAME specific note:
* KAME prevents non-privileged users from sending or
* receiving ANY hbh/dst options in order to avoid
* overhead of parsing options in the kernel.
*/
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
if (td != NULL) {
error = priv_check(td,
PRIV_NETINET_SETHDROPTS);
if (error)
break;
}
/* FALLTHROUGH */
case IPV6_UNICAST_HOPS:
case IPV6_HOPLIMIT:
case IPV6_FAITH:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_RECVTCLASS:
case IPV6_V6ONLY:
case IPV6_AUTOFLOWLABEL:
if (optlen != sizeof(int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &optval,
sizeof optval, sizeof optval);
if (error)
break;
switch (optname) {
case IPV6_UNICAST_HOPS:
if (optval < -1 || optval >= 256)
error = EINVAL;
else {
/* -1 = kernel default */
in6p->in6p_hops = optval;
if ((in6p->in6p_vflag &
INP_IPV4) != 0)
in6p->inp_ip_ttl = optval;
}
break;
#define OPTSET(bit) \
do { \
if (optval) \
in6p->in6p_flags |= (bit); \
else \
in6p->in6p_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#define OPTSET2292(bit) \
do { \
in6p->in6p_flags |= IN6P_RFC2292; \
if (optval) \
in6p->in6p_flags |= (bit); \
else \
in6p->in6p_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#define OPTBIT(bit) (in6p->in6p_flags & (bit) ? 1 : 0)
case IPV6_RECVPKTINFO:
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_PKTINFO);
break;
case IPV6_HOPLIMIT:
{
struct ip6_pktopts **optp;
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
optp = &in6p->in6p_outputopts;
error = ip6_pcbopt(IPV6_HOPLIMIT,
(u_char *)&optval, sizeof(optval),
optp, (td != NULL) ? td->td_ucred :
NULL, uproto);
break;
}
case IPV6_RECVHOPLIMIT:
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_HOPLIMIT);
break;
case IPV6_RECVHOPOPTS:
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_RTHDRDSTOPTS);
break;
case IPV6_RECVRTHDR:
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_RTHDR);
break;
case IPV6_FAITH:
OPTSET(IN6P_FAITH);
break;
case IPV6_RECVPATHMTU:
/*
* We ignore this option for TCP
* sockets.
* (RFC3542 leaves this case
* unspecified.)
*/
if (uproto != IPPROTO_TCP)
OPTSET(IN6P_MTU);
break;
case IPV6_V6ONLY:
/*
* make setsockopt(IPV6_V6ONLY)
* available only prior to bind(2).
* see ipng mailing list, Jun 22 2001.
*/
if (in6p->in6p_lport ||
!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) {
error = EINVAL;
break;
}
OPTSET(IN6P_IPV6_V6ONLY);
if (optval)
in6p->in6p_vflag &= ~INP_IPV4;
else
in6p->in6p_vflag |= INP_IPV4;
break;
case IPV6_RECVTCLASS:
/* cannot mix with RFC2292 XXX */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
OPTSET(IN6P_TCLASS);
break;
case IPV6_AUTOFLOWLABEL:
OPTSET(IN6P_AUTOFLOWLABEL);
break;
}
break;
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
if (optlen != sizeof(optval)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &optval,
sizeof optval, sizeof optval);
if (error)
break;
{
struct ip6_pktopts **optp;
optp = &in6p->in6p_outputopts;
error = ip6_pcbopt(optname,
(u_char *)&optval, sizeof(optval),
optp, (td != NULL) ? td->td_ucred :
NULL, uproto);
break;
}
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
/* RFC 2292 */
if (optlen != sizeof(int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &optval,
sizeof optval, sizeof optval);
if (error)
break;
switch (optname) {
case IPV6_2292PKTINFO:
OPTSET2292(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
OPTSET2292(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
/*
* Check super-user privilege.
* See comments for IPV6_RECVHOPOPTS.
*/
if (td != NULL) {
error = priv_check(td,
PRIV_NETINET_SETHDROPTS);
if (error)
return (error);
}
OPTSET2292(IN6P_HOPOPTS);
break;
case IPV6_2292DSTOPTS:
if (td != NULL) {
error = priv_check(td,
PRIV_NETINET_SETHDROPTS);
if (error)
return (error);
}
OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
break;
case IPV6_2292RTHDR:
OPTSET2292(IN6P_RTHDR);
break;
}
break;
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP:
{
/* new advanced API (RFC3542) */
u_char *optbuf;
u_char optbuf_storage[MCLBYTES];
int optlen;
struct ip6_pktopts **optp;
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
/*
* We only ensure valsize is not too large
* here. Further validation will be done
* later.
*/
error = sooptcopyin(sopt, optbuf_storage,
sizeof(optbuf_storage), 0);
if (error)
break;
optlen = sopt->sopt_valsize;
optbuf = optbuf_storage;
optp = &in6p->in6p_outputopts;
error = ip6_pcbopt(optname, optbuf, optlen,
optp, (td != NULL) ? td->td_ucred : NULL,
uproto);
break;
}
#undef OPTSET
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
{
if (sopt->sopt_valsize > MLEN) {
error = EMSGSIZE;
break;
}
/* XXX */
}
/* FALLTHROUGH */
{
struct mbuf *m;
if (sopt->sopt_valsize > MCLBYTES) {
error = EMSGSIZE;
break;
}
/* XXX */
MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
if (m == 0) {
error = ENOBUFS;
break;
}
if (sopt->sopt_valsize > MLEN) {
MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
error = ENOBUFS;
break;
}
}
m->m_len = sopt->sopt_valsize;
error = sooptcopyin(sopt, mtod(m, char *),
m->m_len, m->m_len);
if (error) {
(void)m_free(m);
break;
}
error = ip6_setmoptions(sopt->sopt_name,
&in6p->in6p_moptions,
m);
(void)m_free(m);
}
break;
case IPV6_PORTRANGE:
error = sooptcopyin(sopt, &optval,
sizeof optval, sizeof optval);
if (error)
break;
switch (optval) {
case IPV6_PORTRANGE_DEFAULT:
in6p->in6p_flags &= ~(IN6P_LOWPORT);
in6p->in6p_flags &= ~(IN6P_HIGHPORT);
break;
case IPV6_PORTRANGE_HIGH:
in6p->in6p_flags &= ~(IN6P_LOWPORT);
in6p->in6p_flags |= IN6P_HIGHPORT;
break;
case IPV6_PORTRANGE_LOW:
in6p->in6p_flags &= ~(IN6P_HIGHPORT);
in6p->in6p_flags |= IN6P_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
#ifdef IPSEC
case IPV6_IPSEC_POLICY:
{
caddr_t req;
struct mbuf *m;
if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
break;
if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
break;
req = mtod(m, caddr_t);
error = ipsec6_set_policy(in6p, optname, req,
m->m_len, (sopt->sopt_td != NULL) ?
sopt->sopt_td->td_ucred : NULL);
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case SOPT_GET:
switch (optname) {
case IPV6_2292PKTOPTIONS:
#ifdef IPV6_PKTOPTIONS
case IPV6_PKTOPTIONS:
#endif
/*
* RFC3542 (effectively) deprecated the
* semantics of the 2292-style pktoptions.
* Since it was not reliable in nature (i.e.,
* applications had to expect the lack of some
* information after all), it would make sense
* to simplify this part by always returning
* empty data.
*/
sopt->sopt_valsize = 0;
break;
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
case IPV6_UNICAST_HOPS:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_FAITH:
case IPV6_V6ONLY:
case IPV6_PORTRANGE:
case IPV6_RECVTCLASS:
case IPV6_AUTOFLOWLABEL:
switch (optname) {
case IPV6_RECVHOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
optval = OPTBIT(IN6P_RTHDRDSTOPTS);
break;
case IPV6_UNICAST_HOPS:
optval = in6p->in6p_hops;
break;
case IPV6_RECVPKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_RECVHOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_RECVRTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_RECVPATHMTU:
optval = OPTBIT(IN6P_MTU);
break;
case IPV6_FAITH:
optval = OPTBIT(IN6P_FAITH);
break;
case IPV6_V6ONLY:
optval = OPTBIT(IN6P_IPV6_V6ONLY);
break;
case IPV6_PORTRANGE:
{
int flags;
flags = in6p->in6p_flags;
if (flags & IN6P_HIGHPORT)
optval = IPV6_PORTRANGE_HIGH;
else if (flags & IN6P_LOWPORT)
optval = IPV6_PORTRANGE_LOW;
else
optval = 0;
break;
}
case IPV6_RECVTCLASS:
optval = OPTBIT(IN6P_TCLASS);
break;
case IPV6_AUTOFLOWLABEL:
optval = OPTBIT(IN6P_AUTOFLOWLABEL);
break;
}
if (error)
break;
error = sooptcopyout(sopt, &optval,
sizeof optval);
break;
case IPV6_PATHMTU:
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
struct route_in6 sro;
bzero(&sro, sizeof(sro));
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
/*
* XXX: we do not consider the case of source
* routing, or optional information to specify
* the outgoing interface.
*/
error = ip6_getpmtu(&sro, NULL, NULL,
&in6p->in6p_faddr, &pmtu, NULL);
if (sro.ro_rt)
RTFREE(sro.ro_rt);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
pmtu = IPV6_MAXPACKET;
bzero(&mtuinfo, sizeof(mtuinfo));
mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
optdata = (void *)&mtuinfo;
optdatalen = sizeof(mtuinfo);
error = sooptcopyout(sopt, optdata,
optdatalen);
break;
}
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292RTHDR:
case IPV6_2292DSTOPTS:
switch (optname) {
case IPV6_2292PKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_2292RTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_2292DSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
break;
}
error = sooptcopyout(sopt, &optval,
sizeof optval);
break;
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP:
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = ip6_getpcbopt(in6p->in6p_outputopts,
optname, sopt);
break;
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
{
struct mbuf *m;
error = ip6_getmoptions(sopt->sopt_name,
in6p->in6p_moptions, &m);
if (error == 0)
error = sooptcopyout(sopt,
mtod(m, char *), m->m_len);
m_freem(m);
}
break;
#ifdef IPSEC
case IPV6_IPSEC_POLICY:
{
caddr_t req = NULL;
size_t len = 0;
struct mbuf *m = NULL;
struct mbuf **mp = &m;
size_t ovalsize = sopt->sopt_valsize;
caddr_t oval = (caddr_t)sopt->sopt_val;
error = soopt_getm(sopt, &m); /* XXX */
if (error != 0)
break;
error = soopt_mcopyin(sopt, m); /* XXX */
if (error != 0)
break;
sopt->sopt_valsize = ovalsize;
sopt->sopt_val = oval;
if (m) {
req = mtod(m, caddr_t);
len = m->m_len;
}
error = ipsec6_get_policy(in6p, req, len, mp);
if (error == 0)
error = soopt_mcopyout(sopt, m); /* XXX */
if (error == 0 && m)
m_freem(m);
break;
}
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
} else { /* level != IPPROTO_IPV6 */
error = EINVAL;
}
return (error);
}
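/*
 * Userland sketch (illustrative, not part of this file): the options
 * handled above are driven through setsockopt(2)/getsockopt(2) at
 * level IPPROTO_IPV6, e.g.
 *
 *	int on = 1;
 *	setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on, sizeof(on));
 *	setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
 *
 * where, as enforced above, IPV6_V6ONLY must be set before bind(2).
 */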
int
ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
{
int error = 0, optval, optlen;
const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
struct in6pcb *in6p = sotoin6pcb(so);
int level, op, optname;
level = sopt->sopt_level;
op = sopt->sopt_dir;
optname = sopt->sopt_name;
optlen = sopt->sopt_valsize;
if (level != IPPROTO_IPV6) {
return (EINVAL);
}
switch (optname) {
case IPV6_CHECKSUM:
/*
* For ICMPv6 sockets, no modification allowed for checksum
* offset, permit "no change" values to help existing apps.
*
* RFC3542 says: "An attempt to set IPV6_CHECKSUM
* for an ICMPv6 socket will fail."
* The current behavior does not meet RFC3542.
*/
switch (op) {
case SOPT_SET:
if (optlen != sizeof(int)) {
error = EINVAL;
break;
}
error = sooptcopyin(sopt, &optval, sizeof(optval),
sizeof(optval));
if (error)
break;
if ((optval % 2) != 0) {
/* the API assumes even offset values */
error = EINVAL;
} else if (so->so_proto->pr_protocol ==
IPPROTO_ICMPV6) {
if (optval != icmp6off)
error = EINVAL;
} else
in6p->in6p_cksum = optval;
break;
case SOPT_GET:
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
optval = icmp6off;
else
optval = in6p->in6p_cksum;
error = sooptcopyout(sopt, &optval, sizeof(optval));
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
return (error);
}
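/*
 * Userland sketch (illustrative): a raw IPv6 socket may ask the
 * kernel to maintain a checksum at a given even offset into its
 * payload, e.g. for a hypothetical protocol keeping it at offset 2:
 *
 *	int off = 2;
 *	setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM, &off, sizeof(off));
 *
 * For ICMPv6 sockets the offset is pinned to icmp6_cksum, as checked
 * above.
 */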
/*
* Set up IP6 options in pcb for insertion in output packets or
* specifying behavior of outgoing packets.
*/
static int
ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
struct socket *so, struct sockopt *sopt)
{
struct ip6_pktopts *opt = *pktopt;
int error = 0;
struct thread *td = sopt->sopt_td;
/* turn off any old options. */
if (opt) {
#ifdef DIAGNOSTIC
if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
opt->ip6po_rhinfo.ip6po_rhi_rthdr)
printf("ip6_pcbopts: all specified options are cleared.\n");
#endif
ip6_clearpktopts(opt, -1);
} else
opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK);
*pktopt = NULL;
if (!m || m->m_len == 0) {
/*
* We are only turning off any previous options; it does not
* matter whether opt was just allocated or handed in.
*/
free(opt, M_IP6OPT);
return (0);
}
/* set options specified by user. */
if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
ip6_clearpktopts(opt, -1); /* XXX: discard all options */
free(opt, M_IP6OPT);
return (error);
}
*pktopt = opt;
return (0);
}
/*
* initialize ip6_pktopts. beware that there are non-zero default values in
* the struct.
*/
void
ip6_initpktopts(struct ip6_pktopts *opt)
{
bzero(opt, sizeof(*opt));
opt->ip6po_hlim = -1; /* -1 means default hop limit */
opt->ip6po_tclass = -1; /* -1 means default traffic class */
opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
}
static int
ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
struct ucred *cred, int uproto)
{
struct ip6_pktopts *opt;
if (*pktopt == NULL) {
*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
M_WAITOK);
ip6_initpktopts(*pktopt);
}
opt = *pktopt;
return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
}
static int
ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
{
void *optdata = NULL;
int optdatalen = 0;
struct ip6_ext *ip6e;
int error = 0;
struct in6_pktinfo null_pktinfo;
int deftclass = 0, on;
int defminmtu = IP6PO_MINMTU_MCASTONLY;
int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
switch (optname) {
case IPV6_PKTINFO:
if (pktopt && pktopt->ip6po_pktinfo)
optdata = (void *)pktopt->ip6po_pktinfo;
else {
/* XXX: we don't have to do this every time... */
bzero(&null_pktinfo, sizeof(null_pktinfo));
optdata = (void *)&null_pktinfo;
}
optdatalen = sizeof(struct in6_pktinfo);
break;
case IPV6_TCLASS:
if (pktopt && pktopt->ip6po_tclass >= 0)
optdata = (void *)&pktopt->ip6po_tclass;
else
optdata = (void *)&deftclass;
optdatalen = sizeof(int);
break;
case IPV6_HOPOPTS:
if (pktopt && pktopt->ip6po_hbh) {
optdata = (void *)pktopt->ip6po_hbh;
ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDR:
if (pktopt && pktopt->ip6po_rthdr) {
optdata = (void *)pktopt->ip6po_rthdr;
ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDRDSTOPTS:
if (pktopt && pktopt->ip6po_dest1) {
optdata = (void *)pktopt->ip6po_dest1;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_DSTOPTS:
if (pktopt && pktopt->ip6po_dest2) {
optdata = (void *)pktopt->ip6po_dest2;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_NEXTHOP:
if (pktopt && pktopt->ip6po_nexthop) {
optdata = (void *)pktopt->ip6po_nexthop;
optdatalen = pktopt->ip6po_nexthop->sa_len;
}
break;
case IPV6_USE_MIN_MTU:
if (pktopt)
optdata = (void *)&pktopt->ip6po_minmtu;
else
optdata = (void *)&defminmtu;
optdatalen = sizeof(int);
break;
case IPV6_DONTFRAG:
if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
on = 1;
else
on = 0;
optdata = (void *)&on;
optdatalen = sizeof(on);
break;
case IPV6_PREFER_TEMPADDR:
if (pktopt)
optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
else
optdata = (void *)&defpreftemp;
optdatalen = sizeof(int);
break;
default: /* should not happen */
#ifdef DIAGNOSTIC
panic("ip6_getpcbopt: unexpected option\n");
#endif
return (ENOPROTOOPT);
}
error = sooptcopyout(sopt, optdata, optdatalen);
return (error);
}
void
ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
{
if (pktopt == NULL)
return;
if (optname == -1 || optname == IPV6_PKTINFO) {
if (pktopt->ip6po_pktinfo)
free(pktopt->ip6po_pktinfo, M_IP6OPT);
pktopt->ip6po_pktinfo = NULL;
}
if (optname == -1 || optname == IPV6_HOPLIMIT)
pktopt->ip6po_hlim = -1;
if (optname == -1 || optname == IPV6_TCLASS)
pktopt->ip6po_tclass = -1;
if (optname == -1 || optname == IPV6_NEXTHOP) {
if (pktopt->ip6po_nextroute.ro_rt) {
RTFREE(pktopt->ip6po_nextroute.ro_rt);
pktopt->ip6po_nextroute.ro_rt = NULL;
}
if (pktopt->ip6po_nexthop)
free(pktopt->ip6po_nexthop, M_IP6OPT);
pktopt->ip6po_nexthop = NULL;
}
if (optname == -1 || optname == IPV6_HOPOPTS) {
if (pktopt->ip6po_hbh)
free(pktopt->ip6po_hbh, M_IP6OPT);
pktopt->ip6po_hbh = NULL;
}
if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
if (pktopt->ip6po_dest1)
free(pktopt->ip6po_dest1, M_IP6OPT);
pktopt->ip6po_dest1 = NULL;
}
if (optname == -1 || optname == IPV6_RTHDR) {
if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
if (pktopt->ip6po_route.ro_rt) {
RTFREE(pktopt->ip6po_route.ro_rt);
pktopt->ip6po_route.ro_rt = NULL;
}
}
if (optname == -1 || optname == IPV6_DSTOPTS) {
if (pktopt->ip6po_dest2)
free(pktopt->ip6po_dest2, M_IP6OPT);
pktopt->ip6po_dest2 = NULL;
}
}
#define PKTOPT_EXTHDRCPY(type) \
do {\
if (src->type) {\
int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
dst->type = malloc(hlen, M_IP6OPT, canwait);\
if (dst->type == NULL && canwait == M_NOWAIT)\
goto bad;\
bcopy(src->type, dst->type, hlen);\
}\
} while (/*CONSTCOND*/ 0)
static int
copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
{
if (dst == NULL || src == NULL) {
printf("ip6_clearpktopts: invalid argument\n");
return (EINVAL);
}
dst->ip6po_hlim = src->ip6po_hlim;
dst->ip6po_tclass = src->ip6po_tclass;
dst->ip6po_flags = src->ip6po_flags;
if (src->ip6po_pktinfo) {
dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
M_IP6OPT, canwait);
if (dst->ip6po_pktinfo == NULL)
goto bad;
*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
}
if (src->ip6po_nexthop) {
dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
M_IP6OPT, canwait);
if (dst->ip6po_nexthop == NULL)
goto bad;
bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
src->ip6po_nexthop->sa_len);
}
PKTOPT_EXTHDRCPY(ip6po_hbh);
PKTOPT_EXTHDRCPY(ip6po_dest1);
PKTOPT_EXTHDRCPY(ip6po_dest2);
PKTOPT_EXTHDRCPY(ip6po_rthdr); /* does not copy the cached route */
return (0);
bad:
ip6_clearpktopts(dst, -1);
return (ENOBUFS);
}
#undef PKTOPT_EXTHDRCPY
struct ip6_pktopts *
ip6_copypktopts(struct ip6_pktopts *src, int canwait)
{
int error;
struct ip6_pktopts *dst;
dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
if (dst == NULL)
return (NULL);
ip6_initpktopts(dst);
if ((error = copypktopts(dst, src, canwait)) != 0) {
free(dst, M_IP6OPT);
return (NULL);
}
return (dst);
}
void
ip6_freepcbopts(struct ip6_pktopts *pktopt)
{
if (pktopt == NULL)
return;
ip6_clearpktopts(pktopt, -1);
free(pktopt, M_IP6OPT);
}
/*
* Set the IP6 multicast options in response to user setsockopt().
*/
static int
ip6_setmoptions(int optname, struct ip6_moptions **im6op, struct mbuf *m)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
int error = 0;
u_int loop, ifindex;
struct ipv6_mreq *mreq;
struct ifnet *ifp;
struct ip6_moptions *im6o = *im6op;
struct route_in6 ro;
struct in6_multi_mship *imm;
if (im6o == NULL) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
im6o = (struct ip6_moptions *)
malloc(sizeof(*im6o), M_IP6MOPTS, M_WAITOK);
if (im6o == NULL)
return (ENOBUFS);
*im6op = im6o;
im6o->im6o_multicast_ifp = NULL;
im6o->im6o_multicast_hlim = V_ip6_defmcasthlim;
im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
LIST_INIT(&im6o->im6o_memberships);
}
switch (optname) {
case IPV6_MULTICAST_IF:
/*
* Select the interface for outgoing multicast packets.
*/
if (m == NULL || m->m_len != sizeof(u_int)) {
error = EINVAL;
break;
}
bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex));
if (ifindex < 0 || V_if_index < ifindex) {
error = ENXIO; /* XXX EINVAL? */
break;
}
ifp = ifnet_byindex(ifindex);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
break;
}
im6o->im6o_multicast_ifp = ifp;
break;
case IPV6_MULTICAST_HOPS:
{
/*
* Set the IP6 hoplimit for outgoing multicast packets.
*/
int optval;
if (m == NULL || m->m_len != sizeof(int)) {
error = EINVAL;
break;
}
bcopy(mtod(m, u_int *), &optval, sizeof(optval));
if (optval < -1 || optval >= 256)
error = EINVAL;
else if (optval == -1)
im6o->im6o_multicast_hlim = V_ip6_defmcasthlim;
else
im6o->im6o_multicast_hlim = optval;
break;
}
case IPV6_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
if (m == NULL || m->m_len != sizeof(u_int)) {
error = EINVAL;
break;
}
bcopy(mtod(m, u_int *), &loop, sizeof(loop));
if (loop > 1) {
error = EINVAL;
break;
}
im6o->im6o_multicast_loop = loop;
break;
case IPV6_JOIN_GROUP:
/*
* Add a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) {
error = EINVAL;
break;
}
mreq = mtod(m, struct ipv6_mreq *);
if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) {
/*
* We use the unspecified address to specify to accept
* all multicast addresses. Only super user is allowed
* to do this.
*/
/* XXX-BZ might need a better PRIV_NETINET_x for this */
error = priv_check(curthread, PRIV_NETINET_MROUTE);
if (error)
break;
} else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) {
error = EINVAL;
break;
}
/*
* If no interface was explicitly specified, choose an
* appropriate one according to the given multicast address.
*/
if (mreq->ipv6mr_interface == 0) {
struct sockaddr_in6 *dst;
/*
* Look up the routing table for the
* address, and choose the outgoing interface.
* XXX: is it a good approach?
*/
ro.ro_rt = NULL;
dst = (struct sockaddr_in6 *)&ro.ro_dst;
bzero(dst, sizeof(*dst));
dst->sin6_family = AF_INET6;
dst->sin6_len = sizeof(*dst);
dst->sin6_addr = mreq->ipv6mr_multiaddr;
rtalloc((struct route *)&ro);
if (ro.ro_rt == NULL) {
error = EADDRNOTAVAIL;
break;
}
ifp = ro.ro_rt->rt_ifp;
RTFREE(ro.ro_rt);
} else {
/*
* If the interface is specified, validate it.
*/
if (mreq->ipv6mr_interface < 0 ||
V_if_index < mreq->ipv6mr_interface) {
error = ENXIO; /* XXX EINVAL? */
break;
}
ifp = ifnet_byindex(mreq->ipv6mr_interface);
if (!ifp) {
error = ENXIO; /* XXX EINVAL? */
break;
}
}
/*
* See if we found an interface, and confirm that it
* supports multicast
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
break;
}
if (in6_setscope(&mreq->ipv6mr_multiaddr, ifp, NULL)) {
error = EADDRNOTAVAIL; /* XXX: should not happen */
break;
}
/*
* See if the membership already exists.
*/
for (imm = im6o->im6o_memberships.lh_first;
imm != NULL; imm = imm->i6mm_chain.le_next)
if (imm->i6mm_maddr->in6m_ifp == ifp &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq->ipv6mr_multiaddr))
break;
if (imm != NULL) {
error = EADDRINUSE;
break;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imm = in6_joingroup(ifp, &mreq->ipv6mr_multiaddr, &error, 0);
if (imm == NULL)
break;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
break;
case IPV6_LEAVE_GROUP:
/*
* Drop a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) {
error = EINVAL;
break;
}
mreq = mtod(m, struct ipv6_mreq *);
/*
* If an interface address was specified, get a pointer
* to its ifnet structure.
*/
if (mreq->ipv6mr_interface < 0 ||
V_if_index < mreq->ipv6mr_interface) {
error = ENXIO; /* XXX EINVAL? */
break;
}
if (mreq->ipv6mr_interface == 0)
ifp = NULL;
else
ifp = ifnet_byindex(mreq->ipv6mr_interface);
/* Fill in the scope zone ID */
if (ifp) {
if (in6_setscope(&mreq->ipv6mr_multiaddr, ifp, NULL)) {
/* XXX: should not happen */
error = EADDRNOTAVAIL;
break;
}
} else if (mreq->ipv6mr_interface != 0) {
/*
* This case happens when the (positive) index is in
* the valid range, but the corresponding interface has
* been detached dynamically (XXX).
*/
error = EADDRNOTAVAIL;
break;
} else { /* ipv6mr_interface == 0 */
struct sockaddr_in6 sa6_mc;
/*
* The API spec says as follows:
* If the interface index is specified as 0, the
* system may choose a multicast group membership to
* drop by matching the multicast address only.
* On the other hand, we cannot disambiguate the scope
* zone unless an interface is provided. Thus, we
* check if there's ambiguity with the default scope
* zone as the last resort.
*/
bzero(&sa6_mc, sizeof(sa6_mc));
sa6_mc.sin6_family = AF_INET6;
sa6_mc.sin6_len = sizeof(sa6_mc);
sa6_mc.sin6_addr = mreq->ipv6mr_multiaddr;
error = sa6_embedscope(&sa6_mc, V_ip6_use_defzone);
if (error != 0)
break;
mreq->ipv6mr_multiaddr = sa6_mc.sin6_addr;
}
/*
* Find the membership in the membership list.
*/
for (imm = im6o->im6o_memberships.lh_first;
imm != NULL; imm = imm->i6mm_chain.le_next) {
if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq->ipv6mr_multiaddr))
break;
}
if (imm == NULL) {
/* Unable to resolve interface */
error = EADDRNOTAVAIL;
break;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
LIST_REMOVE(imm, i6mm_chain);
in6_delmulti(imm->i6mm_maddr);
free(imm, M_IP6MADDR);
break;
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the structure.
*/
if (im6o->im6o_multicast_ifp == NULL &&
im6o->im6o_multicast_hlim == V_ip6_defmcasthlim &&
im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP &&
im6o->im6o_memberships.lh_first == NULL) {
free(*im6op, M_IP6MOPTS);
*im6op = NULL;
}
return (error);
}
/*
* Return the IP6 multicast options in response to user getsockopt().
*/
static int
ip6_getmoptions(int optname, struct ip6_moptions *im6o, struct mbuf **mp)
{
+ INIT_VNET_INET6(curvnet);
u_int *hlim, *loop, *ifindex;
*mp = m_get(M_WAIT, MT_HEADER); /* XXX */
switch (optname) {
case IPV6_MULTICAST_IF:
ifindex = mtod(*mp, u_int *);
(*mp)->m_len = sizeof(u_int);
if (im6o == NULL || im6o->im6o_multicast_ifp == NULL)
*ifindex = 0;
else
*ifindex = im6o->im6o_multicast_ifp->if_index;
return (0);
case IPV6_MULTICAST_HOPS:
hlim = mtod(*mp, u_int *);
(*mp)->m_len = sizeof(u_int);
if (im6o == NULL)
*hlim = V_ip6_defmcasthlim;
else
*hlim = im6o->im6o_multicast_hlim;
return (0);
case IPV6_MULTICAST_LOOP:
loop = mtod(*mp, u_int *);
(*mp)->m_len = sizeof(u_int);
if (im6o == NULL)
*loop = IPV6_DEFAULT_MULTICAST_LOOP;
else
*loop = im6o->im6o_multicast_loop;
return (0);
default:
return (EOPNOTSUPP);
}
}
/*
* Discard the IP6 multicast options.
*/
void
ip6_freemoptions(struct ip6_moptions *im6o)
{
struct in6_multi_mship *imm;
if (im6o == NULL)
return;
while ((imm = im6o->im6o_memberships.lh_first) != NULL) {
LIST_REMOVE(imm, i6mm_chain);
if (imm->i6mm_maddr)
in6_delmulti(imm->i6mm_maddr);
free(imm, M_IP6MADDR);
}
free(im6o, M_IP6MOPTS);
}
/*
* Set IPv6 outgoing packet options based on advanced API.
*/
int
ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
{
struct cmsghdr *cm = NULL;
if (control == NULL || opt == NULL)
return (EINVAL);
ip6_initpktopts(opt);
if (stickyopt) {
int error;
/*
* If stickyopt is provided, make a local copy of the options
* for this particular packet, then override them by ancillary
* objects.
* XXX: copypktopts() does not copy the cached route to a next
* hop (if any). This is not very good in terms of efficiency,
* but we can allow this since this option should be rarely
* used.
*/
if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
return (error);
}
/*
* XXX: Currently, we assume all the optional information is stored
* in a single mbuf.
*/
if (control->m_next)
return (EINVAL);
for (; control->m_len; control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
int error;
if (control->m_len < CMSG_LEN(0))
return (EINVAL);
cm = mtod(control, struct cmsghdr *);
if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
return (EINVAL);
if (cm->cmsg_level != IPPROTO_IPV6)
continue;
error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
if (error)
return (error);
}
return (0);
}
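/*
 * Illustrative sketch, not part of this source: the loop above walks the
 * cmsghdr records packed into a single mbuf by hand, stepping with
 * CMSG_ALIGN(cmsg_len) exactly as the standard CMSG_NXTHDR() macro does.
 * A userland consumer of the same ancillary data would look roughly like
 * this (walk_ancillary() and the option handled are assumptions for
 * illustration only):
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static void
walk_ancillary(struct msghdr *msg)
{
	struct cmsghdr *cm;
	int hlim;

	for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm)) {
		if (cm->cmsg_level != IPPROTO_IPV6)
			continue;	/* same filter as the kernel loop */
		if (cm->cmsg_type == IPV6_HOPLIMIT &&
		    cm->cmsg_len == CMSG_LEN(sizeof(int))) {
			memcpy(&hlim, CMSG_DATA(cm), sizeof(hlim));
			/* ... apply hlim ... */
		}
	}
}
#endif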
/*
* Set a particular packet option, as a sticky option or an ancillary data
* item. "len" can be 0 only when it's a sticky option.
* We have 4 cases of combination of "sticky" and "cmsg":
* "sticky=0, cmsg=0": impossible
* "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
* "sticky=1, cmsg=0": RFC3542 socket option
* "sticky=1, cmsg=1": RFC2292 socket option
*/
static int
ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
struct ucred *cred, int sticky, int cmsg, int uproto)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
int minmtupolicy, preftemp;
int error;
if (!sticky && !cmsg) {
#ifdef DIAGNOSTIC
printf("ip6_setpktopt: impossible case\n");
#endif
return (EINVAL);
}
/*
* IPV6_2292xxx is for backward compatibility with RFC2292, and should
* not be specified in the context of RFC3542. Conversely,
* RFC3542 types should not be specified in the context of RFC2292.
*/
if (!cmsg) {
switch (optname) {
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292NEXTHOP:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
case IPV6_2292PKTOPTIONS:
return (ENOPROTOOPT);
}
}
if (sticky && cmsg) {
switch (optname) {
case IPV6_PKTINFO:
case IPV6_HOPLIMIT:
case IPV6_NEXTHOP:
case IPV6_HOPOPTS:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_USE_MIN_MTU:
case IPV6_DONTFRAG:
case IPV6_TCLASS:
case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
return (ENOPROTOOPT);
}
}
switch (optname) {
case IPV6_2292PKTINFO:
case IPV6_PKTINFO:
{
struct ifnet *ifp = NULL;
struct in6_pktinfo *pktinfo;
if (len != sizeof(struct in6_pktinfo))
return (EINVAL);
pktinfo = (struct in6_pktinfo *)buf;
/*
* An application can clear any sticky IPV6_PKTINFO option by
* doing a "regular" setsockopt with ipi6_addr being
* in6addr_any and ipi6_ifindex being zero.
* [RFC 3542, Section 6]
*/
if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
pktinfo->ipi6_ifindex == 0 &&
IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
ip6_clearpktopts(opt, optname);
break;
}
if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
return (EINVAL);
}
/* validate the interface index if specified. */
if (pktinfo->ipi6_ifindex > V_if_index ||
pktinfo->ipi6_ifindex < 0) {
return (ENXIO);
}
if (pktinfo->ipi6_ifindex) {
ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
if (ifp == NULL)
return (ENXIO);
}
/*
* We store the address anyway, and let in6_selectsrc()
* validate the specified address. This is because ipi6_addr
* may not have enough information about its scope zone, and
* we may need additional information (such as outgoing
* interface or the scope zone of a destination address) to
* disambiguate the scope.
* XXX: the delay of the validation may confuse the
* application when it is used as a sticky option.
*/
if (opt->ip6po_pktinfo == NULL) {
opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
M_IP6OPT, M_NOWAIT);
if (opt->ip6po_pktinfo == NULL)
return (ENOBUFS);
}
bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
break;
}
case IPV6_2292HOPLIMIT:
case IPV6_HOPLIMIT:
{
int *hlimp;
/*
* RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
* to simplify the ordering among hoplimit options.
*/
if (optname == IPV6_HOPLIMIT && sticky)
return (ENOPROTOOPT);
if (len != sizeof(int))
return (EINVAL);
hlimp = (int *)buf;
if (*hlimp < -1 || *hlimp > 255)
return (EINVAL);
opt->ip6po_hlim = *hlimp;
break;
}
case IPV6_TCLASS:
{
int tclass;
if (len != sizeof(int))
return (EINVAL);
tclass = *(int *)buf;
if (tclass < -1 || tclass > 255)
return (EINVAL);
opt->ip6po_tclass = tclass;
break;
}
case IPV6_2292NEXTHOP:
case IPV6_NEXTHOP:
if (cred != NULL) {
error = priv_check_cred(cred,
PRIV_NETINET_SETHDROPTS, 0);
if (error)
return (error);
}
if (len == 0) { /* just remove the option */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
break;
}
/* check if cmsg_len is large enough for sa_len */
if (len < sizeof(struct sockaddr) || len < *buf)
return (EINVAL);
switch (((struct sockaddr *)buf)->sa_family) {
case AF_INET6:
{
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
int error;
if (sa6->sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
return (EINVAL);
}
if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
!= 0) {
return (error);
}
break;
}
case AF_LINK: /* should eventually be supported */
default:
return (EAFNOSUPPORT);
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_nexthop == NULL)
return (ENOBUFS);
bcopy(buf, opt->ip6po_nexthop, *buf);
break;
case IPV6_2292HOPOPTS:
case IPV6_HOPOPTS:
{
struct ip6_hbh *hbh;
int hbhlen;
/*
* XXX: We don't allow a non-privileged user to set ANY HbH
* options, since per-option restriction has too much
* overhead.
*/
if (cred != NULL) {
error = priv_check_cred(cred,
PRIV_NETINET_SETHDROPTS, 0);
if (error)
return (error);
}
if (len == 0) {
ip6_clearpktopts(opt, IPV6_HOPOPTS);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_hbh))
return (EINVAL);
hbh = (struct ip6_hbh *)buf;
hbhlen = (hbh->ip6h_len + 1) << 3;
if (len != hbhlen)
return (EINVAL);
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_HOPOPTS);
opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_hbh == NULL)
return (ENOBUFS);
bcopy(hbh, opt->ip6po_hbh, hbhlen);
break;
}
case IPV6_2292DSTOPTS:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
{
struct ip6_dest *dest, **newdest = NULL;
int destlen;
if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
error = priv_check_cred(cred,
PRIV_NETINET_SETHDROPTS, 0);
if (error)
return (error);
}
if (len == 0) {
ip6_clearpktopts(opt, optname);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_dest))
return (EINVAL);
dest = (struct ip6_dest *)buf;
destlen = (dest->ip6d_len + 1) << 3;
if (len != destlen)
return (EINVAL);
/*
* Determine the position that the destination options header
* should be inserted; before or after the routing header.
*/
switch (optname) {
case IPV6_2292DSTOPTS:
/*
* The old advanced API is ambiguous on this point.
* Our approach is to determine the position
* according to the existence of a routing header.
* Note, however, that this depends on the order of the
* extension headers in the ancillary data; the 1st
* part of the destination options header must appear
* before the routing header in the ancillary data,
* too.
* RFC3542 solved the ambiguity by introducing
* separate ancillary data or option types.
*/
if (opt->ip6po_rthdr == NULL)
newdest = &opt->ip6po_dest1;
else
newdest = &opt->ip6po_dest2;
break;
case IPV6_RTHDRDSTOPTS:
newdest = &opt->ip6po_dest1;
break;
case IPV6_DSTOPTS:
newdest = &opt->ip6po_dest2;
break;
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, optname);
*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
if (*newdest == NULL)
return (ENOBUFS);
bcopy(dest, *newdest, destlen);
break;
}
case IPV6_2292RTHDR:
case IPV6_RTHDR:
{
struct ip6_rthdr *rth;
int rthlen;
if (len == 0) {
ip6_clearpktopts(opt, IPV6_RTHDR);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_rthdr))
return (EINVAL);
rth = (struct ip6_rthdr *)buf;
rthlen = (rth->ip6r_len + 1) << 3;
if (len != rthlen)
return (EINVAL);
switch (rth->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
if (rth->ip6r_len == 0) /* must contain one addr */
return (EINVAL);
if (rth->ip6r_len % 2) /* length must be even */
return (EINVAL);
if (rth->ip6r_len / 2 != rth->ip6r_segleft)
return (EINVAL);
break;
default:
return (EINVAL); /* not supported */
}
/* turn off the previous option */
ip6_clearpktopts(opt, IPV6_RTHDR);
opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_rthdr == NULL)
return (ENOBUFS);
bcopy(rth, opt->ip6po_rthdr, rthlen);
break;
}
case IPV6_USE_MIN_MTU:
if (len != sizeof(int))
return (EINVAL);
minmtupolicy = *(int *)buf;
if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
minmtupolicy != IP6PO_MINMTU_DISABLE &&
minmtupolicy != IP6PO_MINMTU_ALL) {
return (EINVAL);
}
opt->ip6po_minmtu = minmtupolicy;
break;
case IPV6_DONTFRAG:
if (len != sizeof(int))
return (EINVAL);
if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
/*
* we ignore this option for TCP sockets.
* (RFC3542 leaves this case unspecified.)
*/
opt->ip6po_flags &= ~IP6PO_DONTFRAG;
} else
opt->ip6po_flags |= IP6PO_DONTFRAG;
break;
case IPV6_PREFER_TEMPADDR:
if (len != sizeof(int))
return (EINVAL);
preftemp = *(int *)buf;
if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
preftemp != IP6PO_TEMPADDR_NOTPREFER &&
preftemp != IP6PO_TEMPADDR_PREFER) {
return (EINVAL);
}
opt->ip6po_prefer_tempaddr = preftemp;
break;
default:
return (ENOPROTOOPT);
} /* end of switch */
return (0);
}
/*
* Routine called from ip6_output() to loop back a copy of an IP6 multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be &loif -- easier than replicating that code here.
*/
void
ip6_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in6 *dst)
{
struct mbuf *copym;
struct ip6_hdr *ip6;
copym = m_copy(m, 0, M_COPYALL);
if (copym == NULL)
return;
/*
* Make sure to deep-copy the IPv6 header portion in case the data
* is in an mbuf cluster, so that we can safely modify the header
* later.
*/
if ((copym->m_flags & M_EXT) != 0 ||
copym->m_len < sizeof(struct ip6_hdr)) {
copym = m_pullup(copym, sizeof(struct ip6_hdr));
if (copym == NULL)
return;
}
#ifdef DIAGNOSTIC
if (copym->m_len < sizeof(*ip6)) {
m_freem(copym);
return;
}
#endif
ip6 = mtod(copym, struct ip6_hdr *);
/*
* Clear embedded scope identifiers if necessary;
* in6_clearscope() will touch the addresses only when needed.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
(void)if_simloop(ifp, copym, dst->sin6_family, 0);
}
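/*
 * Illustrative sketch, not part of this source: the KAME stack embeds the
 * scope zone id in the second 16-bit word of scoped (e.g. link-local)
 * addresses while packets travel inside the kernel; in6_clearscope() above
 * zeroes that word so the looped-back copy carries a valid on-wire address.
 * Schematically (clear_embedded_zone() is a name made up for illustration):
 */
#if 0
#include <netinet/in.h>

static void
clear_embedded_zone(struct in6_addr *a)
{
	/* fe80:<zone>::x -> fe80:0000::x */
	a->s6_addr[2] = 0;
	a->s6_addr[3] = 0;
}
#endif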
/*
* Chop IPv6 header off from the payload.
*/
static int
ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
{
struct mbuf *mh;
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
if (m->m_len > sizeof(*ip6)) {
MGETHDR(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL) {
m_freem(m);
return ENOBUFS;
}
M_MOVE_PKTHDR(mh, m);
MH_ALIGN(mh, sizeof(*ip6));
m->m_len -= sizeof(*ip6);
m->m_data += sizeof(*ip6);
mh->m_next = m;
m = mh;
m->m_len = sizeof(*ip6);
bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
}
exthdrs->ip6e_ip6 = m;
return 0;
}
/*
* Compute IPv6 extension header length.
*/
int
ip6_optlen(struct in6pcb *in6p)
{
int len;
if (!in6p->in6p_outputopts)
return 0;
len = 0;
#define elen(x) \
(((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
len += elen(in6p->in6p_outputopts->ip6po_hbh);
if (in6p->in6p_outputopts->ip6po_rthdr)
/* dest1 is valid with rthdr only */
len += elen(in6p->in6p_outputopts->ip6po_dest1);
len += elen(in6p->in6p_outputopts->ip6po_rthdr);
len += elen(in6p->in6p_outputopts->ip6po_dest2);
return len;
#undef elen
}
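/*
 * Illustrative sketch, not part of this source: every IPv6 extension header
 * carries its length in units of 8 octets, not counting the first 8 octets,
 * so the on-wire size in bytes is (ip6e_len + 1) << 3 -- the arithmetic the
 * elen() macro above applies per header (ext_hdr_bytes() is a made-up name):
 */
#if 0
#include <netinet/ip6.h>

static int
ext_hdr_bytes(const struct ip6_ext *ext)
{
	/* ip6e_len == 0 still means 8 bytes of header on the wire */
	return ((ext->ip6e_len + 1) << 3);
}
#endif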
Index: head/sys/netinet6/mld6.c
===================================================================
--- head/sys/netinet6/mld6.c (revision 183549)
+++ head/sys/netinet6/mld6.c (revision 183550)
@@ -1,651 +1,654 @@
/*-
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $
*/
/*-
* Copyright (c) 1988 Stephen Deering.
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/mld6_var.h>
/*
* Protocol constants
*/
/* denotes that the MLD max response delay field specifies time in milliseconds */
#define MLD_TIMER_SCALE 1000
/*
* time between repetitions of a node's initial report of interest in a
* multicast address (in seconds)
*/
#define MLD_UNSOLICITED_REPORT_INTERVAL 10
static struct ip6_pktopts ip6_opts;
static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *);
static void mld_starttimer(struct in6_multi *);
static void mld_stoptimer(struct in6_multi *);
static void mld_timeo(struct in6_multi *);
static u_long mld_timerresid(struct in6_multi *);
void
mld6_init(void)
{
+ INIT_VNET_INET6(curvnet);
static u_int8_t hbh_buf[8];
struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf;
u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD);
/* ip6h_nxt will be filled in later */
hbh->ip6h_len = 0; /* (8 >> 3) - 1 */
/* XXX: grotty hard coding... */
hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */
hbh_buf[3] = 0;
hbh_buf[4] = IP6OPT_ROUTER_ALERT;
hbh_buf[5] = IP6OPT_RTALERT_LEN - 2;
bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t));
ip6_initpktopts(&V_ip6_opts);
V_ip6_opts.ip6po_hbh = hbh;
}
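/*
 * Illustrative sketch, not part of this source: the hard-coded hbh_buf
 * above encodes a single 8-byte hop-by-hop options header holding a PadN
 * option and an RFC 2711 router alert. Byte by byte it comes out as below;
 * the ICMPv6 next-header value is an assumption, as the comment above notes
 * it is filled in later by the output path.
 */
#if 0
static const unsigned char mld_hbh_wire[8] = {
	58,	/* ip6h_nxt: next header (ICMPv6), set at output time */
	0,	/* ip6h_len: (8 >> 3) - 1, i.e. 8 bytes total */
	1, 0,	/* IP6OPT_PADN, zero bytes of padding data */
	5, 2,	/* IP6OPT_ROUTER_ALERT, 2 bytes of value */
	0, 0	/* htons(IP6OPT_RTALERT_MLD): datagram contains MLD */
};
#endif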
static void
mld_starttimer(struct in6_multi *in6m)
{
struct timeval now;
microtime(&now);
in6m->in6m_timer_expire.tv_sec = now.tv_sec + in6m->in6m_timer / hz;
in6m->in6m_timer_expire.tv_usec = now.tv_usec +
(in6m->in6m_timer % hz) * (1000000 / hz);
if (in6m->in6m_timer_expire.tv_usec > 1000000) {
in6m->in6m_timer_expire.tv_sec++;
in6m->in6m_timer_expire.tv_usec -= 1000000;
}
/* start or restart the timer */
callout_reset(in6m->in6m_timer_ch, in6m->in6m_timer,
(void (*)(void *))mld_timeo, in6m);
}
static void
mld_stoptimer(struct in6_multi *in6m)
{
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
return;
callout_stop(in6m->in6m_timer_ch);
in6m->in6m_timer = IN6M_TIMER_UNDEF;
}
static void
mld_timeo(struct in6_multi *in6m)
{
int s = splnet();
in6m->in6m_timer = IN6M_TIMER_UNDEF;
callout_stop(in6m->in6m_timer_ch);
switch (in6m->in6m_state) {
case MLD_REPORTPENDING:
mld6_start_listening(in6m);
break;
default:
mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
break;
}
splx(s);
}
static u_long
mld_timerresid(struct in6_multi *in6m)
{
struct timeval now, diff;
microtime(&now);
if (now.tv_sec > in6m->in6m_timer_expire.tv_sec ||
(now.tv_sec == in6m->in6m_timer_expire.tv_sec &&
now.tv_usec > in6m->in6m_timer_expire.tv_usec)) {
return (0);
}
diff = in6m->in6m_timer_expire;
diff.tv_sec -= now.tv_sec;
diff.tv_usec -= now.tv_usec;
if (diff.tv_usec < 0) {
diff.tv_sec--;
diff.tv_usec += 1000000;
}
/* return the remaining time in milliseconds */
return (diff.tv_sec * 1000 + diff.tv_usec / 1000);
}
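/*
 * Illustrative sketch, not part of this source: mld_timerresid() above is
 * plain timeval subtraction with a microsecond borrow, then a conversion to
 * milliseconds. The same arithmetic as a standalone helper (the function
 * name is made up for illustration):
 */
#if 0
#include <sys/types.h>
#include <sys/time.h>

static u_long
timeval_resid_ms(const struct timeval *expire, const struct timeval *now)
{
	struct timeval diff;

	if (now->tv_sec > expire->tv_sec ||
	    (now->tv_sec == expire->tv_sec &&
	    now->tv_usec > expire->tv_usec))
		return (0);			/* already expired */
	diff.tv_sec = expire->tv_sec - now->tv_sec;
	diff.tv_usec = expire->tv_usec - now->tv_usec;
	if (diff.tv_usec < 0) {			/* borrow one second */
		diff.tv_sec--;
		diff.tv_usec += 1000000;
	}
	return (diff.tv_sec * 1000 + diff.tv_usec / 1000);
}
#endif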
void
mld6_start_listening(struct in6_multi *in6m)
{
struct in6_addr all_in6;
int s = splnet();
/*
* RFC2710 page 10:
* The node never sends a Report or Done for the link-scope all-nodes
* address.
* MLD messages are never sent for multicast addresses whose scope is 0
* (reserved) or 1 (node-local).
*/
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
in6m->in6m_timer = 0;
in6m->in6m_state = MLD_OTHERLISTENER;
}
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL) {
in6m->in6m_timer = 0;
in6m->in6m_state = MLD_OTHERLISTENER;
} else {
mld6_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_timer = arc4random() %
MLD_UNSOLICITED_REPORT_INTERVAL * hz;
in6m->in6m_state = MLD_IREPORTEDLAST;
mld_starttimer(in6m);
}
splx(s);
}
void
mld6_stop_listening(struct in6_multi *in6m)
{
struct in6_addr allnode, allrouter;
allnode = in6addr_linklocal_allnodes;
if (in6_setscope(&allnode, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
return;
}
allrouter = in6addr_linklocal_allrouters;
if (in6_setscope(&allrouter, in6m->in6m_ifp, NULL)) {
/* XXX impossible */
return;
}
if (in6m->in6m_state == MLD_IREPORTEDLAST &&
!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &allnode) &&
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) >
IPV6_ADDR_SCOPE_INTFACELOCAL) {
mld6_sendpkt(in6m, MLD_LISTENER_DONE, &allrouter);
}
}
void
mld6_input(struct mbuf *m, int off)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct mld_hdr *mldh;
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct in6_multi *in6m;
struct in6_addr mld_addr, all_in6;
struct in6_ifaddr *ia;
struct ifmultiaddr *ifma;
u_long timer; /* timer value in the MLD query header */
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(*mldh),);
mldh = (struct mld_hdr *)(mtod(m, caddr_t) + off);
#else
IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh));
if (mldh == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
/* source address validation */
ip6 = mtod(m, struct ip6_hdr *); /* in case of m_pullup */
if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) {
char ip6bufs[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
log(LOG_ERR,
"mld6_input: src %s is not link-local (grp=%s)\n",
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufg, &mldh->mld_addr));
/*
* The spec (RFC2710) does not explicitly say to discard
* a packet from a non-link-local source address, but we
* believe that is the intent.
* XXX: do we have to allow :: as source?
*/
m_freem(m);
return;
}
/*
* make a copy for local work (in6_setscope() may modify the 1st arg)
*/
mld_addr = mldh->mld_addr;
if (in6_setscope(&mld_addr, ifp, NULL)) {
/* XXX: this should not happen! */
m_free(m);
return;
}
/*
* In the MLD6 specification, there are 3 states and a flag.
*
* In Non-Listener state, we simply don't have a membership record.
* In Delaying Listener state, our timer is running (in6m->in6m_timer)
* In Idle Listener state, our timer is not running
* (in6m->in6m_timer==IN6M_TIMER_UNDEF)
*
* The flag is in6m->in6m_state, it is set to MLD_OTHERLISTENER if
* we have heard a report from another member, or MLD_IREPORTEDLAST
* if we sent the last report.
*/
switch (mldh->mld_type) {
case MLD_LISTENER_QUERY:
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_IS_ADDR_MULTICAST(&mld_addr))
break; /* print error or log stat? */
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, ifp, NULL)) {
/* XXX: this should not happen! */
break;
}
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
* which the query arrived, excluding those that belong
* to the "all-nodes" group (ff02::1).
* - Restart any timer that is already running but has
* a value longer than the requested timeout.
* - Use the value specified in the query message as
* the maximum timeout.
*/
timer = ntohs(mldh->mld_maxdelay);
IFP_TO_IA6(ifp, ia);
if (ia == NULL)
break;
/*
* XXX: System timer resolution is too low to handle Max
* Response Delay, so set the internal timer to 1 even if
* the calculated value equals zero when Max Response
* Delay is positive.
*/
timer = ntohs(mldh->mld_maxdelay) * PR_FASTHZ / MLD_TIMER_SCALE;
if (timer == 0 && mldh->mld_maxdelay)
timer = 1;
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET6)
continue;
in6m = (struct in6_multi *)ifma->ifma_protospec;
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&mld_addr) ||
IN6_ARE_ADDR_EQUAL(&mld_addr, &in6m->in6m_addr)) {
if (timer == 0) {
/* send a report immediately */
mld_stoptimer(in6m);
mld6_sendpkt(in6m, MLD_LISTENER_REPORT,
NULL);
in6m->in6m_timer = 0; /* reset timer */
in6m->in6m_state = MLD_IREPORTEDLAST;
}
else if (in6m->in6m_timer == IN6M_TIMER_UNDEF ||
mld_timerresid(in6m) > timer) {
in6m->in6m_timer =
1 + (arc4random() % timer) * hz / 1000;
mld_starttimer(in6m);
}
}
}
IF_ADDR_UNLOCK(ifp);
break;
case MLD_LISTENER_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports
* can potentially get looped back if we are a multicast
* router, so discard reports sourced by me.
* Note that it is impossible to check the IFF_LOOPBACK flag of
* ifp for this purpose, since ip6_mloopback passes the physical
* interface to looutput.
*/
if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */
break;
if (!IN6_IS_ADDR_MULTICAST(&mld_addr))
break;
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
IN6_LOOKUP_MULTI(mld_addr, ifp, in6m);
if (in6m) {
in6m->in6m_timer = 0; /* transit to idle state */
in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */
}
break;
default: /* this is impossible */
log(LOG_ERR, "mld6_input: illegal type(%d)", mldh->mld_type);
break;
}
m_freem(m);
}
static void
mld6_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *mh, *md;
struct mld_hdr *mldh;
struct ip6_hdr *ip6;
struct ip6_moptions im6o;
struct in6_ifaddr *ia;
struct ifnet *ifp = in6m->in6m_ifp;
struct ifnet *outif = NULL;
/*
* First, find a link-local address on the outgoing interface
* to use as the source address of the MLD packet.
*/
if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST))
== NULL)
return;
/*
* Allocate mbufs to store ip6 header and MLD header.
* We allocate 2 mbufs and chain them in advance because
* it is more convenient when inserting the hop-by-hop option later.
*/
MGETHDR(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL)
return;
MGET(md, M_DONTWAIT, MT_DATA);
if (md == NULL) {
m_free(mh);
return;
}
mh->m_next = md;
mh->m_pkthdr.rcvif = NULL;
mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
mh->m_len = sizeof(struct ip6_hdr);
MH_ALIGN(mh, sizeof(struct ip6_hdr));
/* fill in the ip6 header */
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
/* ip6_hlim will be set by im6o.im6o_multicast_hlim */
ip6->ip6_src = ia->ia_addr.sin6_addr;
ip6->ip6_dst = dst ? *dst : in6m->in6m_addr;
/* fill in the MLD header */
md->m_len = sizeof(struct mld_hdr);
mldh = mtod(md, struct mld_hdr *);
mldh->mld_type = type;
mldh->mld_code = 0;
mldh->mld_cksum = 0;
/* XXX: we assume the function will not be called for query messages */
mldh->mld_maxdelay = 0;
mldh->mld_reserved = 0;
mldh->mld_addr = in6m->in6m_addr;
in6_clearscope(&mldh->mld_addr); /* XXX */
mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr),
sizeof(struct mld_hdr));
/* construct multicast option */
bzero(&im6o, sizeof(im6o));
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
im6o.im6o_multicast_loop = (ip6_mrouter != NULL);
/* increment output statistics */
V_icmp6stat.icp6s_outhist[type]++;
ip6_output(mh, &V_ip6_opts, NULL, 0, &im6o, &outif, NULL);
if (outif) {
icmp6_ifstat_inc(outif, ifs6_out_msg);
switch (type) {
case MLD_LISTENER_QUERY:
icmp6_ifstat_inc(outif, ifs6_out_mldquery);
break;
case MLD_LISTENER_REPORT:
icmp6_ifstat_inc(outif, ifs6_out_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(outif, ifs6_out_mlddone);
break;
}
}
}
/*
* Add an address to the list of IP6 multicast addresses for a given interface.
* Also add source addresses to the list, if the upstream router is MLDv2
* capable and the number of sources is not 0.
*/
struct in6_multi *
in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp,
int *errorp, int delay)
{
struct in6_multi *in6m;
*errorp = 0;
in6m = NULL;
IFF_LOCKGIANT(ifp);
/*IN6_MULTI_LOCK();*/
IN6_LOOKUP_MULTI(*maddr6, ifp, in6m);
if (in6m != NULL) {
/*
* If we already joined this group, just bump the
* refcount and return it.
*/
KASSERT(in6m->in6m_refcount >= 1,
("%s: bad refcount %d", __func__, in6m->in6m_refcount));
++in6m->in6m_refcount;
} else do {
struct in6_multi *nin6m;
struct ifmultiaddr *ifma;
struct sockaddr_in6 sa6;
bzero(&sa6, sizeof(sa6));
sa6.sin6_family = AF_INET6;
sa6.sin6_len = sizeof(struct sockaddr_in6);
sa6.sin6_addr = *maddr6;
*errorp = if_addmulti(ifp, (struct sockaddr *)&sa6, &ifma);
if (*errorp)
break;
/*
* If ifma->ifma_protospec is null, then if_addmulti() created
* a new record. Otherwise, bump refcount, and we are done.
*/
if (ifma->ifma_protospec != NULL) {
in6m = ifma->ifma_protospec;
++in6m->in6m_refcount;
break;
}
nin6m = malloc(sizeof(*nin6m), M_IP6MADDR, M_NOWAIT | M_ZERO);
if (nin6m == NULL) {
if_delmulti_ifma(ifma);
break;
}
nin6m->in6m_addr = *maddr6;
nin6m->in6m_ifp = ifp;
nin6m->in6m_refcount = 1;
nin6m->in6m_ifma = ifma;
ifma->ifma_protospec = nin6m;
nin6m->in6m_timer_ch = malloc(sizeof(*nin6m->in6m_timer_ch),
M_IP6MADDR, M_NOWAIT);
if (nin6m->in6m_timer_ch == NULL) {
free(nin6m, M_IP6MADDR);
if_delmulti_ifma(ifma);
break;
}
LIST_INSERT_HEAD(&in6_multihead, nin6m, in6m_entry);
callout_init(nin6m->in6m_timer_ch, 0);
nin6m->in6m_timer = delay;
if (nin6m->in6m_timer > 0) {
nin6m->in6m_state = MLD_REPORTPENDING;
mld_starttimer(nin6m);
}
mld6_start_listening(nin6m);
in6m = nin6m;
} while (0);
/*IN6_MULTI_UNLOCK();*/
IFF_UNLOCKGIANT(ifp);
return (in6m);
}
/*
* Delete a multicast address record.
*
* TODO: Locking, as per netinet.
*/
void
in6_delmulti(struct in6_multi *in6m)
{
struct ifmultiaddr *ifma;
KASSERT(in6m->in6m_refcount >= 1, ("%s: freeing freed in6m", __func__));
if (--in6m->in6m_refcount == 0) {
mld_stoptimer(in6m);
mld6_stop_listening(in6m);
ifma = in6m->in6m_ifma;
KASSERT(ifma->ifma_protospec == in6m,
("%s: ifma_protospec != in6m", __func__));
ifma->ifma_protospec = NULL;
LIST_REMOVE(in6m, in6m_entry);
free(in6m->in6m_timer_ch, M_IP6MADDR);
free(in6m, M_IP6MADDR);
if_delmulti_ifma(ifma);
}
}
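/*
 * Illustrative sketch, not part of this source: in6_delmulti() above is the
 * classic reference-count release pattern -- only the caller that drops the
 * last reference performs the teardown. Reduced to its skeleton (the struct
 * and function names are made up for illustration):
 */
#if 0
struct refobj {
	int	refcount;			/* cf. in6m_refcount */
	void	(*teardown)(struct refobj *);	/* cf. the free/unlink block */
};

static void
refobj_release(struct refobj *o)
{
	/* caller must hold a reference, so refcount >= 1 on entry */
	if (--o->refcount == 0)
		o->teardown(o);
}
#endif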
Index: head/sys/netinet6/nd6.c
===================================================================
--- head/sys/netinet6/nd6.c (revision 183549)
+++ head/sys/netinet6/nd6.c (revision 183550)
@@ -1,2399 +1,2425 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_arc.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/iso88025.h>
#include <net/fddi.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet/icmp6.h>
#include <sys/limits.h>
#include <sys/vimage.h>
#include <security/mac/mac_framework.h>
#define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
#define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
#define SIN6(s) ((struct sockaddr_in6 *)s)
#define SDL(s) ((struct sockaddr_dl *)s)
/* timer values */
int nd6_prune = 1; /* walk list every second */
int nd6_delay = 5; /* delay first probe by 5 seconds */
int nd6_umaxtries = 3; /* maximum unicast query */
int nd6_mmaxtries = 3; /* maximum multicast query */
int nd6_useloopback = 1; /* use loopback interface for local traffic */
int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */
/* preventing too many loops in ND option parsing */
int nd6_maxndopt = 10; /* max # of ND options allowed */
int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */
int nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */
#ifdef ND6_DEBUG
int nd6_debug = 1;
#else
int nd6_debug = 0;
#endif
/* for debugging? */
static int nd6_inuse, nd6_allocated;
struct llinfo_nd6 llinfo_nd6 = {&llinfo_nd6, &llinfo_nd6};
struct nd_drhead nd_defrouter;
struct nd_prhead nd_prefix = { 0 };
int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL;
static struct sockaddr_in6 all1_sa;
static int nd6_is_new_addr_neighbor __P((struct sockaddr_in6 *,
struct ifnet *));
static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
static void nd6_slowtimo(void *);
static int regen_tmpaddr(struct in6_ifaddr *);
static struct llinfo_nd6 *nd6_free(struct rtentry *, int);
static void nd6_llinfo_timer(void *);
static void clear_llinfo_pqueue(struct llinfo_nd6 *);
struct callout nd6_slowtimo_ch;
struct callout nd6_timer_ch;
extern struct callout in6_tmpaddrtimer_ch;
void
nd6_init(void)
{
+ INIT_VNET_INET6(curvnet);
static int nd6_init_done = 0;
int i;
if (nd6_init_done) {
log(LOG_NOTICE, "nd6_init called more than once (ignored)\n");
return;
}
all1_sa.sin6_family = AF_INET6;
all1_sa.sin6_len = sizeof(struct sockaddr_in6);
for (i = 0; i < sizeof(all1_sa.sin6_addr); i++)
all1_sa.sin6_addr.s6_addr[i] = 0xff;
/* initialization of the default router list */
TAILQ_INIT(&V_nd_defrouter);
/* start timer */
callout_init(&V_nd6_slowtimo_ch, 0);
callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, NULL);
nd6_init_done = 1;
}
struct nd_ifinfo *
nd6_ifattach(struct ifnet *ifp)
{
struct nd_ifinfo *nd;
nd = (struct nd_ifinfo *)malloc(sizeof(*nd), M_IP6NDP, M_WAITOK);
bzero(nd, sizeof(*nd));
nd->initialized = 1;
nd->chlim = IPV6_DEFHLIM;
nd->basereachable = REACHABLE_TIME;
nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
nd->retrans = RETRANS_TIMER;
/*
* Note that the default value of ip6_accept_rtadv is 0, which means
* we won't accept RAs by default even if we set ND6_IFF_ACCEPT_RTADV
* here.
*/
nd->flags = (ND6_IFF_PERFORMNUD | ND6_IFF_ACCEPT_RTADV);
/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
nd6_setmtu0(ifp, nd);
return nd;
}
void
nd6_ifdetach(struct nd_ifinfo *nd)
{
free(nd, M_IP6NDP);
}
/*
* Reset ND level link MTU. This function is called when the physical MTU
* changes, which means we might have to adjust the ND level MTU.
*/
void
nd6_setmtu(struct ifnet *ifp)
{
nd6_setmtu0(ifp, ND_IFINFO(ifp));
}
/* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */
void
nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
{
+ INIT_VNET_INET6(ifp->if_vnet);
u_int32_t omaxmtu;
omaxmtu = ndi->maxmtu;
switch (ifp->if_type) {
case IFT_ARCNET:
ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */
break;
case IFT_FDDI:
ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */
break;
case IFT_ISO88025:
ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu);
break;
default:
ndi->maxmtu = ifp->if_mtu;
break;
}
/*
* Decreasing the interface MTU below the IPv6 minimum MTU may cause
* an undesirable situation. We thus notify the operator of the change
* explicitly. The check for omaxmtu is necessary to restrict the
* log to the case of changing the MTU, not initializing it.
*/
if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) {
log(LOG_NOTICE, "nd6_setmtu0: "
"new link MTU on %s (%lu) is too small for IPv6\n",
if_name(ifp), (unsigned long)ndi->maxmtu);
}
if (ndi->maxmtu > V_in6_maxmtu)
in6_setmaxmtu(); /* check all interfaces just in case */
#undef MIN
}
void
nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
{
bzero(ndopts, sizeof(*ndopts));
ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
ndopts->nd_opts_last
= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
if (icmp6len == 0) {
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
}
/*
* Take one ND option.
*/
struct nd_opt_hdr *
nd6_option(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int olen;
if (ndopts == NULL)
panic("ndopts == NULL in nd6_option");
if (ndopts->nd_opts_last == NULL)
panic("uninitialized ndopts in nd6_option");
if (ndopts->nd_opts_search == NULL)
return NULL;
if (ndopts->nd_opts_done)
return NULL;
nd_opt = ndopts->nd_opts_search;
/* make sure nd_opt_len is inside the buffer */
if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
olen = nd_opt->nd_opt_len << 3;
if (olen == 0) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
bzero(ndopts, sizeof(*ndopts));
return NULL;
}
ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
/* option overruns the end of buffer, invalid */
bzero(ndopts, sizeof(*ndopts));
return NULL;
} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
/* reached the end of options chain */
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
return nd_opt;
}
/*
* Parse multiple ND options.
* This function is much easier to use for ND routines that do not need
* multiple options of the same type.
*/
int
nd6_options(union nd_opts *ndopts)
{
+ INIT_VNET_INET6(curvnet);
struct nd_opt_hdr *nd_opt;
int i = 0;
if (ndopts == NULL)
panic("ndopts == NULL in nd6_options");
if (ndopts->nd_opts_last == NULL)
panic("uninitialized ndopts in nd6_options");
if (ndopts->nd_opts_search == NULL)
return 0;
while (1) {
nd_opt = nd6_option(ndopts);
if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
V_icmp6stat.icp6s_nd_badopt++;
bzero(ndopts, sizeof(*ndopts));
return -1;
}
if (nd_opt == NULL)
goto skip1;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LINKADDR:
case ND_OPT_TARGET_LINKADDR:
case ND_OPT_MTU:
case ND_OPT_REDIRECTED_HEADER:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
nd6log((LOG_INFO,
"duplicated ND6 option found (type=%d)\n",
nd_opt->nd_opt_type));
/* XXX bark? */
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
break;
case ND_OPT_PREFIX_INFORMATION:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
ndopts->nd_opts_pi_end =
(struct nd_opt_prefix_info *)nd_opt;
break;
default:
/*
* Unknown options must be silently ignored,
* to accommodate future extensions to the protocol.
*/
nd6log((LOG_DEBUG,
"nd6_options: unsupported option %d - "
"option ignored\n", nd_opt->nd_opt_type));
}
skip1:
i++;
if (i > V_nd6_maxndopt) {
V_icmp6stat.icp6s_nd_toomanyopt++;
nd6log((LOG_INFO, "too many loop in nd opt\n"));
break;
}
if (ndopts->nd_opts_done)
break;
}
return 0;
}
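/*
 * Illustrative sketch, not part of this source: ND options are TLVs whose
 * length byte counts units of 8 octets, and a length of zero is malformed
 * (it would make the walk above loop forever). The stepping logic of
 * nd6_option() in isolation (next_nd_option() is a made-up name):
 */
#if 0
#include <stddef.h>

static const unsigned char *
next_nd_option(const unsigned char *opt, const unsigned char *end)
{
	int olen;

	if (opt + 2 > end)	/* need at least type and length bytes */
		return (NULL);
	olen = opt[1] << 3;	/* length field is in 8-octet units */
	if (olen == 0)		/* zero length: reject as malformed */
		return (NULL);
	if (opt + olen > end)	/* option overruns the buffer */
		return (NULL);
	return (opt + olen);
}
#endif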
/*
* ND6 timer routine to handle ND6 entries
*/
void
nd6_llinfo_settimer(struct llinfo_nd6 *ln, long tick)
{
if (tick < 0) {
ln->ln_expire = 0;
ln->ln_ntick = 0;
callout_stop(&ln->ln_timer_ch);
} else {
ln->ln_expire = time_second + tick / hz;
if (tick > INT_MAX) {
ln->ln_ntick = tick - INT_MAX;
callout_reset(&ln->ln_timer_ch, INT_MAX,
nd6_llinfo_timer, ln);
} else {
ln->ln_ntick = 0;
callout_reset(&ln->ln_timer_ch, tick,
nd6_llinfo_timer, ln);
}
}
}
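/*
 * Illustrative sketch, not part of this source: callout_reset() takes an
 * int tick count, so nd6_llinfo_settimer() above splits a long timeout into
 * INT_MAX-sized chunks and parks the remainder in ln_ntick; the handler
 * re-arms itself until the remainder is exhausted. The chunking alone
 * (struct and function names are made up for illustration):
 */
#if 0
#include <limits.h>

struct chunked_timer {
	long	remainder;		/* cf. ln_ntick */
};

static int
next_chunk(struct chunked_timer *t, long ticks)
{
	if (ticks > INT_MAX) {
		t->remainder = ticks - INT_MAX;
		return (INT_MAX);	/* largest schedulable chunk */
	}
	t->remainder = 0;
	return ((int)ticks);		/* fits in a single callout */
}
#endif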
static void
nd6_llinfo_timer(void *arg)
{
struct llinfo_nd6 *ln;
struct rtentry *rt;
struct in6_addr *dst;
struct ifnet *ifp;
struct nd_ifinfo *ndi = NULL;
ln = (struct llinfo_nd6 *)arg;
if (ln->ln_ntick > 0) {
if (ln->ln_ntick > INT_MAX) {
ln->ln_ntick -= INT_MAX;
nd6_llinfo_settimer(ln, INT_MAX);
} else {
/* schedule the remaining ticks; settimer() clears ln_ntick */
nd6_llinfo_settimer(ln, ln->ln_ntick);
}
return;
}
if ((rt = ln->ln_rt) == NULL)
panic("ln->ln_rt == NULL");
if ((ifp = rt->rt_ifp) == NULL)
panic("ln->ln_rt->rt_ifp == NULL");
ndi = ND_IFINFO(ifp);
+ CURVNET_SET(ifp->if_vnet);
+ INIT_VNET_INET6(curvnet);
+
/* sanity check */
if (rt->rt_llinfo && (struct llinfo_nd6 *)rt->rt_llinfo != ln)
panic("rt_llinfo(%p) is not equal to ln(%p)",
rt->rt_llinfo, ln);
if (rt_key(rt) == NULL)
panic("rt key is NULL in nd6_timer(ln=%p)", ln);
dst = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr;
switch (ln->ln_state) {
case ND6_LLINFO_INCOMPLETE:
if (ln->ln_asked < V_nd6_mmaxtries) {
ln->ln_asked++;
nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000);
nd6_ns_output(ifp, NULL, dst, ln, 0);
} else {
struct mbuf *m = ln->ln_hold;
if (m) {
struct mbuf *m0;
/*
* assuming every packet in ln_hold has the
* same IP header
*/
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
icmp6_error2(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADDR, 0, rt->rt_ifp);
ln->ln_hold = m0;
clear_llinfo_pqueue(ln);
}
if (rt && rt->rt_llinfo)
(void)nd6_free(rt, 0);
ln = NULL;
}
break;
case ND6_LLINFO_REACHABLE:
if (!ND6_LLINFO_PERMANENT(ln)) {
ln->ln_state = ND6_LLINFO_STALE;
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
}
break;
case ND6_LLINFO_STALE:
/* Garbage Collection(RFC 2461 5.3) */
if (!ND6_LLINFO_PERMANENT(ln)) {
if (rt && rt->rt_llinfo)
(void)nd6_free(rt, 1);
ln = NULL;
}
break;
case ND6_LLINFO_DELAY:
if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
/* We need NUD */
ln->ln_asked = 1;
ln->ln_state = ND6_LLINFO_PROBE;
nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000);
nd6_ns_output(ifp, dst, dst, ln, 0);
} else {
ln->ln_state = ND6_LLINFO_STALE; /* XXX */
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
}
break;
case ND6_LLINFO_PROBE:
if (ln->ln_asked < V_nd6_umaxtries) {
ln->ln_asked++;
nd6_llinfo_settimer(ln, (long)ndi->retrans * hz / 1000);
nd6_ns_output(ifp, dst, dst, ln, 0);
} else if (rt->rt_ifa != NULL &&
rt->rt_ifa->ifa_addr->sa_family == AF_INET6 &&
(((struct in6_ifaddr *)rt->rt_ifa)->ia_flags & IFA_ROUTE)) {
/*
* This is an unreachable neighbor whose address is
* specified as the destination of a p2p interface
* (see in6_ifinit()). We should not free the entry
* since this is sort of a "static" entry generated
* via interface address configuration.
*/
ln->ln_asked = 0;
ln->ln_expire = 0; /* make it permanent */
ln->ln_state = ND6_LLINFO_STALE;
} else {
if (rt && rt->rt_llinfo)
(void)nd6_free(rt, 0);
ln = NULL;
}
break;
}
+ CURVNET_RESTORE();
}
/*
* ND6 timer routine to expire default route list and prefix list
*/
void
-nd6_timer(void *ignored_arg)
+nd6_timer(void *arg)
{
+ CURVNET_SET_QUIET((struct vnet *) arg);
+ INIT_VNET_INET6((struct vnet *) arg);
int s;
struct nd_defrouter *dr;
struct nd_prefix *pr;
struct in6_ifaddr *ia6, *nia6;
struct in6_addrlifetime *lt6;
callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
nd6_timer, NULL);
/* expire default router list */
s = splnet();
dr = TAILQ_FIRST(&V_nd_defrouter);
while (dr) {
if (dr->expire && dr->expire < time_second) {
struct nd_defrouter *t;
t = TAILQ_NEXT(dr, dr_entry);
defrtrlist_del(dr);
dr = t;
} else {
dr = TAILQ_NEXT(dr, dr_entry);
}
}
/*
* Expire interface addresses.
* In the past the loop was inside prefix expiry processing.
* However, from a stricter spec-conformance standpoint, we should
* rather separate address lifetimes and prefix lifetimes.
*/
addrloop:
for (ia6 = V_in6_ifaddr; ia6; ia6 = nia6) {
nia6 = ia6->ia_next;
/* check address lifetime */
lt6 = &ia6->ia6_lifetime;
if (IFA6_IS_INVALID(ia6)) {
int regen = 0;
/*
* If the expiring address is temporary, try
* regenerating a new one. This would be useful when
* we suspended a laptop PC, then turned it on after a
* period that could invalidate all temporary
* addresses. Although we may have to restart the
* loop (see below), it must be after purging the
* address. Otherwise, we'd see an infinite loop of
* regeneration.
*/
if (V_ip6_use_tempaddr &&
(ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
if (regen_tmpaddr(ia6) == 0)
regen = 1;
}
in6_purgeaddr(&ia6->ia_ifa);
if (regen)
goto addrloop; /* XXX: see below */
} else if (IFA6_IS_DEPRECATED(ia6)) {
int oldflags = ia6->ia6_flags;
ia6->ia6_flags |= IN6_IFF_DEPRECATED;
/*
* If a temporary address has just become deprecated,
* regenerate a new one if possible.
*/
if (V_ip6_use_tempaddr &&
(ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
(oldflags & IN6_IFF_DEPRECATED) == 0) {
if (regen_tmpaddr(ia6) == 0) {
/*
* A new temporary address is
* generated.
* XXX: this means the address chain
* has changed while we are still in
* the loop. Although the change
* would not cause disaster (because
* it's not a deletion, but an
* addition,) we'd rather restart the
* loop just for safety. Or does this
* significantly reduce performance??
*/
goto addrloop;
}
}
} else {
/*
* A new RA might have made a deprecated address
* preferred.
*/
ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
}
}
/* expire prefix list */
pr = V_nd_prefix.lh_first;
while (pr) {
/*
* check prefix lifetime.
* since pltime is just for autoconf, pltime processing for
* prefix is not necessary.
*/
if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME &&
time_second - pr->ndpr_lastupdate > pr->ndpr_vltime) {
struct nd_prefix *t;
t = pr->ndpr_next;
/*
* address expiration and prefix expiration are
* separate. NEVER perform in6_purgeaddr here.
*/
prelist_remove(pr);
pr = t;
} else
pr = pr->ndpr_next;
}
splx(s);
+ CURVNET_RESTORE();
}
/*
* ia6 - deprecated/invalidated temporary address
*/
static int
regen_tmpaddr(struct in6_ifaddr *ia6)
{
struct ifaddr *ifa;
struct ifnet *ifp;
struct in6_ifaddr *public_ifa6 = NULL;
ifp = ia6->ia_ifa.ifa_ifp;
for (ifa = ifp->if_addrlist.tqh_first; ifa;
ifa = ifa->ifa_list.tqe_next) {
struct in6_ifaddr *it6;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
it6 = (struct in6_ifaddr *)ifa;
/* ignore non-autoconf addresses. */
if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
/* ignore autoconf addresses with different prefixes. */
if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
continue;
/*
* Now we are looking at an autoconf address with the same
* prefix as ours. If the address is temporary and is still
* preferred, do not create another one. It would be rare, but
* could happen, for example, when we resume a laptop PC after
* a long period.
*/
if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
!IFA6_IS_DEPRECATED(it6)) {
public_ifa6 = NULL;
break;
}
/*
* This is a public autoconf address that has the same prefix
* as ours. If it is preferred, keep it. We can't break the
* loop here, because there may be a still-preferred temporary
* address with the prefix.
*/
if (!IFA6_IS_DEPRECATED(it6))
public_ifa6 = it6;
}
if (public_ifa6 != NULL) {
int e;
if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) {
log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
" tmp addr,errno=%d\n", e);
return (-1);
}
return (0);
}
return (-1);
}
/*
* Nuke neighbor cache/prefix/default router management table, right before
* ifp goes away.
*/
void
nd6_purge(struct ifnet *ifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct llinfo_nd6 *ln, *nln;
struct nd_defrouter *dr, *ndr;
struct nd_prefix *pr, *npr;
/*
* Nuke default router list entries toward ifp.
* We defer removal of default router list entries that are installed
* in the routing table, in order to keep additional side effects as
* small as possible.
*/
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = ndr) {
ndr = TAILQ_NEXT(dr, dr_entry);
if (dr->installed)
continue;
if (dr->ifp == ifp)
defrtrlist_del(dr);
}
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = ndr) {
ndr = TAILQ_NEXT(dr, dr_entry);
if (!dr->installed)
continue;
if (dr->ifp == ifp)
defrtrlist_del(dr);
}
/* Nuke prefix list entries toward ifp */
for (pr = V_nd_prefix.lh_first; pr; pr = npr) {
npr = pr->ndpr_next;
if (pr->ndpr_ifp == ifp) {
/*
* Because if_detach() does *not* release prefixes
* while purging addresses, the reference count will
* still be above zero. We therefore reset it to
* make sure that the prefix really gets purged.
*/
pr->ndpr_refcnt = 0;
/*
* Previously, pr->ndpr_addr was removed as well,
* but I strongly believe we don't have to do it.
* nd6_purge() is only called from in6_ifdetach(),
* which removes all the associated interface addresses
* by itself.
* (jinmei@kame.net 20010129)
*/
prelist_remove(pr);
}
}
/* cancel default outgoing interface setting */
if (V_nd6_defifindex == ifp->if_index)
nd6_setdefaultiface(0);
if (!V_ip6_forwarding && V_ip6_accept_rtadv) { /* XXX: too restrictive? */
/* refresh default router list */
defrouter_select();
}
/*
* Nuke neighbor cache entries for the ifp.
* Note that rt->rt_ifp may not be the same as ifp,
* due to KAME goto ours hack. See RTM_RESOLVE case in
* nd6_rtrequest(), and ip6_input().
*/
ln = V_llinfo_nd6.ln_next;
while (ln && ln != &V_llinfo_nd6) {
struct rtentry *rt;
struct sockaddr_dl *sdl;
nln = ln->ln_next;
rt = ln->ln_rt;
if (rt && rt->rt_gateway &&
rt->rt_gateway->sa_family == AF_LINK) {
sdl = (struct sockaddr_dl *)rt->rt_gateway;
if (sdl->sdl_index == ifp->if_index)
nln = nd6_free(rt, 0);
}
ln = nln;
}
}
struct rtentry *
nd6_lookup(struct in6_addr *addr6, int create, struct ifnet *ifp)
{
+ INIT_VNET_INET6(curvnet);
struct rtentry *rt;
struct sockaddr_in6 sin6;
char ip6buf[INET6_ADDRSTRLEN];
bzero(&sin6, sizeof(sin6));
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = *addr6;
rt = rtalloc1((struct sockaddr *)&sin6, create, 0UL);
if (rt) {
if ((rt->rt_flags & RTF_LLINFO) == 0 && create) {
/*
* This is the case for the default route.
* If we want to create a neighbor cache for the
* address, we should free the route for the
* destination and allocate an interface route.
*/
RTFREE_LOCKED(rt);
rt = NULL;
}
}
if (rt == NULL) {
if (create && ifp) {
int e;
/*
* If no route is available and create is set,
* we allocate a host route for the destination
* and treat it like an interface route.
* This hack is necessary for a neighbor which can't
* be covered by our own prefix.
*/
struct ifaddr *ifa =
ifaof_ifpforaddr((struct sockaddr *)&sin6, ifp);
if (ifa == NULL)
return (NULL);
/*
* Create a new route. RTF_LLINFO is necessary
* to create a Neighbor Cache entry for the
* destination in nd6_rtrequest which will be
* called in rtrequest via ifa->ifa_rtrequest.
*/
if ((e = rtrequest(RTM_ADD, (struct sockaddr *)&sin6,
ifa->ifa_addr, (struct sockaddr *)&all1_sa,
(ifa->ifa_flags | RTF_HOST | RTF_LLINFO) &
~RTF_CLONING, &rt)) != 0) {
log(LOG_ERR,
"nd6_lookup: failed to add route for a "
"neighbor(%s), errno=%d\n",
ip6_sprintf(ip6buf, addr6), e);
}
if (rt == NULL)
return (NULL);
RT_LOCK(rt);
if (rt->rt_llinfo) {
struct llinfo_nd6 *ln =
(struct llinfo_nd6 *)rt->rt_llinfo;
ln->ln_state = ND6_LLINFO_NOSTATE;
}
} else
return (NULL);
}
RT_LOCK_ASSERT(rt);
RT_REMREF(rt);
/*
* Validation for the entry.
* Note that the check for rt_llinfo is necessary because a cloned
* route from a parent route that has the L flag (e.g. the default
* route to a p2p interface) may have the flag, too, while the
* destination is not actually a neighbor.
* XXX: we can't use rt->rt_ifp to check for the interface, since
* it might be the loopback interface if the entry is for our
* own address on a non-loopback interface. Instead, we should
* use rt->rt_ifa->ifa_ifp, which would specify the REAL
* interface.
* Note also that ifa_ifp and ifp may differ when we connect two
* interfaces to the same link, install a link prefix on an interface,
* and try to install a neighbor cache on an interface that does not
* have a route to the prefix.
*/
if ((rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 ||
rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL ||
(ifp && rt->rt_ifa->ifa_ifp != ifp)) {
if (create) {
nd6log((LOG_DEBUG,
"nd6_lookup: failed to lookup %s (if = %s)\n",
ip6_sprintf(ip6buf, addr6),
ifp ? if_name(ifp) : "unspec"));
}
RT_UNLOCK(rt);
return (NULL);
}
RT_UNLOCK(rt); /* XXX not ready to return rt locked */
return (rt);
}
/*
* Test whether a given IPv6 address is a neighbor or not, ignoring
* the actual neighbor cache. The neighbor cache is ignored in order
* to not reenter the routing code from within itself.
*/
static int
nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct nd_prefix *pr;
struct ifaddr *dstaddr;
/*
* A link-local address is always a neighbor.
* XXX: a link does not necessarily specify a single interface.
*/
if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
struct sockaddr_in6 sin6_copy;
u_int32_t zone;
/*
* We need sin6_copy since sa6_recoverscope() may modify the
* content (XXX).
*/
sin6_copy = *addr;
if (sa6_recoverscope(&sin6_copy))
return (0); /* XXX: should be impossible */
if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
return (0);
if (sin6_copy.sin6_scope_id == zone)
return (1);
else
return (0);
}
/*
* If the address matches one of our addresses,
* it should be a neighbor.
* If the address matches one of our on-link prefixes, it should be a
* neighbor.
*/
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
if (pr->ndpr_ifp != ifp)
continue;
if (!(pr->ndpr_stateflags & NDPRF_ONLINK))
continue;
if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
&addr->sin6_addr, &pr->ndpr_mask))
return (1);
}
/*
* If the address is assigned on the node of the other side of
* a p2p interface, the address should be a neighbor.
*/
dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr);
if ((dstaddr != NULL) && (dstaddr->ifa_ifp == ifp))
return (1);
/*
* If the default router list is empty, all addresses are regarded
* as on-link, and thus, as a neighbor.
* XXX: we restrict the condition to hosts, because routers usually do
* not have the "default router list".
*/
if (!V_ip6_forwarding && TAILQ_FIRST(&V_nd_defrouter) == NULL &&
V_nd6_defifindex == ifp->if_index) {
return (1);
}
return (0);
}
/*
* Detect if a given IPv6 address identifies a neighbor on a given link.
* XXX: should take care of the destination of a p2p link?
*/
int
nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp)
{
if (nd6_is_new_addr_neighbor(addr, ifp))
return (1);
/*
* Even if the address matches none of our addresses, it might be
* in the neighbor cache.
*/
if (nd6_lookup(&addr->sin6_addr, 0, ifp) != NULL)
return (1);
return (0);
}
/*
* Free an nd6 llinfo entry.
* Since the function would cause significant changes in the kernel, DO NOT
* make it global, unless you have a strong reason for the change, and are sure
* that the change is safe.
*/
static struct llinfo_nd6 *
nd6_free(struct rtentry *rt, int gc)
{
+ INIT_VNET_INET6(curvnet);
struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo, *next;
struct in6_addr in6 = ((struct sockaddr_in6 *)rt_key(rt))->sin6_addr;
struct nd_defrouter *dr;
/*
* we used to have pfctlinput(PRC_HOSTDEAD) here.
* even though it is not harmful, it was not really necessary.
*/
/* cancel timer */
nd6_llinfo_settimer(ln, -1);
if (!V_ip6_forwarding) {
int s;
s = splnet();
dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))->sin6_addr,
rt->rt_ifp);
if (dr != NULL && dr->expire &&
ln->ln_state == ND6_LLINFO_STALE && gc) {
/*
* If the reason for the deletion is just garbage
* collection, and the neighbor is an active default
* router, do not delete it. Instead, reset the GC
* timer using the router's lifetime.
* Simply deleting the entry would affect default
* router selection, which is not necessarily a good
* thing, especially when we're using router preference
* values.
* XXX: the check for ln_state would be redundant,
* but we intentionally keep it just in case.
*/
if (dr->expire > time_second)
nd6_llinfo_settimer(ln,
(dr->expire - time_second) * hz);
else
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
splx(s);
return (ln->ln_next);
}
if (ln->ln_router || dr) {
/*
* rt6_flush must be called whether or not the neighbor
* is in the Default Router List.
* See a corresponding comment in nd6_na_input().
*/
rt6_flush(&in6, rt->rt_ifp);
}
if (dr) {
/*
* Unreachability of a router might affect the default
* router selection and on-link detection of advertised
* prefixes.
*/
/*
* Temporarily fake the state to choose a new default
* router and to perform on-link determination of
* prefixes correctly.
* Below the state will be set correctly,
* or the entry itself will be deleted.
*/
ln->ln_state = ND6_LLINFO_INCOMPLETE;
/*
* Since defrouter_select() does not affect the
* on-link determination and MIP6 needs the check
* before the default router selection, we perform
* the check now.
*/
pfxlist_onlink_check();
/*
* refresh default router list
*/
defrouter_select();
}
splx(s);
}
/*
* Before deleting the entry, remember the next entry as the
* return value. We need this because pfxlist_onlink_check() above
* might have freed other entries (particularly the old next entry) as
* a side effect (XXX).
*/
next = ln->ln_next;
/*
* Detach the route from the routing tree and the list of neighbor
* caches, and disable the route entry so that it is no longer used
* by already-cached routes.
*/
rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0,
rt_mask(rt), 0, (struct rtentry **)0);
return (next);
}
/*
* Upper-layer reachability hint for Neighbor Unreachability Detection.
*
* XXX cost-effective methods?
*/
void
nd6_nud_hint(struct rtentry *rt, struct in6_addr *dst6, int force)
{
+ INIT_VNET_INET6(curvnet);
struct llinfo_nd6 *ln;
/*
* If the caller specified "rt", use that. Otherwise, resolve the
* routing table by supplied "dst6".
*/
if (rt == NULL) {
if (dst6 == NULL)
return;
if ((rt = nd6_lookup(dst6, 0, NULL)) == NULL)
return;
}
if ((rt->rt_flags & RTF_GATEWAY) != 0 ||
(rt->rt_flags & RTF_LLINFO) == 0 ||
rt->rt_llinfo == NULL || rt->rt_gateway == NULL ||
rt->rt_gateway->sa_family != AF_LINK) {
/* This is not a host route. */
return;
}
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
if (ln->ln_state < ND6_LLINFO_REACHABLE)
return;
/*
* if we get upper-layer reachability confirmation many times,
* it is possible we have false information.
*/
if (!force) {
ln->ln_byhint++;
if (ln->ln_byhint > V_nd6_maxnudhint)
return;
}
ln->ln_state = ND6_LLINFO_REACHABLE;
if (!ND6_LLINFO_PERMANENT(ln)) {
nd6_llinfo_settimer(ln,
(long)ND_IFINFO(rt->rt_ifp)->reachable * hz);
}
}
/*
* info - XXX unused
*/
void
nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info)
{
struct sockaddr *gate = rt->rt_gateway;
struct llinfo_nd6 *ln = (struct llinfo_nd6 *)rt->rt_llinfo;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
struct ifnet *ifp = rt->rt_ifp;
struct ifaddr *ifa;
+ INIT_VNET_NET(ifp->if_vnet);
+ INIT_VNET_INET6(ifp->if_vnet);
RT_LOCK_ASSERT(rt);
if ((rt->rt_flags & RTF_GATEWAY) != 0)
return;
if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) {
/*
* This is probably an interface direct route for a link
* which does not need neighbor caches (e.g. fe80::%lo0/64).
* We do not need special treatment below for such a route.
* Moreover, the RTF_LLINFO flag which would be set below
* would annoy the ndp(8) command.
*/
return;
}
if (req == RTM_RESOLVE &&
(nd6_need_cache(ifp) == 0 || /* stf case */
!nd6_is_new_addr_neighbor((struct sockaddr_in6 *)rt_key(rt),
ifp))) {
/*
* FreeBSD and BSD/OS often make a cloned host route based
* on a less-specific route (e.g. the default route).
* If the less specific route does not have a "gateway"
* (this is the case when the route just goes to a p2p or an
* stf interface), we'll mistakenly make a neighbor cache for
* the host route, and will see strange neighbor solicitation
* for the corresponding destination. In order to avoid the
* confusion, we check if the destination of the route is
* a neighbor in terms of neighbor discovery, and stop the
* process if not. Additionally, we remove the LLINFO flag
* so that ndp(8) will not try to get the neighbor information
* of the destination.
*/
rt->rt_flags &= ~RTF_LLINFO;
return;
}
switch (req) {
case RTM_ADD:
/*
* There is no backward compatibility :)
*
* if ((rt->rt_flags & RTF_HOST) == 0 &&
* SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff)
* rt->rt_flags |= RTF_CLONING;
*/
if ((rt->rt_flags & RTF_CLONING) ||
((rt->rt_flags & RTF_LLINFO) && ln == NULL)) {
/*
* Case 1: This route should come from a route to
* interface (RTF_CLONING case) or the route should be
* treated as on-link but is currently not
* (RTF_LLINFO && ln == NULL case).
*/
rt_setgate(rt, rt_key(rt),
(struct sockaddr *)&null_sdl);
gate = rt->rt_gateway;
SDL(gate)->sdl_type = ifp->if_type;
SDL(gate)->sdl_index = ifp->if_index;
if (ln)
nd6_llinfo_settimer(ln, 0);
if ((rt->rt_flags & RTF_CLONING) != 0)
break;
}
/*
* In IPv4 code, we try to announce a new RTF_ANNOUNCE entry here.
* We don't do that here since llinfo is not ready yet.
*
* There are also a couple of other things to be discussed:
* - unsolicited NA code needs improvement beforehand
* - RFC2461 says we MAY send multicast unsolicited NA
* (7.2.6 paragraph 4), however, it also says that we
* SHOULD provide a mechanism to prevent multicast NA storm.
* we don't have anything like it right now.
* note that the mechanism needs a mutual agreement
* between proxies, which means that we need to implement
* a new protocol, or a new kludge.
* - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA.
* we need to check ip6forwarding before sending it.
* (or should we allow proxy ND configuration only for
* routers? there's no mention about proxy ND from hosts)
*/
/* FALLTHROUGH */
case RTM_RESOLVE:
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) {
/*
* Address resolution isn't necessary for a point to
* point link, so we can skip this test for a p2p link.
*/
if (gate->sa_family != AF_LINK ||
gate->sa_len < sizeof(null_sdl)) {
log(LOG_DEBUG,
"nd6_rtrequest: bad gateway value: %s\n",
if_name(ifp));
break;
}
SDL(gate)->sdl_type = ifp->if_type;
SDL(gate)->sdl_index = ifp->if_index;
}
if (ln != NULL)
break; /* This happens on a route change */
/*
* Case 2: This route may come from cloning, or a manual route
* add with a LL address.
*/
R_Malloc(ln, struct llinfo_nd6 *, sizeof(*ln));
rt->rt_llinfo = (caddr_t)ln;
if (ln == NULL) {
log(LOG_DEBUG, "nd6_rtrequest: malloc failed\n");
break;
}
V_nd6_inuse++;
V_nd6_allocated++;
bzero(ln, sizeof(*ln));
RT_ADDREF(rt);
ln->ln_rt = rt;
callout_init(&ln->ln_timer_ch, 0);
/* this is required for "ndp" command. - shin */
if (req == RTM_ADD) {
/*
* gate should have some valid AF_LINK entry,
* and ln->ln_expire should have some lifetime
* which is specified by ndp command.
*/
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
} else {
/*
* When req == RTM_RESOLVE, rt is created and
* initialized in rtrequest(), so rt_expire is 0.
*/
ln->ln_state = ND6_LLINFO_NOSTATE;
nd6_llinfo_settimer(ln, 0);
}
rt->rt_flags |= RTF_LLINFO;
ln->ln_next = V_llinfo_nd6.ln_next;
V_llinfo_nd6.ln_next = ln;
ln->ln_prev = &V_llinfo_nd6;
ln->ln_next->ln_prev = ln;
/*
* check if rt_key(rt) is one of my addresses assigned
* to the interface.
*/
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(rt->rt_ifp,
&SIN6(rt_key(rt))->sin6_addr);
if (ifa) {
caddr_t macp = nd6_ifptomac(ifp);
nd6_llinfo_settimer(ln, -1);
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
if (macp) {
bcopy(macp, LLADDR(SDL(gate)), ifp->if_addrlen);
SDL(gate)->sdl_alen = ifp->if_addrlen;
}
if (V_nd6_useloopback) {
rt->rt_ifp = &V_loif[0]; /* XXX */
/*
* Make sure rt_ifa is equal to the ifaddr
* corresponding to the address.
* We need this because when we refer to
* rt_ifa->ia6_flags in ip6_input, we assume
* that rt_ifa points to the address instead
* of the loopback address.
*/
if (ifa != rt->rt_ifa) {
IFAFREE(rt->rt_ifa);
IFAREF(ifa);
rt->rt_ifa = ifa;
}
}
} else if (rt->rt_flags & RTF_ANNOUNCE) {
nd6_llinfo_settimer(ln, -1);
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
/* join solicited node multicast for proxy ND */
if (ifp->if_flags & IFF_MULTICAST) {
struct in6_addr llsol;
int error;
llsol = SIN6(rt_key(rt))->sin6_addr;
llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL;
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr8[12] = 0xff;
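/*
 * The four assignments above rewrite the target into its
 * solicited-node multicast form, ff02::1:ffXX:XXXX, keeping only
 * the low 24 bits of the original address (RFC 4291, 2.7.1).
 */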
if (in6_setscope(&llsol, ifp, NULL))
break;
if (in6_addmulti(&llsol, ifp,
&error, 0) == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_ERR, "%s: failed to join "
"%s (errno=%d)\n", if_name(ifp),
ip6_sprintf(ip6buf, &llsol),
error));
}
}
}
break;
case RTM_DELETE:
if (ln == NULL)
break;
/* leave solicited node multicast for proxy ND */
if ((rt->rt_flags & RTF_ANNOUNCE) != 0 &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
struct in6_addr llsol;
struct in6_multi *in6m;
llsol = SIN6(rt_key(rt))->sin6_addr;
llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL;
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr8[12] = 0xff;
if (in6_setscope(&llsol, ifp, NULL) == 0) {
IN6_LOOKUP_MULTI(llsol, ifp, in6m);
if (in6m)
in6_delmulti(in6m);
} else
; /* XXX: should not happen. bark here? */
}
V_nd6_inuse--;
ln->ln_next->ln_prev = ln->ln_prev;
ln->ln_prev->ln_next = ln->ln_next;
ln->ln_prev = NULL;
nd6_llinfo_settimer(ln, -1);
RT_REMREF(rt);
rt->rt_llinfo = 0;
rt->rt_flags &= ~RTF_LLINFO;
clear_llinfo_pqueue(ln);
Free((caddr_t)ln);
}
}
int
nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct in6_drlist *drl = (struct in6_drlist *)data;
struct in6_oprlist *oprl = (struct in6_oprlist *)data;
struct in6_ndireq *ndi = (struct in6_ndireq *)data;
struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
struct nd_defrouter *dr;
struct nd_prefix *pr;
struct rtentry *rt;
int i = 0, error = 0;
int s;
switch (cmd) {
case SIOCGDRLST_IN6:
/*
* obsolete API, use sysctl under net.inet6.icmp6
*/
bzero(drl, sizeof(*drl));
s = splnet();
dr = TAILQ_FIRST(&V_nd_defrouter);
while (dr && i < DRLSTSIZ) {
drl->defrouter[i].rtaddr = dr->rtaddr;
in6_clearscope(&drl->defrouter[i].rtaddr);
drl->defrouter[i].flags = dr->flags;
drl->defrouter[i].rtlifetime = dr->rtlifetime;
drl->defrouter[i].expire = dr->expire;
drl->defrouter[i].if_index = dr->ifp->if_index;
i++;
dr = TAILQ_NEXT(dr, dr_entry);
}
splx(s);
break;
case SIOCGPRLST_IN6:
/*
* obsolete API, use sysctl under net.inet6.icmp6
*
* XXX the structure in6_prlist was changed in backward-
* incompatible manner. in6_oprlist is used for SIOCGPRLST_IN6,
* in6_prlist is used for nd6_sysctl() - fill_prlist().
*/
/*
* XXX meaning of fields, especially "raflags", is very
* different between RA prefix list and RR/static prefix list.
* how about separating ioctls into two?
*/
bzero(oprl, sizeof(*oprl));
s = splnet();
pr = V_nd_prefix.lh_first;
while (pr && i < PRLSTSIZ) {
struct nd_pfxrouter *pfr;
int j;
oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr;
oprl->prefix[i].raflags = pr->ndpr_raf;
oprl->prefix[i].prefixlen = pr->ndpr_plen;
oprl->prefix[i].vltime = pr->ndpr_vltime;
oprl->prefix[i].pltime = pr->ndpr_pltime;
oprl->prefix[i].if_index = pr->ndpr_ifp->if_index;
if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
oprl->prefix[i].expire = 0;
else {
time_t maxexpire;
/* XXX: we assume time_t is signed. */
maxexpire = (-1) &
~((time_t)1 <<
((sizeof(maxexpire) * 8) - 1));
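/*
 * The mask above clears only the sign bit, yielding the largest
 * positive time_t (e.g. 0x7fffffff for a 32-bit time_t); it is
 * used below to saturate, rather than overflow, the expire time.
 */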
if (pr->ndpr_vltime <
maxexpire - pr->ndpr_lastupdate) {
oprl->prefix[i].expire =
pr->ndpr_lastupdate +
pr->ndpr_vltime;
} else
oprl->prefix[i].expire = maxexpire;
}
pfr = pr->ndpr_advrtrs.lh_first;
j = 0;
while (pfr) {
if (j < DRLSTSIZ) {
#define RTRADDR oprl->prefix[i].advrtr[j]
RTRADDR = pfr->router->rtaddr;
in6_clearscope(&RTRADDR);
#undef RTRADDR
}
j++;
pfr = pfr->pfr_next;
}
oprl->prefix[i].advrtrs = j;
oprl->prefix[i].origin = PR_ORIG_RA;
i++;
pr = pr->ndpr_next;
}
splx(s);
break;
case OSIOCGIFINFO_IN6:
#define ND ndi->ndi
/* XXX: old ndp(8) assumes a positive value for linkmtu. */
bzero(&ND, sizeof(ND));
ND.linkmtu = IN6_LINKMTU(ifp);
ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
ND.basereachable = ND_IFINFO(ifp)->basereachable;
ND.reachable = ND_IFINFO(ifp)->reachable;
ND.retrans = ND_IFINFO(ifp)->retrans;
ND.flags = ND_IFINFO(ifp)->flags;
ND.recalctm = ND_IFINFO(ifp)->recalctm;
ND.chlim = ND_IFINFO(ifp)->chlim;
break;
case SIOCGIFINFO_IN6:
ND = *ND_IFINFO(ifp);
break;
case SIOCSIFINFO_IN6:
/*
* used to change host variables from userland.
* intended for use on a router to reflect RA configurations.
*/
/* 0 means 'unspecified' */
if (ND.linkmtu != 0) {
if (ND.linkmtu < IPV6_MMTU ||
ND.linkmtu > IN6_LINKMTU(ifp)) {
error = EINVAL;
break;
}
ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
}
if (ND.basereachable != 0) {
int obasereachable = ND_IFINFO(ifp)->basereachable;
ND_IFINFO(ifp)->basereachable = ND.basereachable;
if (ND.basereachable != obasereachable)
ND_IFINFO(ifp)->reachable =
ND_COMPUTE_RTIME(ND.basereachable);
}
if (ND.retrans != 0)
ND_IFINFO(ifp)->retrans = ND.retrans;
if (ND.chlim != 0)
ND_IFINFO(ifp)->chlim = ND.chlim;
/* FALLTHROUGH */
case SIOCSIFINFO_FLAGS:
ND_IFINFO(ifp)->flags = ND.flags;
break;
#undef ND
case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */
/* sync kernel routing table with the default router list */
defrouter_reset();
defrouter_select();
break;
case SIOCSPFXFLUSH_IN6:
{
/* flush all the prefixes advertised by routers */
struct nd_prefix *pr, *next;
s = splnet();
for (pr = V_nd_prefix.lh_first; pr; pr = next) {
struct in6_ifaddr *ia, *ia_next;
next = pr->ndpr_next;
if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
continue; /* XXX */
/* do we really have to remove addresses as well? */
for (ia = V_in6_ifaddr; ia; ia = ia_next) {
/* ia might be removed. keep the next ptr. */
ia_next = ia->ia_next;
if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
if (ia->ia6_ndpr == pr)
in6_purgeaddr(&ia->ia_ifa);
}
prelist_remove(pr);
}
splx(s);
break;
}
case SIOCSRTRFLUSH_IN6:
{
/* flush all the default routers */
struct nd_defrouter *dr, *next;
s = splnet();
defrouter_reset();
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr; dr = next) {
next = TAILQ_NEXT(dr, dr_entry);
defrtrlist_del(dr);
}
defrouter_select();
splx(s);
break;
}
case SIOCGNBRINFO_IN6:
{
struct llinfo_nd6 *ln;
struct in6_addr nb_addr = nbi->addr; /* make local for safety */
if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
return (error);
s = splnet();
if ((rt = nd6_lookup(&nb_addr, 0, ifp)) == NULL) {
error = EINVAL;
splx(s);
break;
}
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
nbi->state = ln->ln_state;
nbi->asked = ln->ln_asked;
nbi->isrouter = ln->ln_router;
nbi->expire = ln->ln_expire;
splx(s);
break;
}
case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */
ndif->ifindex = V_nd6_defifindex;
break;
case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */
return (nd6_setdefaultiface(ndif->ifindex));
}
return (error);
}
/*
* Create neighbor cache entry and cache link-layer address,
* on reception of inbound ND6 packets. (RS/RA/NS/redirect)
*
* type - ICMP6 type
* code - type dependent information
*/
struct rtentry *
nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
int lladdrlen, int type, int code)
{
+ INIT_VNET_INET6(curvnet);
struct rtentry *rt = NULL;
struct llinfo_nd6 *ln = NULL;
int is_newentry;
struct sockaddr_dl *sdl = NULL;
int do_update;
int olladdr;
int llchange;
int newstate = 0;
if (ifp == NULL)
panic("ifp == NULL in nd6_cache_lladdr");
if (from == NULL)
panic("from == NULL in nd6_cache_lladdr");
/* nothing must be updated for unspecified address */
if (IN6_IS_ADDR_UNSPECIFIED(from))
return NULL;
/*
* Validation about ifp->if_addrlen and lladdrlen must be done in
* the caller.
*
* XXX If the link does not have a link-layer address, what should
* we do? (ifp->if_addrlen == 0)
* The spec says nothing in the sections for RA, RS and NA. There's a
* small description of it in the NS section (RFC 2461 7.2.3).
*/
rt = nd6_lookup(from, 0, ifp);
if (rt == NULL) {
rt = nd6_lookup(from, 1, ifp);
is_newentry = 1;
} else {
/* do nothing if static ndp is set */
if (rt->rt_flags & RTF_STATIC)
return NULL;
is_newentry = 0;
}
if (rt == NULL)
return NULL;
if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) {
fail:
(void)nd6_free(rt, 0);
return NULL;
}
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
if (ln == NULL)
goto fail;
if (rt->rt_gateway == NULL)
goto fail;
if (rt->rt_gateway->sa_family != AF_LINK)
goto fail;
sdl = SDL(rt->rt_gateway);
olladdr = (sdl->sdl_alen) ? 1 : 0;
if (olladdr && lladdr) {
if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen))
llchange = 1;
else
llchange = 0;
} else
llchange = 0;
/*
* newentry olladdr lladdr llchange (*=record)
* 0 n n -- (1)
* 0 y n -- (2)
* 0 n y -- (3) * STALE
* 0 y y n (4) *
* 0 y y y (5) * STALE
* 1 -- n -- (6) NOSTATE(= PASSIVE)
* 1 -- y -- (7) * STALE
*/
if (lladdr) { /* (3-5) and (7) */
/*
* Record source link-layer address
* XXX is it dependent on ifp->if_type?
*/
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen);
}
if (!is_newentry) {
if ((!olladdr && lladdr != NULL) || /* (3) */
(olladdr && lladdr != NULL && llchange)) { /* (5) */
do_update = 1;
newstate = ND6_LLINFO_STALE;
} else /* (1-2,4) */
do_update = 0;
} else {
do_update = 1;
if (lladdr == NULL) /* (6) */
newstate = ND6_LLINFO_NOSTATE;
else /* (7) */
newstate = ND6_LLINFO_STALE;
}
if (do_update) {
/*
* Update the state of the neighbor cache.
*/
ln->ln_state = newstate;
if (ln->ln_state == ND6_LLINFO_STALE) {
/*
* XXX: since nd6_output() below will cause
* state transition to DELAY and reset the timer,
* we must set the timer now, although it is actually
* meaningless.
*/
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
if (ln->ln_hold) {
struct mbuf *m_hold, *m_hold_next;
/*
* reset the ln_hold in advance, to explicitly
* prevent a ln_hold lookup in nd6_output()
* (wouldn't happen, though...)
*/
for (m_hold = ln->ln_hold, ln->ln_hold = NULL;
m_hold; m_hold = m_hold_next) {
m_hold_next = m_hold->m_nextpkt;
m_hold->m_nextpkt = NULL;
/*
* we assume ifp is not a p2p here, so
* just set the 2nd argument as the
* 1st one.
*/
nd6_output(ifp, ifp, m_hold,
(struct sockaddr_in6 *)rt_key(rt),
rt);
}
}
} else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
/* probe right away */
nd6_llinfo_settimer((void *)ln, 0);
}
}
/*
* ICMP6 type dependent behavior.
*
* NS: clear IsRouter if new entry
* RS: clear IsRouter
* RA: set IsRouter if there's lladdr
* redir: clear IsRouter if new entry
*
* RA case, (1):
* The spec says that we must set IsRouter in the following cases:
* - If lladdr exists, set IsRouter. This means (1-5).
* - If it is an old entry (!newentry), set IsRouter. This means (7).
* So, based on the spec, in the (1-5) and (7) cases we must set IsRouter.
* A question arises for the (1) case: it has no lladdr in the
* neighbor cache, which is similar to (6).
* This case is rare but we figured that we MUST NOT set IsRouter.
*
* newentry olladdr lladdr llchange NS RS RA redir
* D R
* 0 n n -- (1) c ? s
* 0 y n -- (2) c s s
* 0 n y -- (3) c s s
* 0 y y n (4) c s s
* 0 y y y (5) c s s
* 1 -- n -- (6) c c c s
* 1 -- y -- (7) c c s c s
*
* (c=clear s=set)
*/
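/*
 * In the redir columns above, D should be read as a redirect to an
 * on-link destination (ND_REDIRECT_ONLINK) and R as a redirect to a
 * better router (ND_REDIRECT_ROUTER), matching the switch below.
 */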
switch (type & 0xff) {
case ND_NEIGHBOR_SOLICIT:
/*
* New entry must have is_router flag cleared.
*/
if (is_newentry) /* (6-7) */
ln->ln_router = 0;
break;
case ND_REDIRECT:
/*
* If the icmp is a redirect to a better router, always set the
* is_router flag. Otherwise, if the entry is newly created,
* clear the flag. [RFC 2461, sec 8.3]
*/
if (code == ND_REDIRECT_ROUTER)
ln->ln_router = 1;
else if (is_newentry) /* (6-7) */
ln->ln_router = 0;
break;
case ND_ROUTER_SOLICIT:
/*
* is_router flag must always be cleared.
*/
ln->ln_router = 0;
break;
case ND_ROUTER_ADVERT:
/*
* Mark an entry with lladdr as a router.
*/
if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */
(is_newentry && lladdr)) { /* (7) */
ln->ln_router = 1;
}
break;
}
/*
* When the link-layer address of a router changes, select the
* best router again. In particular, when the neighbor entry is newly
* created, it might affect the selection policy.
* Question: can we restrict the first condition to the "is_newentry"
* case?
* XXX: when we hear an RA from a new router with the link-layer
* address option, defrouter_select() is called twice, since
* defrtrlist_update called the function as well. However, I believe
* we can tolerate the overhead, since it only happens the first
* time.
* XXX: although defrouter_select() should not have a bad effect
* on hosts that are not autoconfigured, we explicitly avoid such
* cases for safety.
*/
if (do_update && ln->ln_router && !V_ip6_forwarding && V_ip6_accept_rtadv)
defrouter_select();
return rt;
}
static void
-nd6_slowtimo(void *ignored_arg)
+nd6_slowtimo(void *arg)
{
+ CURVNET_SET((struct vnet *) arg);
+ INIT_VNET_NET((struct vnet *) arg);
+ INIT_VNET_INET6((struct vnet *) arg);
struct nd_ifinfo *nd6if;
struct ifnet *ifp;
callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, NULL);
IFNET_RLOCK();
for (ifp = TAILQ_FIRST(&V_ifnet); ifp;
ifp = TAILQ_NEXT(ifp, if_list)) {
nd6if = ND_IFINFO(ifp);
if (nd6if->basereachable && /* already initialized */
(nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
/*
* Since reachable time rarely changes by router
* advertisements, we SHOULD ensure that a new random
* value gets recomputed at least once every few hours.
* (RFC 2461, 6.3.4)
*/
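/*
 * Sketch of the recomputation below, assuming the stock definition
 * of ND_COMPUTE_RTIME: reachable is redrawn uniformly at random
 * between MIN_RANDOM_FACTOR and MAX_RANDOM_FACTOR times
 * basereachable, i.e. 0.5 to 1.5 times BaseReachableTime in the
 * RFC 2461 protocol constants.
 */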
nd6if->recalctm = V_nd6_recalc_reachtm_interval;
nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable);
}
}
IFNET_RUNLOCK();
+ CURVNET_RESTORE();
}
#define senderr(e) { error = (e); goto bad;}
int
nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0,
struct sockaddr_in6 *dst, struct rtentry *rt0)
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = m0;
struct rtentry *rt = rt0;
struct sockaddr_in6 *gw6 = NULL;
struct llinfo_nd6 *ln = NULL;
int error = 0;
if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr))
goto sendpkt;
if (nd6_need_cache(ifp) == 0)
goto sendpkt;
/*
* next hop determination. This routine is derived from ether_output.
*/
/* NB: the locking here is tortuous... */
if (rt != NULL)
RT_LOCK(rt);
again:
if (rt != NULL) {
if ((rt->rt_flags & RTF_UP) == 0) {
RT_UNLOCK(rt);
rt0 = rt = rtalloc1((struct sockaddr *)dst, 1, 0UL);
if (rt != NULL) {
RT_REMREF(rt);
if (rt->rt_ifp != ifp)
/*
* XXX maybe we should update ifp too,
* but the original code didn't and I
* don't know what is correct here.
*/
goto again;
} else
senderr(EHOSTUNREACH);
}
if (rt->rt_flags & RTF_GATEWAY) {
gw6 = (struct sockaddr_in6 *)rt->rt_gateway;
/*
* We skip link-layer address resolution and NUD
* if the gateway is not a neighbor from ND point
* of view, regardless of the value of nd_ifinfo.flags.
* The second condition is a bit tricky; we skip
* if the gateway is our own address, which is
* sometimes used to install a route to a p2p link.
*/
if (!nd6_is_addr_neighbor(gw6, ifp) ||
in6ifa_ifpwithaddr(ifp, &gw6->sin6_addr)) {
RT_UNLOCK(rt);
/*
* We allow this kind of tricky route only
* when the outgoing interface is p2p.
* XXX: we may need a more generic rule here.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
senderr(EHOSTUNREACH);
goto sendpkt;
}
if (rt->rt_gwroute == NULL)
goto lookup;
rt = rt->rt_gwroute;
RT_LOCK(rt); /* NB: gwroute */
if ((rt->rt_flags & RTF_UP) == 0) {
RTFREE_LOCKED(rt); /* unlock gwroute */
rt = rt0;
rt0->rt_gwroute = NULL;
lookup:
RT_UNLOCK(rt0);
rt = rtalloc1(rt->rt_gateway, 1, 0UL);
if (rt == rt0) {
RT_REMREF(rt0);
RT_UNLOCK(rt0);
senderr(EHOSTUNREACH);
}
RT_LOCK(rt0);
if (rt0->rt_gwroute != NULL)
RTFREE(rt0->rt_gwroute);
rt0->rt_gwroute = rt;
if (rt == NULL) {
RT_UNLOCK(rt0);
senderr(EHOSTUNREACH);
}
}
RT_UNLOCK(rt0);
}
RT_UNLOCK(rt);
}
/*
* Address resolution or Neighbor Unreachability Detection
* for the next hop.
* At this point, the destination of the packet must be a unicast
* or an anycast address (i.e., not a multicast address).
*/
/* Look up the neighbor cache for the nexthop */
if (rt && (rt->rt_flags & RTF_LLINFO) != 0)
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
else {
/*
* Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
* the condition below is not very efficient. But we believe
* it is tolerable, because this should be a rare case.
*/
if (nd6_is_addr_neighbor(dst, ifp) &&
(rt = nd6_lookup(&dst->sin6_addr, 1, ifp)) != NULL)
ln = (struct llinfo_nd6 *)rt->rt_llinfo;
}
if (ln == NULL || rt == NULL) {
if ((ifp->if_flags & IFF_POINTOPOINT) == 0 &&
!(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"nd6_output: can't allocate llinfo for %s "
"(ln=%p, rt=%p)\n",
ip6_sprintf(ip6buf, &dst->sin6_addr), ln, rt);
senderr(EIO); /* XXX: good error? */
}
goto sendpkt; /* send anyway */
}
/* We don't have to do link-layer address resolution on a p2p link. */
if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
ln->ln_state < ND6_LLINFO_REACHABLE) {
ln->ln_state = ND6_LLINFO_STALE;
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
}
/*
* The first time we send a packet to a neighbor whose entry is
* STALE, we have to change the state to DELAY and set a timer to
* expire in DELAY_FIRST_PROBE_TIME seconds, so that neighbor
* unreachability detection is performed on expiration.
* (RFC 2461 7.3.3)
*/
if (ln->ln_state == ND6_LLINFO_STALE) {
ln->ln_asked = 0;
ln->ln_state = ND6_LLINFO_DELAY;
nd6_llinfo_settimer(ln, (long)V_nd6_delay * hz);
}
/*
* If the neighbor cache entry has a state other than INCOMPLETE
* (i.e. its link-layer address is already resolved), just
* send the packet.
*/
if (ln->ln_state > ND6_LLINFO_INCOMPLETE)
goto sendpkt;
/*
* There is a neighbor cache entry, but no ethernet address
* response yet. Append this latest packet to the end of the
* hold queue, as long as the number of queued packets does not
* exceed nd6_maxqueuelen; once it does, the oldest packet in
* the queue is removed.
*/
if (ln->ln_state == ND6_LLINFO_NOSTATE)
ln->ln_state = ND6_LLINFO_INCOMPLETE;
if (ln->ln_hold) {
struct mbuf *m_hold;
int i;
i = 0;
for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold->m_nextpkt) {
i++;
if (m_hold->m_nextpkt == NULL) {
m_hold->m_nextpkt = m;
break;
}
}
while (i >= V_nd6_maxqueuelen) {
m_hold = ln->ln_hold;
ln->ln_hold = ln->ln_hold->m_nextpkt;
m_freem(m_hold);
i--;
}
} else {
ln->ln_hold = m;
}
/*
* If there has been no NS for the neighbor after entering the
* INCOMPLETE state, send the first solicitation.
*/
if (!ND6_LLINFO_PERMANENT(ln) && ln->ln_asked == 0) {
ln->ln_asked++;
nd6_llinfo_settimer(ln,
(long)ND_IFINFO(ifp)->retrans * hz / 1000);
nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0);
}
return (0);
sendpkt:
/* discard the packet if IPv6 operation is disabled on the interface */
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
error = ENETDOWN; /* better error? */
goto bad;
}
#ifdef MAC
mac_netinet6_nd6_send(ifp, m);
#endif
if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
return ((*ifp->if_output)(origifp, m, (struct sockaddr *)dst,
rt));
}
return ((*ifp->if_output)(ifp, m, (struct sockaddr *)dst, rt));
bad:
if (m)
m_freem(m);
return (error);
}
#undef senderr
int
nd6_need_cache(struct ifnet *ifp)
{
/*
* XXX: we currently do not make a neighbor cache on any interface
* other than ARCnet, Ethernet, FDDI and GIF.
*
* RFC2893 says:
* - unidirectional tunnels need no ND
*/
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_FDDI:
case IFT_IEEE1394:
#ifdef IFT_L2VLAN
case IFT_L2VLAN:
#endif
#ifdef IFT_IEEE80211
case IFT_IEEE80211:
#endif
#ifdef IFT_CARP
case IFT_CARP:
#endif
case IFT_GIF: /* XXX need more cases? */
case IFT_PPP:
case IFT_TUNNEL:
case IFT_BRIDGE:
case IFT_PROPVIRTUAL:
return (1);
default:
return (0);
}
}
int
nd6_storelladdr(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m,
struct sockaddr *dst, u_char *desten)
{
struct sockaddr_dl *sdl;
struct rtentry *rt;
int error;
if (m->m_flags & M_MCAST) {
int i;
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
#ifdef IFT_L2VLAN
case IFT_L2VLAN:
#endif
#ifdef IFT_IEEE80211
case IFT_IEEE80211:
#endif
case IFT_BRIDGE:
case IFT_ISO88025:
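/*
 * For these interface types no neighbor cache entry is consulted;
 * the macro below derives the group MAC 33:33:xx:xx:xx:xx directly
 * from the low 32 bits of the IPv6 destination (RFC 2464, section 7).
 */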
ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr,
desten);
return (0);
case IFT_IEEE1394:
/*
* NetBSD can use if_broadcastaddr, but we don't,
* to reduce the number of #ifdefs.
*/
for (i = 0; i < ifp->if_addrlen; i++)
desten[i] = ~0;
return (0);
case IFT_ARCNET:
*desten = 0;
return (0);
default:
m_freem(m);
return (EAFNOSUPPORT);
}
}
if (rt0 == NULL) {
/* this could happen, if we could not allocate memory */
m_freem(m);
return (ENOMEM);
}
error = rt_check(&rt, &rt0, dst);
if (error) {
m_freem(m);
return (error);
}
RT_UNLOCK(rt);
if (rt->rt_gateway->sa_family != AF_LINK) {
printf("nd6_storelladdr: something odd happens\n");
m_freem(m);
return (EINVAL);
}
sdl = SDL(rt->rt_gateway);
if (sdl->sdl_alen == 0) {
/* this should be impossible, but we bark here for debugging */
printf("nd6_storelladdr: sdl_alen == 0\n");
m_freem(m);
return (EINVAL);
}
bcopy(LLADDR(sdl), desten, sdl->sdl_alen);
return (0);
}
static void
clear_llinfo_pqueue(struct llinfo_nd6 *ln)
{
struct mbuf *m_hold, *m_hold_next;
for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold_next) {
m_hold_next = m_hold->m_nextpkt;
m_hold->m_nextpkt = NULL;
m_freem(m_hold);
}
ln->ln_hold = NULL;
return;
}
static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS);
static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS);
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet6_icmp6);
#endif
SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
CTLFLAG_RD, nd6_sysctl_drlist, "");
SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
CTLFLAG_RD, nd6_sysctl_prlist, "");
-SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
- CTLFLAG_RW, &nd6_maxqueuelen, 1, "");
+SYSCTL_V_INT(V_NET, vnet_inet6, _net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN,
+ nd6_maxqueuelen, CTLFLAG_RW, nd6_maxqueuelen, 1, "");
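/*
 * With the virtualized form above, the knob is no longer a single
 * global: the vimage macro resolves nd6_maxqueuelen inside the
 * per-vnet vnet_inet6 container, so each virtual network stack gets
 * its own queue-length limit.
 */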
static int
nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET6(curvnet);
int error;
char buf[1024] __aligned(4);
struct in6_defrouter *d, *de;
struct nd_defrouter *dr;
if (req->newptr)
return EPERM;
error = 0;
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr;
dr = TAILQ_NEXT(dr, dr_entry)) {
d = (struct in6_defrouter *)buf;
de = (struct in6_defrouter *)(buf + sizeof(buf));
if (d + 1 <= de) {
bzero(d, sizeof(*d));
d->rtaddr.sin6_family = AF_INET6;
d->rtaddr.sin6_len = sizeof(d->rtaddr);
d->rtaddr.sin6_addr = dr->rtaddr;
error = sa6_recoverscope(&d->rtaddr);
if (error != 0)
return (error);
d->flags = dr->flags;
d->rtlifetime = dr->rtlifetime;
d->expire = dr->expire;
d->if_index = dr->ifp->if_index;
} else
panic("buffer too short");
error = SYSCTL_OUT(req, buf, sizeof(*d));
if (error)
break;
}
return (error);
}
static int
nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET6(curvnet);
int error;
char buf[1024] __aligned(4);
struct in6_prefix *p, *pe;
struct nd_prefix *pr;
char ip6buf[INET6_ADDRSTRLEN];
if (req->newptr)
return EPERM;
error = 0;
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
u_short advrtrs;
size_t advance;
struct sockaddr_in6 *sin6, *s6;
struct nd_pfxrouter *pfr;
p = (struct in6_prefix *)buf;
pe = (struct in6_prefix *)(buf + sizeof(buf));
if (p + 1 <= pe) {
bzero(p, sizeof(*p));
sin6 = (struct sockaddr_in6 *)(p + 1);
p->prefix = pr->ndpr_prefix;
if (sa6_recoverscope(&p->prefix)) {
log(LOG_ERR,
"scope error in prefix list (%s)\n",
ip6_sprintf(ip6buf, &p->prefix.sin6_addr));
/* XXX: press on... */
}
p->raflags = pr->ndpr_raf;
p->prefixlen = pr->ndpr_plen;
p->vltime = pr->ndpr_vltime;
p->pltime = pr->ndpr_pltime;
p->if_index = pr->ndpr_ifp->if_index;
if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
p->expire = 0;
else {
time_t maxexpire;
/* XXX: we assume time_t is signed. */
maxexpire = (-1) &
~((time_t)1 <<
((sizeof(maxexpire) * 8) - 1));
if (pr->ndpr_vltime <
maxexpire - pr->ndpr_lastupdate) {
p->expire = pr->ndpr_lastupdate +
pr->ndpr_vltime;
} else
p->expire = maxexpire;
}
p->refcnt = pr->ndpr_refcnt;
p->flags = pr->ndpr_stateflags;
p->origin = PR_ORIG_RA;
advrtrs = 0;
for (pfr = pr->ndpr_advrtrs.lh_first; pfr;
pfr = pfr->pfr_next) {
if ((void *)&sin6[advrtrs + 1] > (void *)pe) {
advrtrs++;
continue;
}
s6 = &sin6[advrtrs];
bzero(s6, sizeof(*s6));
s6->sin6_family = AF_INET6;
s6->sin6_len = sizeof(*sin6);
s6->sin6_addr = pfr->router->rtaddr;
if (sa6_recoverscope(s6)) {
log(LOG_ERR,
"scope error in "
"prefix list (%s)\n",
ip6_sprintf(ip6buf,
&pfr->router->rtaddr));
}
advrtrs++;
}
p->advrtrs = advrtrs;
} else
panic("buffer too short");
advance = sizeof(*p) + sizeof(*sin6) * advrtrs;
error = SYSCTL_OUT(req, buf, advance);
if (error)
break;
}
return (error);
}
Index: head/sys/netinet6/nd6_nbr.c
===================================================================
--- head/sys/netinet6/nd6_nbr.c (revision 183549)
+++ head/sys/netinet6/nd6_nbr.c (revision 183550)
@@ -1,1488 +1,1500 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_carp.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/callout.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/if_var.h>
#include <net/route.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet/icmp6.h>
#ifdef DEV_CARP
#include <netinet/ip_carp.h>
#endif
#define SDL(s) ((struct sockaddr_dl *)s)
struct dadq;
static struct dadq *nd6_dad_find(struct ifaddr *);
static void nd6_dad_starttimer(struct dadq *, int);
static void nd6_dad_stoptimer(struct dadq *);
static void nd6_dad_timer(struct ifaddr *);
static void nd6_dad_ns_output(struct dadq *, struct ifaddr *);
static void nd6_dad_ns_input(struct ifaddr *);
static void nd6_dad_na_input(struct ifaddr *);
static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect */
static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */
/*
* Input a Neighbor Solicitation Message.
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*/
void
nd6_ns_input(struct mbuf *m, int off, int icmp6len)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_neighbor_solicit *nd_ns;
struct in6_addr saddr6 = ip6->ip6_src;
struct in6_addr daddr6 = ip6->ip6_dst;
struct in6_addr taddr6;
struct in6_addr myaddr6;
char *lladdr = NULL;
struct ifaddr *ifa = NULL;
int lladdrlen = 0;
int anycast = 0, proxy = 0, tentative = 0;
int tlladdr;
union nd_opts ndopts;
struct sockaddr_dl *proxydl = NULL;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_ns, struct nd_neighbor_solicit *, m, off, icmp6len);
if (nd_ns == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */
taddr6 = nd_ns->nd_ns_target;
if (in6_setscope(&taddr6, ifp, NULL) != 0)
goto bad;
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
/* dst has to be a solicited node multicast address. */
if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL &&
/* don't check ifindex portion */
daddr6.s6_addr32[1] == 0 &&
daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE &&
daddr6.s6_addr8[12] == 0xff) {
; /* good */
} else {
nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
"(wrong ip6 dst)\n"));
goto bad;
}
} else if (!nd6_onlink_ns_rfc4861) {
struct sockaddr_in6 src_sa6;
/*
* According to recent IETF discussions, it is not a good idea
* to accept a NS from an address which would not be deemed
* to be a neighbor otherwise. This point is expected to be
* clarified in future revisions of the specification.
*/
bzero(&src_sa6, sizeof(src_sa6));
src_sa6.sin6_family = AF_INET6;
src_sa6.sin6_len = sizeof(src_sa6);
src_sa6.sin6_addr = saddr6;
if (!nd6_is_addr_neighbor(&src_sa6, ifp)) {
nd6log((LOG_INFO, "nd6_ns_input: "
"NS packet from non-neighbor\n"));
goto bad;
}
}
if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n"));
goto bad;
}
icmp6len -= sizeof(*nd_ns);
nd6_option_init(nd_ns + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_ns_input: invalid ND option, ignored\n"));
/* nd6_options have incremented stats */
goto freeit;
}
if (ndopts.nd_opts_src_lladdr) {
lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
}
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) {
nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
"(link-layer address option)\n"));
goto bad;
}
/*
* Attaching target link-layer address to the NA?
* (RFC 2461 7.2.4)
*
* NS IP dst is unicast/anycast MUST NOT add
* NS IP dst is solicited-node multicast MUST add
*
* In our implementation, we add the target link-layer address by
* default; we do not add one in the MUST NOT cases.
*/
if (!IN6_IS_ADDR_MULTICAST(&daddr6))
tlladdr = 0;
else
tlladdr = 1;
/*
* Target address (taddr6) must be either:
* (1) Valid unicast/anycast address for my receiving interface,
* (2) Unicast address for which I'm offering proxy service, or
* (3) "tentative" address on which DAD is being performed.
*/
/* (1) and (3) check. */
#ifdef DEV_CARP
if (ifp->if_carp)
ifa = carp_iamatch6(ifp->if_carp, &taddr6);
if (ifa == NULL)
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
#else
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
#endif
/* (2) check. */
if (ifa == NULL) {
struct rtentry *rt;
struct sockaddr_in6 tsin6;
int need_proxy;
#ifdef RADIX_MPATH
struct route_in6 ro;
#endif
bzero(&tsin6, sizeof tsin6);
tsin6.sin6_len = sizeof(struct sockaddr_in6);
tsin6.sin6_family = AF_INET6;
tsin6.sin6_addr = taddr6;
#ifdef RADIX_MPATH
bzero(&ro, sizeof(ro));
ro.ro_dst = tsin6;
rtalloc_mpath((struct route *)&ro, RTF_ANNOUNCE);
rt = ro.ro_rt;
#else
rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0);
#endif
need_proxy = (rt && (rt->rt_flags & RTF_ANNOUNCE) != 0 &&
rt->rt_gateway->sa_family == AF_LINK);
if (rt)
rtfree(rt);
if (need_proxy) {
/*
* proxy NDP for single entry
*/
ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
if (ifa) {
proxy = 1;
proxydl = SDL(rt->rt_gateway);
}
}
}
if (ifa == NULL) {
/*
* We've got an NS packet, and we don't have that address
* assigned to us. We MUST silently ignore it.
* See RFC2461 7.2.3.
*/
goto freeit;
}
myaddr6 = *IFA_IN6(ifa);
anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST;
tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE;
if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED)
goto freeit;
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s "
"(if %d, NS packet %d)\n",
ip6_sprintf(ip6bufs, &taddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) {
nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n",
ip6_sprintf(ip6bufs, &saddr6)));
goto freeit;
}
/*
* We have a neighbor solicitation packet whose target address equals
* one of my tentative addresses.
*
* src addr how to process?
* --- ---
* multicast of course, invalid (rejected in ip6_input)
* unicast somebody is doing address resolution -> ignore
* unspec dup address detection
*
* The processing is defined in RFC 2462.
*/
if (tentative) {
/*
* If the source address is the unspecified address, it is for
* duplicate address detection.
*
* If not, the packet is for address resolution;
* silently ignore it.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
nd6_dad_ns_input(ifa);
goto freeit;
}
/*
* If the source address is the unspecified address, entries must not
* be created or updated.
* It looks like the sender is performing DAD. Output an NA toward
* the all-nodes multicast address, to tell the sender that I'm using
* the address.
* The S bit ("solicited") must be zero.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
struct in6_addr in6_all;
in6_all = in6addr_linklocal_allnodes;
if (in6_setscope(&in6_all, ifp, NULL) != 0)
goto bad;
nd6_na_output(ifp, &in6_all, &taddr6,
((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
(V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0),
tlladdr, (struct sockaddr *)proxydl);
goto freeit;
}
nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
ND_NEIGHBOR_SOLICIT, 0);
nd6_na_output(ifp, &saddr6, &taddr6,
((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
(V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0) | ND_NA_FLAG_SOLICITED,
tlladdr, (struct sockaddr *)proxydl);
freeit:
m_freem(m);
return;
bad:
nd6log((LOG_ERR, "nd6_ns_input: src=%s\n",
ip6_sprintf(ip6bufs, &saddr6)));
nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n",
ip6_sprintf(ip6bufs, &daddr6)));
nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n",
ip6_sprintf(ip6bufs, &taddr6)));
V_icmp6stat.icp6s_badns++;
m_freem(m);
}
/*
* Output a Neighbor Solicitation Message. Caller specifies:
* - ICMP6 header source IP6 address
* - ND6 header target IP6 address
* - ND6 header source datalink address
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*
* ln - for source address determination
* dad - duplicate address detection
*/
void
nd6_ns_output(struct ifnet *ifp, const struct in6_addr *daddr6,
const struct in6_addr *taddr6, struct llinfo_nd6 *ln, int dad)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct mbuf *m;
struct ip6_hdr *ip6;
struct nd_neighbor_solicit *nd_ns;
struct in6_addr *src, src_in;
struct ip6_moptions im6o;
int icmp6len;
int maxlen;
caddr_t mac;
struct route_in6 ro;
bzero(&ro, sizeof(ro));
if (IN6_IS_ADDR_MULTICAST(taddr6))
return;
/* estimate the size of message */
maxlen = sizeof(*ip6) + sizeof(*nd_ns);
maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
if (max_linkhdr + maxlen >= MCLBYTES) {
#ifdef DIAGNOSTIC
printf("nd6_ns_output: max_linkhdr + maxlen >= MCLBYTES "
"(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES);
#endif
return;
}
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && max_linkhdr + maxlen >= MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return;
m->m_pkthdr.rcvif = NULL;
if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) {
m->m_flags |= M_MCAST;
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 255;
im6o.im6o_multicast_loop = 0;
}
icmp6len = sizeof(*nd_ns);
m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len;
m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */
/* fill neighbor solicitation packet */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6->ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
if (daddr6)
ip6->ip6_dst = *daddr6;
else {
ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
ip6->ip6_dst.s6_addr16[1] = 0;
ip6->ip6_dst.s6_addr32[1] = 0;
ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE;
ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3];
ip6->ip6_dst.s6_addr8[12] = 0xff;
if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
goto bad;
}
if (!dad) {
/*
* RFC2461 7.2.2:
* "If the source address of the packet prompting the
* solicitation is the same as one of the addresses assigned
* to the outgoing interface, that address SHOULD be placed
* in the IP Source Address of the outgoing solicitation.
* Otherwise, any one of the addresses assigned to the
* interface should be used."
*
* We use the source address for the prompting packet
* (saddr6), if:
* - saddr6 is given from the caller (by giving "ln"), and
* - saddr6 belongs to the outgoing interface.
* Otherwise, we perform the source address selection as usual.
*/
struct ip6_hdr *hip6; /* hold ip6 */
struct in6_addr *hsrc = NULL;
if (ln && ln->ln_hold) {
/*
* assuming every packet in ln_hold has the same IP
* header
*/
hip6 = mtod(ln->ln_hold, struct ip6_hdr *);
/* XXX pullup? */
if (sizeof(*hip6) < ln->ln_hold->m_len)
hsrc = &hip6->ip6_src;
else
hsrc = NULL;
}
if (hsrc && in6ifa_ifpwithaddr(ifp, hsrc))
src = hsrc;
else {
int error;
struct sockaddr_in6 dst_sa;
bzero(&dst_sa, sizeof(dst_sa));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(dst_sa);
dst_sa.sin6_addr = ip6->ip6_dst;
src = in6_selectsrc(&dst_sa, NULL,
NULL, &ro, NULL, NULL, &error);
if (src == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG,
"nd6_ns_output: source can't be "
"determined: dst=%s, error=%d\n",
ip6_sprintf(ip6buf, &dst_sa.sin6_addr),
error));
goto bad;
}
}
} else {
/*
* The source address for a DAD packet must always be the IPv6
* unspecified address (::).
* We actually don't have to 0-clear the address (we did it
* above), but we do so here explicitly to make the intention
* clearer.
*/
bzero(&src_in, sizeof(src_in));
src = &src_in;
}
ip6->ip6_src = *src;
nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1);
nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
nd_ns->nd_ns_code = 0;
nd_ns->nd_ns_reserved = 0;
nd_ns->nd_ns_target = *taddr6;
in6_clearscope(&nd_ns->nd_ns_target); /* XXX */
/*
* Add source link-layer address option.
*
* spec implementation
* --- ---
* DAD packet MUST NOT do not add the option
* there's no link layer address:
* impossible do not add the option
* there's link layer address:
* Multicast NS MUST add one add the option
* Unicast NS SHOULD add one add the option
*/
if (!dad && (mac = nd6_ifptomac(ifp))) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
/* 8 byte alignments... */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
}
ip6->ip6_plen = htons((u_short)icmp6len);
nd_ns->nd_ns_cksum = 0;
nd_ns->nd_ns_cksum =
in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len);
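/*
 * Note that nd_ns_cksum is zeroed first because the checksum field
 * is itself covered by in6_cksum(), which computes the ICMPv6
 * checksum over the IPv6 pseudo-header plus the message body.
 */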
ip6_output(m, NULL, &ro, dad ? IPV6_UNSPECSRC : 0, &im6o, NULL, NULL);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit);
V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++;
if (ro.ro_rt) { /* we don't cache this route. */
RTFREE(ro.ro_rt);
}
return;
bad:
if (ro.ro_rt) {
RTFREE(ro.ro_rt);
}
m_freem(m);
return;
}
/*
* Neighbor advertisement input handling.
*
* Based on RFC 2461
* Based on RFC 2462 (duplicate address detection)
*
* the following items are not implemented yet:
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*/
void
nd6_na_input(struct mbuf *m, int off, int icmp6len)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_neighbor_advert *nd_na;
struct in6_addr daddr6 = ip6->ip6_dst;
struct in6_addr taddr6;
int flags;
int is_router;
int is_solicited;
int is_override;
char *lladdr = NULL;
int lladdrlen = 0;
struct ifaddr *ifa;
struct llinfo_nd6 *ln;
struct rtentry *rt;
struct sockaddr_dl *sdl;
union nd_opts ndopts;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_na_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_na, struct nd_neighbor_advert *, m, off, icmp6len);
if (nd_na == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
flags = nd_na->nd_na_flags_reserved;
is_router = ((flags & ND_NA_FLAG_ROUTER) != 0);
is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0);
is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0);
taddr6 = nd_na->nd_na_target;
if (in6_setscope(&taddr6, ifp, NULL))
goto bad; /* XXX: impossible */
if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
nd6log((LOG_ERR,
"nd6_na_input: invalid target address %s\n",
ip6_sprintf(ip6bufs, &taddr6)));
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&daddr6))
if (is_solicited) {
nd6log((LOG_ERR,
"nd6_na_input: a solicited adv is multicasted\n"));
goto bad;
}
icmp6len -= sizeof(*nd_na);
nd6_option_init(nd_na + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_na_input: invalid ND option, ignored\n"));
/* nd6_options have incremented stats */
goto freeit;
}
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
}
ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
/*
* Target address matches one of my interface addresses.
*
* If my address is tentative, this means that there's somebody
* already using the same address as mine. This indicates DAD failure.
* This is defined in RFC 2462.
*
* Otherwise, process as defined in RFC 2461.
*/
if (ifa
&& (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) {
nd6_dad_na_input(ifa);
goto freeit;
}
/* Just for safety, maybe unnecessary. */
if (ifa) {
log(LOG_ERR,
"nd6_na_input: duplicate IP6 address %s\n",
ip6_sprintf(ip6bufs, &taddr6));
goto freeit;
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s "
"(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
/*
* If no neighbor cache entry is found, NA SHOULD silently be
* discarded.
*/
rt = nd6_lookup(&taddr6, 0, ifp);
if ((rt == NULL) ||
((ln = (struct llinfo_nd6 *)rt->rt_llinfo) == NULL) ||
((sdl = SDL(rt->rt_gateway)) == NULL))
goto freeit;
if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
/*
* If the link layer has an address and no lladdr option came,
* discard the packet.
*/
if (ifp->if_addrlen && lladdr == NULL)
goto freeit;
/*
* Record link-layer address, and update the state.
*/
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen);
if (is_solicited) {
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
if (!ND6_LLINFO_PERMANENT(ln)) {
nd6_llinfo_settimer(ln,
(long)ND_IFINFO(rt->rt_ifp)->reachable * hz);
}
} else {
ln->ln_state = ND6_LLINFO_STALE;
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
}
if ((ln->ln_router = is_router) != 0) {
/*
* This means a router's state has changed from
* non-reachable to probably reachable, and might
* affect the status of associated prefixes.
*/
pfxlist_onlink_check();
}
} else {
int llchange;
/*
* Check if the link-layer address has changed or not.
*/
if (lladdr == NULL)
llchange = 0;
else {
if (sdl->sdl_alen) {
if (bcmp(lladdr, LLADDR(sdl), ifp->if_addrlen))
llchange = 1;
else
llchange = 0;
} else
llchange = 1;
}
/*
* This is VERY complex. Look at it with care.
*
* override solicit lladdr llchange action
* (L: record lladdr)
*
* 0 0 n -- (2c)
* 0 0 y n (2b) L
* 0 0 y y (1) REACHABLE->STALE
* 0 1 n -- (2c) *->REACHABLE
* 0 1 y n (2b) L *->REACHABLE
* 0 1 y y (1) REACHABLE->STALE
* 1 0 n -- (2a)
* 1 0 y n (2a) L
* 1 0 y y (2a) L *->STALE
* 1 1 n -- (2a) *->REACHABLE
* 1 1 y n (2a) L *->REACHABLE
* 1 1 y y (2a) L *->REACHABLE
*/
if (!is_override && (lladdr != NULL && llchange)) { /* (1) */
/*
* If state is REACHABLE, make it STALE.
* No other updates should be done.
*/
if (ln->ln_state == ND6_LLINFO_REACHABLE) {
ln->ln_state = ND6_LLINFO_STALE;
nd6_llinfo_settimer(ln, (long)V_nd6_gctimer * hz);
}
goto freeit;
} else if (is_override /* (2a) */
|| (!is_override && (lladdr != NULL && !llchange)) /* (2b) */
|| lladdr == NULL) { /* (2c) */
/*
* Update the link-layer address, if any.
*/
if (lladdr != NULL) {
sdl->sdl_alen = ifp->if_addrlen;
bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen);
}
/*
* If solicited, make the state REACHABLE.
* If not solicited and the link-layer address was
* changed, make it STALE.
*/
if (is_solicited) {
ln->ln_state = ND6_LLINFO_REACHABLE;
ln->ln_byhint = 0;
if (!ND6_LLINFO_PERMANENT(ln)) {
nd6_llinfo_settimer(ln,
(long)ND_IFINFO(ifp)->reachable * hz);
}
} else {
if (lladdr != NULL && llchange) {
ln->ln_state = ND6_LLINFO_STALE;
nd6_llinfo_settimer(ln,
(long)V_nd6_gctimer * hz);
}
}
}
if (ln->ln_router && !is_router) {
/*
* The peer dropped the router flag.
* Remove the sender from the Default Router List and
* update the Destination Cache entries.
*/
struct nd_defrouter *dr;
struct in6_addr *in6;
int s;
in6 = &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr;
/*
* Lock to protect the default router list.
* XXX: this might be unnecessary, since this function
* is only called under the network software interrupt
* context. However, we keep it just for safety.
*/
s = splnet();
dr = defrouter_lookup(in6, ifp);
if (dr)
defrtrlist_del(dr);
else if (!V_ip6_forwarding) {
/*
* Even if the neighbor is not in the default
* router list, the neighbor may be used
* as a next hop for some destinations
* (e.g. redirect case). So we must
* call rt6_flush explicitly.
*/
rt6_flush(&ip6->ip6_src, ifp);
}
splx(s);
}
ln->ln_router = is_router;
}
rt->rt_flags &= ~RTF_REJECT;
ln->ln_asked = 0;
if (ln->ln_hold) {
struct mbuf *m_hold, *m_hold_next;
/*
* send each held packet now that the neighbor is resolved,
* then clear ln_hold to prevent another ln_hold lookup in
* nd6_output() (wouldn't happen, though...)
*/
for (m_hold = ln->ln_hold;
m_hold; m_hold = m_hold_next) {
m_hold_next = m_hold->m_nextpkt;
m_hold->m_nextpkt = NULL;
/*
* we assume ifp is not a loopback interface here, so just
* pass the same ifp as both the 1st and 2nd arguments.
*/
nd6_output(ifp, ifp, m_hold,
(struct sockaddr_in6 *)rt_key(rt), rt);
}
ln->ln_hold = NULL;
}
freeit:
m_freem(m);
return;
bad:
V_icmp6stat.icp6s_badna++;
m_freem(m);
}
/*
* Neighbor advertisement output handling.
*
* Based on RFC 2461
*
* The following items are not implemented yet:
* - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
* - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
*
* tlladdr - 1 if the target link-layer address option should be included
* sdl0 - sockaddr_dl (= proxy NA) or NULL
*/
void
nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0,
const struct in6_addr *taddr6, u_long flags, int tlladdr,
struct sockaddr *sdl0)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct mbuf *m;
struct ip6_hdr *ip6;
struct nd_neighbor_advert *nd_na;
struct ip6_moptions im6o;
struct in6_addr *src, daddr6;
struct sockaddr_in6 dst_sa;
int icmp6len, maxlen, error;
caddr_t mac = NULL;
struct route_in6 ro;
bzero(&ro, sizeof(ro));
daddr6 = *daddr6_0; /* make a local copy for modification */
/* estimate the size of the message */
maxlen = sizeof(*ip6) + sizeof(*nd_na);
maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
if (max_linkhdr + maxlen >= MCLBYTES) {
#ifdef DIAGNOSTIC
printf("nd6_na_output: max_linkhdr + maxlen >= MCLBYTES "
"(%d + %d > %d)\n", max_linkhdr, maxlen, MCLBYTES);
#endif
return;
}
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && max_linkhdr + maxlen >= MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return;
m->m_pkthdr.rcvif = NULL;
if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
m->m_flags |= M_MCAST;
im6o.im6o_multicast_ifp = ifp;
im6o.im6o_multicast_hlim = 255;
im6o.im6o_multicast_loop = 0;
}
icmp6len = sizeof(*nd_na);
m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len;
m->m_data += max_linkhdr; /* or MH_ALIGN() equivalent? */
/* fill neighbor advertisement packet */
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_nxt = IPPROTO_ICMPV6;
ip6->ip6_hlim = 255;
if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) {
/* reply to DAD: send to the link-local all-nodes group (ff02::1) */
daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
daddr6.s6_addr16[1] = 0;
daddr6.s6_addr32[1] = 0;
daddr6.s6_addr32[2] = 0;
daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE;
if (in6_setscope(&daddr6, ifp, NULL))
goto bad;
flags &= ~ND_NA_FLAG_SOLICITED;
}
ip6->ip6_dst = daddr6;
bzero(&dst_sa, sizeof(struct sockaddr_in6));
dst_sa.sin6_family = AF_INET6;
dst_sa.sin6_len = sizeof(struct sockaddr_in6);
dst_sa.sin6_addr = daddr6;
/*
* Select a source whose scope is the same as that of the dest.
*/
bcopy(&dst_sa, &ro.ro_dst, sizeof(dst_sa));
src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, NULL, &error);
if (src == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
"determined: dst=%s, error=%d\n",
ip6_sprintf(ip6buf, &dst_sa.sin6_addr), error));
goto bad;
}
ip6->ip6_src = *src;
nd_na = (struct nd_neighbor_advert *)(ip6 + 1);
nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
nd_na->nd_na_code = 0;
nd_na->nd_na_target = *taddr6;
in6_clearscope(&nd_na->nd_na_target); /* XXX */
/*
* "tlladdr" indicates NS's condition for adding tlladdr or not.
* see nd6_ns_input() for details.
* Basically, if NS packet is sent to unicast/anycast addr,
* target lladdr option SHOULD NOT be included.
*/
if (tlladdr) {
/*
* sdl0 != NULL indicates proxy NA. If we do proxy, use
* lladdr in sdl0. If we are not proxying (sending NA for
* my address) use lladdr configured for the interface.
*/
if (sdl0 == NULL) {
#ifdef DEV_CARP
if (ifp->if_carp)
mac = carp_macmatch6(ifp->if_carp, m, taddr6);
if (mac == NULL)
mac = nd6_ifptomac(ifp);
#else
mac = nd6_ifptomac(ifp);
#endif
} else if (sdl0->sa_family == AF_LINK) {
struct sockaddr_dl *sdl;
sdl = (struct sockaddr_dl *)sdl0;
if (sdl->sdl_alen == ifp->if_addrlen)
mac = LLADDR(sdl);
}
}
if (tlladdr && mac) {
int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1);
/* round up to 8-byte alignment */
optlen = (optlen + 7) & ~7;
m->m_pkthdr.len += optlen;
m->m_len += optlen;
icmp6len += optlen;
bzero((caddr_t)nd_opt, optlen);
nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
nd_opt->nd_opt_len = optlen >> 3;
bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
} else
flags &= ~ND_NA_FLAG_OVERRIDE;
ip6->ip6_plen = htons((u_short)icmp6len);
nd_na->nd_na_flags_reserved = flags;
nd_na->nd_na_cksum = 0;
nd_na->nd_na_cksum =
in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len);
ip6_output(m, NULL, &ro, 0, &im6o, NULL, NULL);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert);
V_icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++;
if (ro.ro_rt) { /* we don't cache this route. */
RTFREE(ro.ro_rt);
}
return;
bad:
if (ro.ro_rt) {
RTFREE(ro.ro_rt);
}
m_freem(m);
return;
}
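/*
 * Typical call site (a sketch; see nd6_ns_input() for the exact
 * conditions): a solicited answer to a neighbor solicitation looks
 * roughly like
 *
 *	nd6_na_output(ifp, &saddr6, &taddr6,
 *	    ND_NA_FLAG_SOLICITED | ND_NA_FLAG_OVERRIDE |
 *	    (V_ip6_forwarding ? ND_NA_FLAG_ROUTER : 0),
 *	    tlladdr, NULL);
 *
 * while a DAD answer passes the unspecified address as daddr6_0,
 * which the code above rewrites to the all-nodes group and strips
 * of ND_NA_FLAG_SOLICITED.
 */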
caddr_t
nd6_ifptomac(struct ifnet *ifp)
{
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_FDDI:
case IFT_IEEE1394:
#ifdef IFT_L2VLAN
case IFT_L2VLAN:
#endif
#ifdef IFT_IEEE80211
case IFT_IEEE80211:
#endif
#ifdef IFT_CARP
case IFT_CARP:
#endif
case IFT_BRIDGE:
case IFT_ISO88025:
return IF_LLADDR(ifp);
default:
return NULL;
}
}
TAILQ_HEAD(dadq_head, dadq);
struct dadq {
TAILQ_ENTRY(dadq) dad_list;
struct ifaddr *dad_ifa;
int dad_count; /* max NS to send */
int dad_ns_tcount; /* # of trials to send NS */
int dad_ns_ocount; /* NS sent so far */
int dad_ns_icount; /* # of NS received during DAD */
int dad_na_icount; /* # of NA received during DAD */
struct callout dad_timer_ch;
};
static struct dadq_head dadq;
static int dad_init = 0;
static struct dadq *
nd6_dad_find(struct ifaddr *ifa)
{
+ INIT_VNET_INET6(curvnet);
struct dadq *dp;
for (dp = V_dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) {
if (dp->dad_ifa == ifa)
return dp;
}
return NULL;
}
static void
nd6_dad_starttimer(struct dadq *dp, int ticks)
{
callout_reset(&dp->dad_timer_ch, ticks,
(void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa);
}
static void
nd6_dad_stoptimer(struct dadq *dp)
{
callout_stop(&dp->dad_timer_ch);
}
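/*
 * Note on the tick conversion used by the callers below:
 * ND_IFINFO(ifp)->retrans is kept in milliseconds (RFC 2461's
 * RETRANS_TIMER defaults to 1000 ms), so the NS probe interval of
 * retrans * hz / 1000 ticks comes out to hz ticks - one second -
 * with the default setting.
 */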
/*
* Start Duplicate Address Detection (DAD) for specified interface address.
*/
void
nd6_dad_start(struct ifaddr *ifa, int delay)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct dadq *dp;
char ip6buf[INET6_ADDRSTRLEN];
if (!V_dad_init) {
TAILQ_INIT(&V_dadq);
V_dad_init++;
}
/*
* If we don't need DAD, don't do it.
* There are several cases:
* - DAD is disabled (ip6_dad_count == 0)
* - the interface address is anycast
*/
if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) {
log(LOG_DEBUG,
"nd6_dad_start: called with non-tentative address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
return;
}
if (ia->ia6_flags & IN6_IFF_ANYCAST) {
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
return;
}
if (!V_ip6_dad_count) {
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
return;
}
if (ifa->ifa_ifp == NULL)
panic("nd6_dad_start: ifa->ifa_ifp == NULL");
if (!(ifa->ifa_ifp->if_flags & IFF_UP)) {
return;
}
if (nd6_dad_find(ifa) != NULL) {
/* DAD already in progress */
return;
}
dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT);
if (dp == NULL) {
log(LOG_ERR, "nd6_dad_start: memory allocation failed for "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
return;
}
bzero(dp, sizeof(*dp));
callout_init(&dp->dad_timer_ch, 0);
TAILQ_INSERT_TAIL(&V_dadq, (struct dadq *)dp, dad_list);
nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
/*
* Send NS packet for DAD, ip6_dad_count times.
* Note that we must delay the first transmission, if this is the
* first packet to be sent from the interface after interface
* (re)initialization.
*/
dp->dad_ifa = ifa;
IFAREF(ifa); /* just for safety */
dp->dad_count = V_ip6_dad_count;
dp->dad_ns_icount = dp->dad_na_icount = 0;
dp->dad_ns_ocount = dp->dad_ns_tcount = 0;
if (delay == 0) {
nd6_dad_ns_output(dp, ifa);
nd6_dad_starttimer(dp,
(long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
} else {
nd6_dad_starttimer(dp, delay);
}
}
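/*
 * A minimal usage sketch (assuming the usual address-configuration
 * path): when a tentative address is configured, the caller kicks
 * off DAD with an optional delay, e.g.
 *
 *	if (in6if_do_dad(ifp))
 *		nd6_dad_start((struct ifaddr *)ia, delay);
 */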
/*
* Terminate DAD unconditionally; used for address removals.
*/
void
nd6_dad_stop(struct ifaddr *ifa)
{
+ INIT_VNET_INET6(curvnet);
struct dadq *dp;
if (!V_dad_init)
return;
dp = nd6_dad_find(ifa);
if (!dp) {
/* DAD wasn't started yet */
return;
}
nd6_dad_stoptimer(dp);
TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list);
free(dp, M_IP6NDP);
dp = NULL;
IFAFREE(ifa);
}
static void
nd6_dad_timer(struct ifaddr *ifa)
{
+ CURVNET_SET(ifa->ifa_ifp->if_vnet); /* dp is not looked up yet */
+ INIT_VNET_INET6(curvnet);
int s;
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct dadq *dp;
char ip6buf[INET6_ADDRSTRLEN];
s = splnet(); /* XXX */
/* Sanity check */
if (ia == NULL) {
log(LOG_ERR, "nd6_dad_timer: called with null parameter\n");
goto done;
}
dp = nd6_dad_find(ifa);
if (dp == NULL) {
log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n");
goto done;
}
if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
log(LOG_ERR, "nd6_dad_timer: called with duplicated address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
goto done;
}
if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
log(LOG_ERR, "nd6_dad_timer: called with non-tentative address "
"%s(%s)\n",
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
goto done;
}
/* timed out with the IFF_{RUNNING,UP} check */
if (dp->dad_ns_tcount > V_dad_maxtry) {
nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n",
if_name(ifa->ifa_ifp)));
TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list);
free(dp, M_IP6NDP);
dp = NULL;
IFAFREE(ifa);
goto done;
}
/* Need more checks? */
if (dp->dad_ns_ocount < dp->dad_count) {
/*
* We have more NS to go. Send NS packet for DAD.
*/
nd6_dad_ns_output(dp, ifa);
nd6_dad_starttimer(dp,
(long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
} else {
/*
* We have transmitted sufficient number of DAD packets.
* See what we've got.
*/
int duplicate;
duplicate = 0;
if (dp->dad_na_icount) {
/*
* the real check is in nd6_dad_na_input(),
* but count it here just in case
*/
duplicate++;
}
if (dp->dad_ns_icount) {
/* We've seen an NS, which means DAD has failed. */
duplicate++;
}
if (duplicate) {
/* (*dp) will be freed in nd6_dad_duplicated() */
dp = NULL;
nd6_dad_duplicated(ifa);
} else {
/*
* We are done with DAD. No NA came, no NS came.
* No duplicate address found.
*/
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
nd6log((LOG_DEBUG,
"%s: DAD complete for %s - no duplicates found\n",
if_name(ifa->ifa_ifp),
ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list);
free(dp, M_IP6NDP);
dp = NULL;
IFAFREE(ifa);
}
}
done:
splx(s);
+ CURVNET_RESTORE();
}
void
nd6_dad_duplicated(struct ifaddr *ifa)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct ifnet *ifp;
struct dadq *dp;
char ip6buf[INET6_ADDRSTRLEN];
dp = nd6_dad_find(ifa);
if (dp == NULL) {
log(LOG_ERR, "nd6_dad_duplicated: DAD structure not found\n");
return;
}
log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: "
"NS in/out=%d/%d, NA in=%d\n",
if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount);
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
ia->ia6_flags |= IN6_IFF_DUPLICATED;
/* We are done with DAD, with duplicate address found. (failure) */
nd6_dad_stoptimer(dp);
ifp = ifa->ifa_ifp;
log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n",
if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr));
log(LOG_ERR, "%s: manual intervention required\n",
if_name(ifp));
/*
* If the address is a link-local address formed from an interface
* identifier based on the hardware address which is supposed to be
* uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP
* operation on the interface SHOULD be disabled.
* [rfc2462bis-03 Section 5.4.5]
*/
if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) {
struct in6_addr in6;
/*
* To avoid over-reaction, we only apply this logic when we are
* very sure that hardware addresses are supposed to be unique.
*/
switch (ifp->if_type) {
case IFT_ETHER:
case IFT_FDDI:
case IFT_ATM:
case IFT_IEEE1394:
#ifdef IFT_IEEE80211
case IFT_IEEE80211:
#endif
in6 = ia->ia_addr.sin6_addr;
if (in6_get_hw_ifid(ifp, &in6) == 0 &&
IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) {
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "%s: possible hardware address "
"duplication detected, disable IPv6\n",
if_name(ifp));
}
break;
}
}
TAILQ_REMOVE(&V_dadq, (struct dadq *)dp, dad_list);
free(dp, M_IP6NDP);
dp = NULL;
IFAFREE(ifa);
}
static void
nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa)
{
struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
struct ifnet *ifp = ifa->ifa_ifp;
dp->dad_ns_tcount++;
if ((ifp->if_flags & IFF_UP) == 0) {
return;
}
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
return;
}
dp->dad_ns_ocount++;
nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1);
}
static void
nd6_dad_ns_input(struct ifaddr *ifa)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia;
struct ifnet *ifp;
const struct in6_addr *taddr6;
struct dadq *dp;
int duplicate;
if (ifa == NULL)
panic("ifa == NULL in nd6_dad_ns_input");
ia = (struct in6_ifaddr *)ifa;
ifp = ifa->ifa_ifp;
taddr6 = &ia->ia_addr.sin6_addr;
duplicate = 0;
dp = nd6_dad_find(ifa);
/* Quickhack - completely ignore DAD NS packets */
if (V_dad_ignore_ns) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log((LOG_INFO,
"nd6_dad_ns_input: ignoring DAD NS packet for "
"address %s(%s)\n", ip6_sprintf(ip6buf, taddr6),
if_name(ifa->ifa_ifp)));
return;
}
/*
* if I'm yet to start DAD, someone else started using this address
* first. I have a duplicate and you win.
*/
if (dp == NULL || dp->dad_ns_ocount == 0)
duplicate++;
/* XXX more checks for loopback situation - see nd6_dad_timer too */
if (duplicate) {
dp = NULL; /* will be freed in nd6_dad_duplicated() */
nd6_dad_duplicated(ifa);
} else {
/*
* not sure if I got a duplicate;
* increment the NS count and see what happens.
*/
if (dp)
dp->dad_ns_icount++;
}
}
static void
nd6_dad_na_input(struct ifaddr *ifa)
{
struct dadq *dp;
if (ifa == NULL)
panic("ifa == NULL in nd6_dad_na_input");
dp = nd6_dad_find(ifa);
if (dp)
dp->dad_na_icount++;
/* mark the address as duplicated. */
nd6_dad_duplicated(ifa);
}
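/*
 * Summary of the DAD flow implemented above (a sketch):
 *
 *	nd6_dad_start(ifa, delay)
 *	  -> nd6_dad_ns_output(), up to dad_count times, paced by
 *	     nd6_dad_timer()
 *	  -> an NS or NA heard for the tentative address
 *	     (nd6_dad_ns_input()/nd6_dad_na_input())
 *	     => nd6_dad_duplicated()
 *	  -> otherwise IN6_IFF_TENTATIVE is cleared and the address
 *	     becomes usable.
 */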
Index: head/sys/netinet6/nd6_rtr.c
===================================================================
--- head/sys/netinet6/nd6_rtr.c (revision 183549)
+++ head/sys/netinet6/nd6_rtr.c (revision 183550)
@@ -1,2100 +1,2118 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/radix.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/icmp6.h>
#include <netinet6/scope6_var.h>
#define SDL(s) ((struct sockaddr_dl *)s)
static int rtpref(struct nd_defrouter *);
static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *);
static int prelist_update __P((struct nd_prefixctl *, struct nd_defrouter *,
struct mbuf *, int));
static struct in6_ifaddr *in6_ifadd(struct nd_prefixctl *, int);
static struct nd_pfxrouter *pfxrtr_lookup __P((struct nd_prefix *,
struct nd_defrouter *));
static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *);
static void pfxrtr_del(struct nd_pfxrouter *);
static struct nd_pfxrouter *find_pfxlist_reachable_router
(struct nd_prefix *);
static void defrouter_delreq(struct nd_defrouter *);
static void nd6_rtmsg(int, struct rtentry *);
static int in6_init_prefix_ltimes(struct nd_prefix *);
static void in6_init_address_ltimes __P((struct nd_prefix *,
struct in6_addrlifetime *));
static int rt6_deleteroute(struct radix_node *, void *);
extern int nd6_recalc_reachtm_interval;
static struct ifnet *nd6_defifp;
int nd6_defifindex;
int ip6_use_tempaddr = 0;
int ip6_desync_factor;
u_int32_t ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME;
u_int32_t ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME;
/*
* shorter lifetimes for debugging purposes.
int ip6_temp_preferred_lifetime = 800;
static int ip6_temp_valid_lifetime = 1800;
*/
int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE;
/* RTPREF_MEDIUM has to be 0! */
#define RTPREF_HIGH 1
#define RTPREF_MEDIUM 0
#define RTPREF_LOW (-1)
#define RTPREF_RESERVED (-2)
#define RTPREF_INVALID (-3) /* internal */
/*
* Receive Router Solicitation Message - just for routers.
* Router solicitation/advertisement is mostly managed by a userland
* program (rtadvd), so here we have no function like nd6_ra_output().
*
* Based on RFC 2461
*/
void
nd6_rs_input(struct mbuf *m, int off, int icmp6len)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_router_solicit *nd_rs;
struct in6_addr saddr6 = ip6->ip6_src;
char *lladdr = NULL;
int lladdrlen = 0;
union nd_opts ndopts;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/* If I'm not a router, ignore it. */
if (V_ip6_accept_rtadv != 0 || V_ip6_forwarding != 1)
goto freeit;
/* Sanity checks */
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_rs_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
/*
* Don't update the neighbor cache if src = ::.
* This indicates that the src has no IP address assigned yet.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
goto freeit;
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_rs, struct nd_router_solicit *, m, off, icmp6len);
if (nd_rs == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
icmp6len -= sizeof(*nd_rs);
nd6_option_init(nd_rs + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_rs_input: invalid ND option, ignored\n"));
/* nd6_options() has already incremented the stats */
goto freeit;
}
if (ndopts.nd_opts_src_lladdr) {
lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO,
"nd6_rs_input: lladdrlen mismatch for %s "
"(if %d, RS packet %d)\n",
ip6_sprintf(ip6bufs, &saddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0);
freeit:
m_freem(m);
return;
bad:
V_icmp6stat.icp6s_badrs++;
m_freem(m);
}
/*
* Receive Router Advertisement Message.
*
* Based on RFC 2461
* TODO: on-link bit on prefix information
* TODO: ND_RA_FLAG_{OTHER,MANAGED} processing
*/
void
nd6_ra_input(struct mbuf *m, int off, int icmp6len)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = m->m_pkthdr.rcvif;
struct nd_ifinfo *ndi = ND_IFINFO(ifp);
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct nd_router_advert *nd_ra;
struct in6_addr saddr6 = ip6->ip6_src;
int mcast = 0;
union nd_opts ndopts;
struct nd_defrouter *dr;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
/*
* We accept RAs only when
* the system-wide variable allows the acceptance, and
* the per-interface variable allows RAs on the receiving interface.
*/
if (V_ip6_accept_rtadv == 0)
goto freeit;
if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV))
goto freeit;
if (ip6->ip6_hlim != 255) {
nd6log((LOG_ERR,
"nd6_ra_input: invalid hlim (%d) from %s to %s on %s\n",
ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
goto bad;
}
if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) {
nd6log((LOG_ERR,
"nd6_ra_input: src %s is not link-local\n",
ip6_sprintf(ip6bufs, &saddr6)));
goto bad;
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, icmp6len,);
nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(nd_ra, struct nd_router_advert *, m, off, icmp6len);
if (nd_ra == NULL) {
V_icmp6stat.icp6s_tooshort++;
return;
}
#endif
icmp6len -= sizeof(*nd_ra);
nd6_option_init(nd_ra + 1, icmp6len, &ndopts);
if (nd6_options(&ndopts) < 0) {
nd6log((LOG_INFO,
"nd6_ra_input: invalid ND option, ignored\n"));
/* nd6_options() has already incremented the stats */
goto freeit;
}
{
struct nd_defrouter dr0;
u_int32_t advreachable = nd_ra->nd_ra_reachable;
/* remember if this is a multicasted advertisement */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
mcast = 1;
bzero(&dr0, sizeof(dr0));
dr0.rtaddr = saddr6;
dr0.flags = nd_ra->nd_ra_flags_reserved;
dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime);
dr0.expire = time_second + dr0.rtlifetime;
dr0.ifp = ifp;
/* unspecified or not? (RFC 2461 6.3.4) */
if (advreachable) {
advreachable = ntohl(advreachable);
if (advreachable <= MAX_REACHABLE_TIME &&
ndi->basereachable != advreachable) {
ndi->basereachable = advreachable;
ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */
}
}
if (nd_ra->nd_ra_retransmit)
ndi->retrans = ntohl(nd_ra->nd_ra_retransmit);
if (nd_ra->nd_ra_curhoplimit)
ndi->chlim = nd_ra->nd_ra_curhoplimit;
dr = defrtrlist_update(&dr0);
}
/*
* prefix
*/
if (ndopts.nd_opts_pi) {
struct nd_opt_hdr *pt;
struct nd_opt_prefix_info *pi = NULL;
struct nd_prefixctl pr;
for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi;
pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end;
pt = (struct nd_opt_hdr *)((caddr_t)pt +
(pt->nd_opt_len << 3))) {
if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION)
continue;
pi = (struct nd_opt_prefix_info *)pt;
if (pi->nd_opt_pi_len != 4) {
nd6log((LOG_INFO,
"nd6_ra_input: invalid option "
"len %d for prefix information option, "
"ignored\n", pi->nd_opt_pi_len));
continue;
}
if (128 < pi->nd_opt_pi_prefix_len) {
nd6log((LOG_INFO,
"nd6_ra_input: invalid prefix "
"len %d for prefix information option, "
"ignored\n", pi->nd_opt_pi_prefix_len));
continue;
}
if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix)
|| IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) {
nd6log((LOG_INFO,
"nd6_ra_input: invalid prefix "
"%s, ignored\n",
ip6_sprintf(ip6bufs,
&pi->nd_opt_pi_prefix)));
continue;
}
bzero(&pr, sizeof(pr));
pr.ndpr_prefix.sin6_family = AF_INET6;
pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix);
pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix;
pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif;
pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved &
ND_OPT_PI_FLAG_ONLINK) ? 1 : 0;
pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved &
ND_OPT_PI_FLAG_AUTO) ? 1 : 0;
pr.ndpr_plen = pi->nd_opt_pi_prefix_len;
pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time);
pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time);
(void)prelist_update(&pr, dr, m, mcast);
}
}
/*
* MTU
*/
if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) {
u_long mtu;
u_long maxmtu;
mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu);
/* lower bound */
if (mtu < IPV6_MMTU) {
nd6log((LOG_INFO, "nd6_ra_input: bogus mtu option "
"mtu=%lu sent from %s, ignoring\n",
mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src)));
goto skip;
}
/* upper bound */
maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu)
? ndi->maxmtu : ifp->if_mtu;
if (mtu <= maxmtu) {
int change = (ndi->linkmtu != mtu);
ndi->linkmtu = mtu;
if (change) /* in6_maxmtu may change */
in6_setmaxmtu();
} else {
nd6log((LOG_INFO, "nd6_ra_input: bogus mtu "
"mtu=%lu sent from %s; "
"exceeds maxmtu %lu, ignoring\n",
mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu));
}
}
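/*
 * For example: an RA advertising an MTU of 9000 on an interface
 * with if_mtu 1500 fails the upper-bound check and is logged and
 * ignored, while an advertised MTU of 1400 lowers ndi->linkmtu to
 * 1400 and triggers in6_setmaxmtu().
 */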
skip:
/*
* Source link layer address
*/
{
char *lladdr = NULL;
int lladdrlen = 0;
if (ndopts.nd_opts_src_lladdr) {
lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
}
if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
nd6log((LOG_INFO,
"nd6_ra_input: lladdrlen mismatch for %s "
"(if %d, RA packet %d)\n", ip6_sprintf(ip6bufs, &saddr6),
ifp->if_addrlen, lladdrlen - 2));
goto bad;
}
nd6_cache_lladdr(ifp, &saddr6, lladdr,
lladdrlen, ND_ROUTER_ADVERT, 0);
/*
* Installing a link-layer address might change the state of the
* router's neighbor cache, which might also affect our on-link
* detection of advertised prefixes.
*/
pfxlist_onlink_check();
}
freeit:
m_freem(m);
return;
bad:
V_icmp6stat.icp6s_badra++;
m_freem(m);
}
/*
* default router list processing subroutines
*/
/* Tell user processes watching the routing socket about the change. */
static void
nd6_rtmsg(int cmd, struct rtentry *rt)
{
struct rt_addrinfo info;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
if (rt->rt_ifp) {
info.rti_info[RTAX_IFP] =
TAILQ_FIRST(&rt->rt_ifp->if_addrlist)->ifa_addr;
info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
}
rt_missmsg(cmd, &info, rt->rt_flags, 0);
}
void
defrouter_addreq(struct nd_defrouter *new)
{
struct sockaddr_in6 def, mask, gate;
struct rtentry *newrt = NULL;
int s;
int error;
bzero(&def, sizeof(def));
bzero(&mask, sizeof(mask));
bzero(&gate, sizeof(gate));
def.sin6_len = mask.sin6_len = gate.sin6_len =
sizeof(struct sockaddr_in6);
def.sin6_family = gate.sin6_family = AF_INET6;
gate.sin6_addr = new->rtaddr;
s = splnet();
error = rtrequest(RTM_ADD, (struct sockaddr *)&def,
(struct sockaddr *)&gate, (struct sockaddr *)&mask,
RTF_GATEWAY, &newrt);
if (newrt) {
RT_LOCK(newrt);
nd6_rtmsg(RTM_ADD, newrt); /* tell user process */
RT_REMREF(newrt);
RT_UNLOCK(newrt);
}
if (error == 0)
new->installed = 1;
splx(s);
return;
}
struct nd_defrouter *
defrouter_lookup(struct in6_addr *addr, struct ifnet *ifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
struct nd_defrouter *dr;
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr;
dr = TAILQ_NEXT(dr, dr_entry)) {
if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr))
return (dr);
}
return (NULL); /* search failed */
}
/*
* Remove the default route for a given router.
* This is just a subroutine for defrouter_select() and should
* not be called from anywhere else.
*/
static void
defrouter_delreq(struct nd_defrouter *dr)
{
struct sockaddr_in6 def, mask, gate;
struct rtentry *oldrt = NULL;
bzero(&def, sizeof(def));
bzero(&mask, sizeof(mask));
bzero(&gate, sizeof(gate));
def.sin6_len = mask.sin6_len = gate.sin6_len =
sizeof(struct sockaddr_in6);
def.sin6_family = gate.sin6_family = AF_INET6;
gate.sin6_addr = dr->rtaddr;
rtrequest(RTM_DELETE, (struct sockaddr *)&def,
(struct sockaddr *)&gate,
(struct sockaddr *)&mask, RTF_GATEWAY, &oldrt);
if (oldrt) {
nd6_rtmsg(RTM_DELETE, oldrt);
RTFREE(oldrt);
}
dr->installed = 0;
}
/*
* remove all default routes from the default router list
*/
void
defrouter_reset(void)
{
+ INIT_VNET_INET6(curvnet);
struct nd_defrouter *dr;
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr;
dr = TAILQ_NEXT(dr, dr_entry))
defrouter_delreq(dr);
/*
* XXX should we also nuke any default routers in the kernel, by
* going through them by rtalloc1()?
*/
}
void
defrtrlist_del(struct nd_defrouter *dr)
{
+ INIT_VNET_INET6(curvnet);
struct nd_defrouter *deldr = NULL;
struct nd_prefix *pr;
/*
* Flush all the routing table entries that use the router
* as a next hop.
*/
if (!V_ip6_forwarding && V_ip6_accept_rtadv) /* XXX: better condition? */
rt6_flush(&dr->rtaddr, dr->ifp);
if (dr->installed) {
deldr = dr;
defrouter_delreq(dr);
}
TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry);
/*
* Also delete all the pointers to the router in each prefix lists.
*/
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
struct nd_pfxrouter *pfxrtr;
if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL)
pfxrtr_del(pfxrtr);
}
pfxlist_onlink_check();
/*
* If the router is the primary one, choose a new one.
* Note that defrouter_select() will remove the current gateway
* from the routing table.
*/
if (deldr)
defrouter_select();
free(dr, M_IP6NDP);
}
/*
* Default Router Selection according to Section 6.3.6 of RFC 2461 and
* draft-ietf-ipngwg-router-selection:
* 1) Routers that are reachable or probably reachable should be preferred.
* If we have more than one (probably) reachable router, prefer ones
* with the highest router preference.
* 2) When no routers on the list are known to be reachable or
* probably reachable, routers SHOULD be selected in a round-robin
* fashion, regardless of router preference values.
* 3) If the Default Router List is empty, assume that all
* destinations are on-link.
*
* We assume nd_defrouter is sorted by router preference value.
* Since the code below covers both with and without router preference cases,
* we do not need to classify the cases by ifdef.
*
* At this moment, we do not try to install more than one default router,
* even when multipath routing is available, because we're not sure about
* the benefits for stub hosts compared to the risk of making the code
* complicated and the possibility of introducing bugs.
*/
void
defrouter_select(void)
{
+ INIT_VNET_INET6(curvnet);
int s = splnet();
struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL;
struct rtentry *rt = NULL;
struct llinfo_nd6 *ln = NULL;
/*
* This function should be called only when acting as an autoconfigured
* host. Although the remaining part of this function is not effective
* if the node is not an autoconfigured host, we explicitly exclude
* such cases here for safety.
*/
if (V_ip6_forwarding || !V_ip6_accept_rtadv) {
nd6log((LOG_WARNING,
"defrouter_select: called unexpectedly (forwarding=%d, "
"accept_rtadv=%d)\n", V_ip6_forwarding, V_ip6_accept_rtadv));
splx(s);
return;
}
/*
* Let's handle easy case (3) first:
* If default router list is empty, there's nothing to be done.
*/
if (!TAILQ_FIRST(&V_nd_defrouter)) {
splx(s);
return;
}
/*
* Search for a (probably) reachable router from the list.
* We just pick up the first reachable one (if any), assuming that
* the list follows the ordering rule described in defrtrlist_update().
*/
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr;
dr = TAILQ_NEXT(dr, dr_entry)) {
if (selected_dr == NULL &&
(rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp)) &&
(ln = (struct llinfo_nd6 *)rt->rt_llinfo) &&
ND6_IS_LLINFO_PROBREACH(ln)) {
selected_dr = dr;
}
if (dr->installed && installed_dr == NULL)
installed_dr = dr;
else if (dr->installed && installed_dr) {
/* this should not happen. warn for diagnosis. */
log(LOG_ERR, "defrouter_select: more than one router"
" is installed\n");
}
}
/*
* If none of the default routers was found to be reachable,
* round-robin the list regardless of preference.
* Otherwise, if we have an installed router, check if the selected
* (reachable) router should really be preferred to the installed one.
* We only prefer the new router when the old one is not reachable
* or when the new one has a strictly higher preference value.
*/
if (selected_dr == NULL) {
if (installed_dr == NULL || !TAILQ_NEXT(installed_dr, dr_entry))
selected_dr = TAILQ_FIRST(&V_nd_defrouter);
else
selected_dr = TAILQ_NEXT(installed_dr, dr_entry);
} else if (installed_dr &&
(rt = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp)) &&
(ln = (struct llinfo_nd6 *)rt->rt_llinfo) &&
ND6_IS_LLINFO_PROBREACH(ln) &&
rtpref(selected_dr) <= rtpref(installed_dr)) {
selected_dr = installed_dr;
}
/*
* If the selected router is different from the installed one,
* remove the installed router and install the selected one.
* Note that the selected router is never NULL here.
*/
if (installed_dr != selected_dr) {
if (installed_dr)
defrouter_delreq(installed_dr);
defrouter_addreq(selected_dr);
}
splx(s);
return;
}
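/*
 * Round-robin example for the unreachable case above (a sketch):
 * with the list A -> B -> C, B installed and none of them probably
 * reachable, TAILQ_NEXT(installed_dr) selects C; if C (the tail) is
 * the installed router on the next pass, selection wraps back to A.
 */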
/*
* for default router selection;
* regards the router-preference field as a 2-bit signed integer
*/
static int
rtpref(struct nd_defrouter *dr)
{
switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) {
case ND_RA_FLAG_RTPREF_HIGH:
return (RTPREF_HIGH);
case ND_RA_FLAG_RTPREF_MEDIUM:
case ND_RA_FLAG_RTPREF_RSV:
return (RTPREF_MEDIUM);
case ND_RA_FLAG_RTPREF_LOW:
return (RTPREF_LOW);
default:
/*
* This case should never happen. If it did, it would mean a
* serious kernel-internal bug. We thus always bark here.
* Or, can we even panic?
*/
log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags);
return (RTPREF_INVALID);
}
/* NOTREACHED */
}
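/*
 * In other words, the 2-bit preference field decodes as a signed
 * integer: binary 01 (high) -> 1, 00 (medium) -> 0, 11 (low) -> -1,
 * and the reserved pattern 10 is treated as medium, per RFC 4191.
 */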
static struct nd_defrouter *
defrtrlist_update(struct nd_defrouter *new)
{
+ INIT_VNET_INET6(curvnet);
struct nd_defrouter *dr, *n;
int s = splnet();
if ((dr = defrouter_lookup(&new->rtaddr, new->ifp)) != NULL) {
/* entry exists */
if (new->rtlifetime == 0) {
defrtrlist_del(dr);
dr = NULL;
} else {
int oldpref = rtpref(dr);
/* override */
dr->flags = new->flags; /* xxx flag check */
dr->rtlifetime = new->rtlifetime;
dr->expire = new->expire;
/*
* If the preference does not change, there's no need
* to sort the entries.
*/
if (rtpref(new) == oldpref) {
splx(s);
return (dr);
}
/*
* the preferred router may have changed, so relocate
* this router.
* XXX: calling TAILQ_REMOVE directly is poor style.
* However, since defrtrlist_del() has many side
* effects, we intentionally do so here.
* defrouter_select() below will handle routing
* changes later.
*/
TAILQ_REMOVE(&V_nd_defrouter, dr, dr_entry);
n = dr;
goto insert;
}
splx(s);
return (dr);
}
/* entry does not exist */
if (new->rtlifetime == 0) {
splx(s);
return (NULL);
}
n = (struct nd_defrouter *)malloc(sizeof(*n), M_IP6NDP, M_NOWAIT);
if (n == NULL) {
splx(s);
return (NULL);
}
bzero(n, sizeof(*n));
*n = *new;
insert:
/*
* Insert the new router in the Default Router List;
* The Default Router List should be kept in descending order
* of router preference. Routers with the same preference are
* sorted in arrival order.
*/
/* insert at the end of the group */
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr;
dr = TAILQ_NEXT(dr, dr_entry)) {
if (rtpref(n) > rtpref(dr))
break;
}
if (dr)
TAILQ_INSERT_BEFORE(dr, n, dr_entry);
else
TAILQ_INSERT_TAIL(&V_nd_defrouter, n, dr_entry);
defrouter_select();
splx(s);
return (n);
}
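/*
 * Ordering example (a sketch): if routers arrive advertising
 * preferences high, medium, high, the insertion loop above yields
 * high1 -> high2 -> medium: descending preference, with ties kept
 * in arrival order.
 */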
static struct nd_pfxrouter *
pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr)
{
struct nd_pfxrouter *search;
for (search = pr->ndpr_advrtrs.lh_first; search; search = search->pfr_next) {
if (search->router == dr)
break;
}
return (search);
}
static void
pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr)
{
struct nd_pfxrouter *new;
new = (struct nd_pfxrouter *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT);
if (new == NULL)
return;
bzero(new, sizeof(*new));
new->router = dr;
LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry);
pfxlist_onlink_check();
}
static void
pfxrtr_del(struct nd_pfxrouter *pfr)
{
LIST_REMOVE(pfr, pfr_entry);
free(pfr, M_IP6NDP);
}
struct nd_prefix *
nd6_prefix_lookup(struct nd_prefixctl *key)
{
+ INIT_VNET_INET6(curvnet);
struct nd_prefix *search;
for (search = V_nd_prefix.lh_first;
search; search = search->ndpr_next) {
if (key->ndpr_ifp == search->ndpr_ifp &&
key->ndpr_plen == search->ndpr_plen &&
in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr,
&search->ndpr_prefix.sin6_addr, key->ndpr_plen)) {
break;
}
}
return (search);
}
int
nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr,
struct nd_prefix **newp)
{
+ INIT_VNET_INET6(curvnet);
struct nd_prefix *new = NULL;
int error = 0;
int i, s;
char ip6buf[INET6_ADDRSTRLEN];
new = (struct nd_prefix *)malloc(sizeof(*new), M_IP6NDP, M_NOWAIT);
if (new == NULL)
return(ENOMEM);
bzero(new, sizeof(*new));
new->ndpr_ifp = pr->ndpr_ifp;
new->ndpr_prefix = pr->ndpr_prefix;
new->ndpr_plen = pr->ndpr_plen;
new->ndpr_vltime = pr->ndpr_vltime;
new->ndpr_pltime = pr->ndpr_pltime;
new->ndpr_flags = pr->ndpr_flags;
if ((error = in6_init_prefix_ltimes(new)) != 0) {
free(new, M_IP6NDP);
return(error);
}
new->ndpr_lastupdate = time_second;
if (newp != NULL)
*newp = new;
/* initialization */
LIST_INIT(&new->ndpr_advrtrs);
in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen);
/* make prefix in the canonical form */
for (i = 0; i < 4; i++)
new->ndpr_prefix.sin6_addr.s6_addr32[i] &=
new->ndpr_mask.s6_addr32[i];
s = splnet();
/* link ndpr_entry to nd_prefix list */
LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry);
splx(s);
/* ND_OPT_PI_FLAG_ONLINK processing */
if (new->ndpr_raf_onlink) {
int e;
if ((e = nd6_prefix_onlink(new)) != 0) {
nd6log((LOG_ERR, "nd6_prelist_add: failed to make "
"the prefix %s/%d on-link on %s (errno=%d)\n",
ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
/* proceed anyway. XXX: is it correct? */
}
}
if (dr)
pfxrtr_add(new, dr);
return 0;
}
void
prelist_remove(struct nd_prefix *pr)
{
+ INIT_VNET_INET6(curvnet);
struct nd_pfxrouter *pfr, *next;
int e, s;
char ip6buf[INET6_ADDRSTRLEN];
/* make sure to invalidate the prefix until it is really freed. */
pr->ndpr_vltime = 0;
pr->ndpr_pltime = 0;
/*
* Though these flags are now meaningless, we'd rather keep the value
* of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users
* when executing "ndp -p".
*/
if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 &&
(e = nd6_prefix_offlink(pr)) != 0) {
nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink "
"on %s, errno=%d\n",
ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
/* what should we do? */
}
if (pr->ndpr_refcnt > 0)
return; /* notice here? */
s = splnet();
/* unlink ndpr_entry from nd_prefix list */
LIST_REMOVE(pr, ndpr_entry);
/* free the list of routers that advertised the prefix */
for (pfr = pr->ndpr_advrtrs.lh_first; pfr; pfr = next) {
next = pfr->pfr_next;
free(pfr, M_IP6NDP);
}
splx(s);
free(pr, M_IP6NDP);
pfxlist_onlink_check();
}
/*
* dr - may be NULL
*/
static int
prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr,
struct mbuf *m, int mcast)
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL;
struct ifaddr *ifa;
struct ifnet *ifp = new->ndpr_ifp;
struct nd_prefix *pr;
int s = splnet();
int error = 0;
int newprefix = 0;
int auth;
struct in6_addrlifetime lt6_tmp;
char ip6buf[INET6_ADDRSTRLEN];
auth = 0;
if (m) {
/*
* Authenticity for an RA consists of authentication for
* both the IP header and the IP datagram, doesn't it?
*/
#if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM)
auth = ((m->m_flags & M_AUTHIPHDR) &&
(m->m_flags & M_AUTHIPDGM));
#endif
}
if ((pr = nd6_prefix_lookup(new)) != NULL) {
/*
* nd6_prefix_lookup() ensures that pr and new have the same
* prefix on a same interface.
*/
/*
* Update prefix information. Note that the on-link (L) bit
* and the autonomous (A) bit should NOT be changed from 1
* to 0.
*/
if (new->ndpr_raf_onlink == 1)
pr->ndpr_raf_onlink = 1;
if (new->ndpr_raf_auto == 1)
pr->ndpr_raf_auto = 1;
if (new->ndpr_raf_onlink) {
pr->ndpr_vltime = new->ndpr_vltime;
pr->ndpr_pltime = new->ndpr_pltime;
(void)in6_init_prefix_ltimes(pr); /* XXX error case? */
pr->ndpr_lastupdate = time_second;
}
if (new->ndpr_raf_onlink &&
(pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
int e;
if ((e = nd6_prefix_onlink(pr)) != 0) {
nd6log((LOG_ERR,
"prelist_update: failed to make "
"the prefix %s/%d on-link on %s "
"(errno=%d)\n",
ip6_sprintf(ip6buf,
&pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
/* proceed anyway. XXX: is it correct? */
}
}
if (dr && pfxrtr_lookup(pr, dr) == NULL)
pfxrtr_add(pr, dr);
} else {
struct nd_prefix *newpr = NULL;
newprefix = 1;
if (new->ndpr_vltime == 0)
goto end;
if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0)
goto end;
error = nd6_prelist_add(new, dr, &newpr);
if (error != 0 || newpr == NULL) {
nd6log((LOG_NOTICE, "prelist_update: "
"nd6_prelist_add failed for %s/%d on %s "
"errno=%d, returnpr=%p\n",
ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr),
new->ndpr_plen, if_name(new->ndpr_ifp),
error, newpr));
goto end; /* we should just give up in this case. */
}
/*
* XXX: from the ND point of view, we can ignore a prefix
* with the on-link bit being zero. However, we need a
* prefix structure for references from autoconfigured
* addresses. Thus, we explicitly make sure that the prefix
* itself expires now.
*/
if (newpr->ndpr_raf_onlink == 0) {
newpr->ndpr_vltime = 0;
newpr->ndpr_pltime = 0;
in6_init_prefix_ltimes(newpr);
}
pr = newpr;
}
/*
* Address autoconfiguration based on Section 5.5.3 of RFC 2462.
* Note that pr must be non-NULL at this point.
*/
/* 5.5.3 (a). Ignore the prefix without the A bit set. */
if (!new->ndpr_raf_auto)
goto end;
/*
* 5.5.3 (b). the link-local prefix should have been ignored in
* nd6_ra_input.
*/
/* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */
if (new->ndpr_pltime > new->ndpr_vltime) {
error = EINVAL; /* XXX: won't be used */
goto end;
}
/*
* 5.5.3 (d). If the prefix advertised is not equal to the prefix of
* an address configured by stateless autoconfiguration already in the
* list of addresses associated with the interface, and the Valid
* Lifetime is not 0, form an address. We first check if we have
* a matching prefix.
* Note: we apply a clarification in rfc2462bis-02 here. We only
* consider autoconfigured addresses while RFC2462 simply said
* "address".
*/
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
struct in6_ifaddr *ifa6;
u_int32_t remaininglifetime;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa6 = (struct in6_ifaddr *)ifa;
/*
* We only consider autoconfigured addresses as per rfc2462bis.
*/
if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF))
continue;
/*
* Spec is not clear here, but I believe we should concentrate
* on unicast (i.e. not anycast) addresses.
* XXX: other ia6_flags? detached or duplicated?
*/
if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0)
continue;
/*
* Ignore the address if it is not associated with a prefix
* or is associated with a prefix that is different from this
* one. (pr is never NULL here)
*/
if (ifa6->ia6_ndpr != pr)
continue;
if (ia6_match == NULL) /* remember the first one */
ia6_match = ifa6;
/*
* An already autoconfigured address matched. Now that we
* are sure there is at least one matched address, we can
* proceed to 5.5.3. (e): update the lifetimes according to the
* "two hours" rule and the privacy extension.
* We apply some clarifications in rfc2462bis:
* - use remaininglifetime instead of storedlifetime as a
* variable name
* - remove the dead code in the "two-hour" rule
*/
#define TWOHOUR (120*60)
lt6_tmp = ifa6->ia6_lifetime;
if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME)
remaininglifetime = ND6_INFINITE_LIFETIME;
else if (time_second - ifa6->ia6_updatetime >
lt6_tmp.ia6t_vltime) {
/*
* The case of "invalid" address. We should usually
* not see this case.
*/
remaininglifetime = 0;
} else
remaininglifetime = lt6_tmp.ia6t_vltime -
(time_second - ifa6->ia6_updatetime);
/* when not updating, keep the current stored lifetime. */
lt6_tmp.ia6t_vltime = remaininglifetime;
if (TWOHOUR < new->ndpr_vltime ||
remaininglifetime < new->ndpr_vltime) {
lt6_tmp.ia6t_vltime = new->ndpr_vltime;
} else if (remaininglifetime <= TWOHOUR) {
if (auth) {
lt6_tmp.ia6t_vltime = new->ndpr_vltime;
}
} else {
/*
* new->ndpr_vltime <= TWOHOUR &&
* TWOHOUR < remaininglifetime
*/
lt6_tmp.ia6t_vltime = TWOHOUR;
}
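/*
 * Two-hour rule example: with 9000 s of valid lifetime remaining
 * and an unauthenticated RA advertising vltime = 600 s, neither of
 * the first two branches matches, so the lifetime is clamped to
 * TWOHOUR (7200 s) rather than cut to 600 s; an advertised vltime
 * of 86400 s would instead be accepted by the first branch.
 */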
/* The 2 hour rule is not imposed for preferred lifetime. */
lt6_tmp.ia6t_pltime = new->ndpr_pltime;
in6_init_address_ltimes(pr, &lt6_tmp);
/*
* We need to treat lifetimes for temporary addresses
* differently, according to
* draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1);
* we only update the lifetimes when they are in the maximum
* intervals.
*/
if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
u_int32_t maxvltime, maxpltime;
if (V_ip6_temp_valid_lifetime >
(u_int32_t)((time_second - ifa6->ia6_createtime) +
V_ip6_desync_factor)) {
maxvltime = V_ip6_temp_valid_lifetime -
(time_second - ifa6->ia6_createtime) -
V_ip6_desync_factor;
} else
maxvltime = 0;
if (V_ip6_temp_preferred_lifetime >
(u_int32_t)((time_second - ifa6->ia6_createtime) +
V_ip6_desync_factor)) {
maxpltime = V_ip6_temp_preferred_lifetime -
(time_second - ifa6->ia6_createtime) -
V_ip6_desync_factor;
} else
maxpltime = 0;
if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME ||
lt6_tmp.ia6t_vltime > maxvltime) {
lt6_tmp.ia6t_vltime = maxvltime;
}
if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME ||
lt6_tmp.ia6t_pltime > maxpltime) {
lt6_tmp.ia6t_pltime = maxpltime;
}
}
ifa6->ia6_lifetime = lt6_tmp;
ifa6->ia6_updatetime = time_second;
}
if (ia6_match == NULL && new->ndpr_vltime) {
int ifidlen;
/*
* 5.5.3 (d) (continued)
* No address matched and the valid lifetime is non-zero.
* Create a new address.
*/
/*
* Prefix Length check:
* If the sum of the prefix length and interface identifier
* length does not equal 128 bits, the Prefix Information
* option MUST be ignored. The length of the interface
* identifier is defined in a separate link-type specific
* document.
*/
ifidlen = in6_if2idlen(ifp);
if (ifidlen < 0) {
/* this should not happen, so we always log it. */
log(LOG_ERR, "prelist_update: IFID undefined (%s)\n",
if_name(ifp));
goto end;
}
if (ifidlen + pr->ndpr_plen != 128) {
nd6log((LOG_INFO,
"prelist_update: invalid prefixlen "
"%d for %s, ignored\n",
pr->ndpr_plen, if_name(ifp)));
goto end;
}
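/*
 * For example, on Ethernet in6_if2idlen() returns 64, so only a
 * /64 prefix can form an address here; an advertised /96 gives
 * 64 + 96 != 128 and the prefix information option is ignored.
 */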
if ((ia6 = in6_ifadd(new, mcast)) != NULL) {
/*
* note that we should use pr (not new) for reference.
*/
pr->ndpr_refcnt++;
ia6->ia6_ndpr = pr;
/*
* RFC 3041 3.3 (2).
* When a new public address is created as described
* in RFC2462, also create a new temporary address.
*
* RFC 3041 3.5.
* When an interface connects to a new link, a new
* randomized interface identifier should be generated
* immediately together with a new set of temporary
* addresses. Thus, we specify 1 as the 2nd arg of
* in6_tmpifadd().
*/
if (V_ip6_use_tempaddr) {
int e;
if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) {
nd6log((LOG_NOTICE, "prelist_update: "
"failed to create a temporary "
"address, errno=%d\n",
e));
}
}
/*
* A newly added address might affect the status
* of other addresses, so we check and update it.
* XXX: what if address duplication happens?
*/
pfxlist_onlink_check();
} else {
/* just set an error. do not bark here. */
error = EADDRNOTAVAIL; /* XXX: might be unused. */
}
}
end:
splx(s);
return error;
}
/*
* A helper function used in the on-link detection below;
* it detects whether a given prefix has a (probably) reachable advertising router.
* XXX: lengthy function name...
*/
static struct nd_pfxrouter *
find_pfxlist_reachable_router(struct nd_prefix *pr)
{
struct nd_pfxrouter *pfxrtr;
struct rtentry *rt;
struct llinfo_nd6 *ln;
for (pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); pfxrtr;
pfxrtr = LIST_NEXT(pfxrtr, pfr_entry)) {
if ((rt = nd6_lookup(&pfxrtr->router->rtaddr, 0,
pfxrtr->router->ifp)) &&
(ln = (struct llinfo_nd6 *)rt->rt_llinfo) &&
ND6_IS_LLINFO_PROBREACH(ln))
break; /* found */
}
return (pfxrtr);
}
/*
* Check if each prefix in the prefix list has at least one available router
* that advertised the prefix (a router is "available" if its neighbor cache
* entry is reachable or probably reachable).
* If the check fails, the prefix may be off-link, because, for example,
* we have moved from the network but the lifetime of the prefix has not
* expired yet. So we should not use the prefix if there is another prefix
* that has an available router.
* But if there is no prefix that has an available router, we still regard
* all the prefixes as on-link. This is because we can't tell if all the
* routers are simply dead or if we really moved from the network and there
* is no router around us.
*/
void
pfxlist_onlink_check(void)
{
+ INIT_VNET_INET6(curvnet);
struct nd_prefix *pr;
struct in6_ifaddr *ifa;
struct nd_defrouter *dr;
struct nd_pfxrouter *pfxrtr = NULL;
/*
* Check if there is a prefix that has a reachable advertising
* router.
*/
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr))
break;
}
/*
* If we have no such prefix, check whether we still have a router
* that does not advertise any prefixes.
*/
if (pr == NULL) {
for (dr = TAILQ_FIRST(&V_nd_defrouter); dr;
dr = TAILQ_NEXT(dr, dr_entry)) {
struct nd_prefix *pr0;
for (pr0 = V_nd_prefix.lh_first; pr0;
pr0 = pr0->ndpr_next) {
if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL)
break;
}
if (pfxrtr != NULL)
break;
}
}
if (pr != NULL || (TAILQ_FIRST(&V_nd_defrouter) && pfxrtr == NULL)) {
/*
* There is at least one prefix that has a reachable router,
* or at least a router which probably does not advertise
* any prefixes. The latter would be the case when we move
* to a new link where we have a router that does not provide
* prefixes and we configure an address by hand.
* Detach prefixes which have no reachable advertising
* router, and attach other prefixes.
*/
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
/* XXX: a link-local prefix should never be detached */
if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
continue;
/*
* we aren't interested in prefixes without the L bit
* set.
*/
if (pr->ndpr_raf_onlink == 0)
continue;
if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 &&
find_pfxlist_reachable_router(pr) == NULL)
pr->ndpr_stateflags |= NDPRF_DETACHED;
if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 &&
find_pfxlist_reachable_router(pr) != 0)
pr->ndpr_stateflags &= ~NDPRF_DETACHED;
}
} else {
/* there is no prefix that has a reachable router */
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
continue;
if (pr->ndpr_raf_onlink == 0)
continue;
if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0)
pr->ndpr_stateflags &= ~NDPRF_DETACHED;
}
}
/*
* Remove each interface route associated with a (just) detached
* prefix, and reinstall the interface route for a (just) attached
* prefix. Note that a reinstallation attempt does not
* necessarily succeed when the same prefix is shared among multiple
* interfaces. Such cases will be handled in nd6_prefix_onlink,
* so we don't have to care about them.
*/
for (pr = V_nd_prefix.lh_first; pr; pr = pr->ndpr_next) {
int e;
char ip6buf[INET6_ADDRSTRLEN];
if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr))
continue;
if (pr->ndpr_raf_onlink == 0)
continue;
if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 &&
(pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
if ((e = nd6_prefix_offlink(pr)) != 0) {
nd6log((LOG_ERR,
"pfxlist_onlink_check: failed to "
"make %s/%d offlink, errno=%d\n",
ip6_sprintf(ip6buf,
&pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, e));
}
}
if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 &&
(pr->ndpr_stateflags & NDPRF_ONLINK) == 0 &&
pr->ndpr_raf_onlink) {
if ((e = nd6_prefix_onlink(pr)) != 0) {
nd6log((LOG_ERR,
"pfxlist_onlink_check: failed to "
"make %s/%d onlink, errno=%d\n",
ip6_sprintf(ip6buf,
&pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, e));
}
}
}
/*
* Changes on the prefix status might affect address status as well.
* Make sure that all addresses derived from an attached prefix are
* attached, and that all addresses derived from a detached prefix are
* detached. Note, however, that a manually configured address should
* always be attached.
* The precise detection logic is the same as the one for prefixes.
*/
for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) {
if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF))
continue;
if (ifa->ia6_ndpr == NULL) {
/*
* This can happen when we first configure the address
* (i.e. the address exists, but the prefix does not).
* XXX: complicated relationships...
*/
continue;
}
if (find_pfxlist_reachable_router(ifa->ia6_ndpr))
break;
}
if (ifa) {
for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) {
if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
if (ifa->ia6_ndpr == NULL) /* XXX: see above. */
continue;
if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) {
if (ifa->ia6_flags & IN6_IFF_DETACHED) {
ifa->ia6_flags &= ~IN6_IFF_DETACHED;
ifa->ia6_flags |= IN6_IFF_TENTATIVE;
nd6_dad_start((struct ifaddr *)ifa, 0);
}
} else {
ifa->ia6_flags |= IN6_IFF_DETACHED;
}
}
} else {
for (ifa = V_in6_ifaddr; ifa; ifa = ifa->ia_next) {
if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
continue;
if (ifa->ia6_flags & IN6_IFF_DETACHED) {
ifa->ia6_flags &= ~IN6_IFF_DETACHED;
ifa->ia6_flags |= IN6_IFF_TENTATIVE;
/* Do we need a delay in this case? */
nd6_dad_start((struct ifaddr *)ifa, 0);
}
}
}
}
int
nd6_prefix_onlink(struct nd_prefix *pr)
{
+ INIT_VNET_INET6(curvnet);
struct ifaddr *ifa;
struct ifnet *ifp = pr->ndpr_ifp;
struct sockaddr_in6 mask6;
struct nd_prefix *opr;
u_long rtflags;
int error = 0;
struct rtentry *rt = NULL;
char ip6buf[INET6_ADDRSTRLEN];
/* sanity check */
if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
nd6log((LOG_ERR,
"nd6_prefix_onlink: %s/%d is already on-link\n",
ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen));
return (EEXIST);
}
/*
* Add the interface route associated with the prefix. Before
* installing the route, check whether the same prefix already exists on
* another interface and has already installed the interface route.
* Although such a configuration is expected to be rare, we explicitly
* allow it.
*/
for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) {
if (opr == pr)
continue;
if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0)
continue;
if (opr->ndpr_plen == pr->ndpr_plen &&
in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
&opr->ndpr_prefix.sin6_addr, pr->ndpr_plen))
return (0);
}
/*
* We prefer link-local addresses as the associated interface address.
*/
/* search for a link-local addr */
ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
IN6_IFF_NOTREADY | IN6_IFF_ANYCAST);
if (ifa == NULL) {
/* XXX: freebsd does not have ifa_ifwithaf */
TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
if (ifa->ifa_addr->sa_family == AF_INET6)
break;
}
/* should we care about ia6_flags? */
}
if (ifa == NULL) {
/*
* This can still happen when, for example, we receive an RA
* containing a prefix with the L bit set and the A bit clear,
* after removing all IPv6 addresses on the receiving
* interface. This should, of course, be rare though.
*/
nd6log((LOG_NOTICE,
"nd6_prefix_onlink: failed to find any ifaddr"
" to add route for a prefix(%s/%d) on %s\n",
ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, if_name(ifp)));
return (0);
}
/*
* in6_ifinit() sets nd6_rtrequest to ifa_rtrequest for all ifaddrs.
* ifa->ifa_rtrequest = nd6_rtrequest;
*/
bzero(&mask6, sizeof(mask6));
mask6.sin6_len = sizeof(mask6);
mask6.sin6_addr = pr->ndpr_mask;
rtflags = ifa->ifa_flags | RTF_CLONING | RTF_UP;
if (nd6_need_cache(ifp)) {
/* explicitly set in case ifa_flags does not set the flag. */
rtflags |= RTF_CLONING;
} else {
/*
* explicitly clear the cloning bit in case ifa_flags sets it.
*/
rtflags &= ~RTF_CLONING;
}
error = rtrequest(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix,
ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt);
if (error == 0) {
if (rt != NULL) /* this should be non NULL, though */
nd6_rtmsg(RTM_ADD, rt);
pr->ndpr_stateflags |= NDPRF_ONLINK;
} else {
char ip6bufg[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a"
" prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx "
"errno = %d\n",
ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen, if_name(ifp),
ip6_sprintf(ip6bufg, &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr),
ip6_sprintf(ip6bufm, &mask6.sin6_addr), rtflags, error));
}
if (rt != NULL) {
RT_LOCK(rt);
RT_REMREF(rt);
RT_UNLOCK(rt);
}
return (error);
}
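/*
 * A minimal illustrative sketch, not part of the original sources: how a
 * netmask such as the mask6 used above is derived from a prefix length,
 * in the spirit of in6_prefixlen2mask().  prefixlen_to_mask() is a
 * hypothetical name.
 */
#if 0
#include <string.h>
#include <netinet/in.h>

static void
prefixlen_to_mask(struct in6_addr *mask, int plen)
{
	static const unsigned char maskarray[8] =
	    { 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
	int bytes, bits;

	memset(mask, 0, sizeof(*mask));
	if (plen < 0 || plen > 128)
		return;
	bytes = plen / 8;
	bits = plen % 8;
	memset(mask->s6_addr, 0xff, bytes);
	if (bits > 0)
		mask->s6_addr[bytes] = maskarray[bits - 1];
}
/* Example: plen == 64 yields ffff:ffff:ffff:ffff:: */
#endif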
int
nd6_prefix_offlink(struct nd_prefix *pr)
{
+ INIT_VNET_INET6(curvnet);
int error = 0;
struct ifnet *ifp = pr->ndpr_ifp;
struct nd_prefix *opr;
struct sockaddr_in6 sa6, mask6;
struct rtentry *rt = NULL;
char ip6buf[INET6_ADDRSTRLEN];
/* sanity check */
if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
nd6log((LOG_ERR,
"nd6_prefix_offlink: %s/%d is already off-link\n",
ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
pr->ndpr_plen));
return (EEXIST);
}
bzero(&sa6, sizeof(sa6));
sa6.sin6_family = AF_INET6;
sa6.sin6_len = sizeof(sa6);
bcopy(&pr->ndpr_prefix.sin6_addr, &sa6.sin6_addr,
sizeof(struct in6_addr));
bzero(&mask6, sizeof(mask6));
mask6.sin6_family = AF_INET6;
mask6.sin6_len = sizeof(sa6);
bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr));
error = rtrequest(RTM_DELETE, (struct sockaddr *)&sa6, NULL,
(struct sockaddr *)&mask6, 0, &rt);
if (error == 0) {
pr->ndpr_stateflags &= ~NDPRF_ONLINK;
/* report the route deletion to the routing socket. */
if (rt != NULL)
nd6_rtmsg(RTM_DELETE, rt);
/*
* The same prefix might exist on another interface, a prefix
* that could not be on-link just because we had the interface
* route (see comments in nd6_prefix_onlink).  If there is one,
* try to make that prefix on-link on its interface.
*/
for (opr = V_nd_prefix.lh_first; opr; opr = opr->ndpr_next) {
if (opr == pr)
continue;
if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0)
continue;
/*
* KAME specific: detached prefixes should not be
* on-link.
*/
if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0)
continue;
if (opr->ndpr_plen == pr->ndpr_plen &&
in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
&opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) {
int e;
if ((e = nd6_prefix_onlink(opr)) != 0) {
nd6log((LOG_ERR,
"nd6_prefix_offlink: failed to "
"recover a prefix %s/%d from %s "
"to %s (errno = %d)\n",
ip6_sprintf(ip6buf,
&opr->ndpr_prefix.sin6_addr),
opr->ndpr_plen, if_name(ifp),
if_name(opr->ndpr_ifp), e));
}
}
}
} else {
/* XXX: can we still set the NDPRF_ONLINK flag? */
nd6log((LOG_ERR,
"nd6_prefix_offlink: failed to delete route: "
"%s/%d on %s (errno = %d)\n",
ip6_sprintf(ip6buf, &sa6.sin6_addr), pr->ndpr_plen,
if_name(ifp), error));
}
if (rt != NULL) {
RTFREE(rt);
}
return (error);
}
static struct in6_ifaddr *
in6_ifadd(struct nd_prefixctl *pr, int mcast)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = pr->ndpr_ifp;
struct ifaddr *ifa;
struct in6_aliasreq ifra;
struct in6_ifaddr *ia, *ib;
int error, plen0;
struct in6_addr mask;
int prefixlen = pr->ndpr_plen;
int updateflags;
char ip6buf[INET6_ADDRSTRLEN];
in6_prefixlen2mask(&mask, prefixlen);
/*
* Find a link-local address (it will supply the interface ID).
* Is it really mandatory?  Theoretically, a global or a site-local
* address can be configured without a link-local address, if we
* have a unique interface identifier...
*
* It is not mandatory to have a link-local address; we could generate
* an interface identifier on the fly.  We require one here because:
* (1) it is the easiest way to find an interface identifier;
* (2) RFC 2462, Section 5.4, suggests using the same interface
* identifier for multiple addresses on a single interface, with a
* possible shortcut of DAD (we omitted DAD for this reason in the
* past);
* (3) a user can prevent autoconfiguration of a global address by
* removing the link-local address by hand (this is partly because we
* have no other way to control the use of IPv6 on an interface;
* this has been our design choice - cf. NRL's "ifconfig auto");
* (4) it is easier to manage an interface whose addresses share the
* same interface identifier than one with multiple addresses carrying
* different interface identifiers.
*/
ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */
if (ifa)
ib = (struct in6_ifaddr *)ifa;
else
return NULL;
/* prefixlen + ifidlen must be equal to 128 */
plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL);
if (prefixlen != plen0) {
nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s "
"(prefix=%d ifid=%d)\n",
if_name(ifp), prefixlen, 128 - plen0));
return NULL;
}
/* make ifaddr */
bzero(&ifra, sizeof(ifra));
/*
* in6_update_ifa() does not use ifra_name, but we set it accurately
* for safety.
*/
strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name));
ifra.ifra_addr.sin6_family = AF_INET6;
ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
/* prefix */
ifra.ifra_addr.sin6_addr = pr->ndpr_prefix.sin6_addr;
ifra.ifra_addr.sin6_addr.s6_addr32[0] &= mask.s6_addr32[0];
ifra.ifra_addr.sin6_addr.s6_addr32[1] &= mask.s6_addr32[1];
ifra.ifra_addr.sin6_addr.s6_addr32[2] &= mask.s6_addr32[2];
ifra.ifra_addr.sin6_addr.s6_addr32[3] &= mask.s6_addr32[3];
/* interface ID */
ifra.ifra_addr.sin6_addr.s6_addr32[0] |=
(ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]);
ifra.ifra_addr.sin6_addr.s6_addr32[1] |=
(ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]);
ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
(ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]);
ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
(ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]);
/* new prefix mask. */
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_prefixmask.sin6_family = AF_INET6;
bcopy(&mask, &ifra.ifra_prefixmask.sin6_addr,
sizeof(ifra.ifra_prefixmask.sin6_addr));
/* lifetimes. */
ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime;
ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime;
/* XXX: scope zone ID? */
ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */
/*
* Make sure that we do not have this address already.  This should
* usually not happen, but we can still see this case, e.g., if the
* exact address to be configured has already been set up manually.
*/
if (in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr) != NULL) {
/* this should be rare enough to make an explicit log */
log(LOG_INFO, "in6_ifadd: %s is already configured\n",
ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr));
return (NULL);
}
/*
* Allocate ifaddr structure, link into chain, etc.
* If we are going to create a new address upon receiving a multicasted
* RA, we need to impose a random delay before starting DAD.
* [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2]
*/
updateflags = 0;
if (mcast)
updateflags |= IN6_IFAUPDATE_DADDELAY;
if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) {
nd6log((LOG_ERR,
"in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n",
ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr),
if_name(ifp), error));
return (NULL); /* ifaddr must not have been allocated. */
}
ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
return (ia); /* this is always non-NULL */
}
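/*
 * A minimal illustrative sketch, not part of the original sources: the
 * prefix/interface-ID merge performed by the four s6_addr32 statements
 * above - prefix bits where the mask is set, interface-ID bits where it
 * is clear.  s6_addr32 is the kernel-only accessor, so this is a
 * kernel-context sketch.
 */
#if 0
static struct in6_addr
merge_prefix_ifid(const struct in6_addr *prefix,
    const struct in6_addr *ifid, const struct in6_addr *mask)
{
	struct in6_addr a;
	int i;

	for (i = 0; i < 4; i++)
		a.s6_addr32[i] =
		    (prefix->s6_addr32[i] & mask->s6_addr32[i]) |
		    (ifid->s6_addr32[i] & ~mask->s6_addr32[i]);
	return (a);
}
#endif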
/*
* ia0 - corresponding public address
*/
int
in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay)
{
+ INIT_VNET_INET6(curvnet);
struct ifnet *ifp = ia0->ia_ifa.ifa_ifp;
struct in6_ifaddr *newia, *ia;
struct in6_aliasreq ifra;
int i, error;
int trylimit = 3; /* XXX: ad hoc value */
int updateflags;
u_int32_t randid[2];
time_t vltime0, pltime0;
bzero(&ifra, sizeof(ifra));
strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name));
ifra.ifra_addr = ia0->ia_addr;
/* copy prefix mask */
ifra.ifra_prefixmask = ia0->ia_prefixmask;
/* clear the old IFID */
for (i = 0; i < 4; i++) {
ifra.ifra_addr.sin6_addr.s6_addr32[i] &=
ifra.ifra_prefixmask.sin6_addr.s6_addr32[i];
}
again:
if (in6_get_tmpifid(ifp, (u_int8_t *)randid,
(const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) {
nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find a good "
"random IFID\n"));
return (EINVAL);
}
ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
(randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2]));
ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
(randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3]));
/*
* in6_get_tmpifid() quite likely provided a unique interface ID.
* However, we may still see a collision, because there may be a time
* lag between generation of the ID and generation of the address.
* So, we'll do one more sanity check.
*/
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&ifra.ifra_addr.sin6_addr)) {
if (trylimit-- == 0) {
/*
* Give up.  Something strange must have
* happened.
*/
nd6log((LOG_NOTICE, "in6_tmpifadd: failed to "
"find a unique random IFID\n"));
return (EEXIST);
}
forcegen = 1;
goto again;
}
}
/*
* The Valid Lifetime is the lower of the Valid Lifetime of the
* public address or TEMP_VALID_LIFETIME.
* The Preferred Lifetime is the lower of the Preferred Lifetime
* of the public address or TEMP_PREFERRED_LIFETIME -
* DESYNC_FACTOR.
*/
if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
vltime0 = IFA6_IS_INVALID(ia0) ? 0 :
(ia0->ia6_lifetime.ia6t_vltime -
(time_second - ia0->ia6_updatetime));
if (vltime0 > V_ip6_temp_valid_lifetime)
vltime0 = V_ip6_temp_valid_lifetime;
} else
vltime0 = V_ip6_temp_valid_lifetime;
if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 :
(ia0->ia6_lifetime.ia6t_pltime -
(time_second - ia0->ia6_updatetime));
if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){
pltime0 = V_ip6_temp_preferred_lifetime -
V_ip6_desync_factor;
}
} else
pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor;
ifra.ifra_lifetime.ia6t_vltime = vltime0;
ifra.ifra_lifetime.ia6t_pltime = pltime0;
/*
* A temporary address is created only if this calculated Preferred
* Lifetime is greater than REGEN_ADVANCE time units.
*/
if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance)
return (0);
/* XXX: scope zone ID? */
ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY);
/* allocate ifaddr structure, link into chain, etc. */
updateflags = 0;
if (delay)
updateflags |= IN6_IFAUPDATE_DADDELAY;
if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0)
return (error);
newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
if (newia == NULL) { /* XXX: can it happen? */
nd6log((LOG_ERR,
"in6_tmpifadd: ifa update succeeded, but we got "
"no ifaddr\n"));
return (EINVAL); /* XXX */
}
newia->ia6_ndpr = ia0->ia6_ndpr;
newia->ia6_ndpr->ndpr_refcnt++;
/*
* A newly added address might affect the status of other addresses.
* XXX: when the temporary address is generated with a new public
* address, the onlink check is redundant. However, it would be safe
* to do the check explicitly everywhere a new address is generated,
* and, in fact, we surely need the check when we create a new
* temporary address due to deprecation of an old temporary address.
*/
pfxlist_onlink_check();
return (0);
}
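/*
 * A minimal illustrative sketch, not part of the original sources: the
 * lifetime clamping described in the comment above.  The two caps are
 * the RFC 3041 defaults behind the V_ip6_temp_* globals; the names here
 * are hypothetical.
 */
#if 0
#include <time.h>

#define TEMP_VALID_LIFETIME	604800	/* 7 days (RFC 3041 default) */
#define TEMP_PREFERRED_LIFETIME	86400	/* 1 day (RFC 3041 default) */

static void
temp_lifetimes(time_t pub_vl_left, time_t pub_pl_left, time_t desync,
    time_t *vl, time_t *pl)
{
	/* Valid: remaining public lifetime, capped at one week. */
	*vl = pub_vl_left < TEMP_VALID_LIFETIME ?
	    pub_vl_left : TEMP_VALID_LIFETIME;
	/* Preferred: capped at one day minus the desync factor. */
	*pl = pub_pl_left < TEMP_PREFERRED_LIFETIME - desync ?
	    pub_pl_left : TEMP_PREFERRED_LIFETIME - desync;
}
/*
 * Example: 90000 s valid / 90000 s preferred left, desync 300 s
 * -> *vl = 90000, *pl = 86100.
 */
#endif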
static int
in6_init_prefix_ltimes(struct nd_prefix *ndpr)
{
if (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME)
ndpr->ndpr_preferred = 0;
else
ndpr->ndpr_preferred = time_second + ndpr->ndpr_pltime;
if (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME)
ndpr->ndpr_expire = 0;
else
ndpr->ndpr_expire = time_second + ndpr->ndpr_vltime;
return 0;
}
static void
in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6)
{
/* init ia6t_expire */
if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME)
lt6->ia6t_expire = 0;
else {
lt6->ia6t_expire = time_second;
lt6->ia6t_expire += lt6->ia6t_vltime;
}
/* init ia6t_preferred */
if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME)
lt6->ia6t_preferred = 0;
else {
lt6->ia6t_preferred = time_second;
lt6->ia6t_preferred += lt6->ia6t_pltime;
}
}
/*
* Delete all the routing table entries that use the specified gateway.
* XXX: this function searches through all entries of the routing table, so
* it shouldn't be called when acting as a router.
*/
void
rt6_flush(struct in6_addr *gateway, struct ifnet *ifp)
{
-
+ INIT_VNET_NET(curvnet);
struct radix_node_head *rnh = V_rt_tables[0][AF_INET6];
int s = splnet();
/* We only care about link-local addresses. */
if (!IN6_IS_ADDR_LINKLOCAL(gateway)) {
splx(s);
return;
}
RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, rt6_deleteroute, (void *)gateway);
RADIX_NODE_HEAD_UNLOCK(rnh);
splx(s);
}
static int
rt6_deleteroute(struct radix_node *rn, void *arg)
{
#define SIN6(s) ((struct sockaddr_in6 *)s)
struct rtentry *rt = (struct rtentry *)rn;
struct in6_addr *gate = (struct in6_addr *)arg;
if (rt->rt_gateway == NULL || rt->rt_gateway->sa_family != AF_INET6)
return (0);
if (!IN6_ARE_ADDR_EQUAL(gate, &SIN6(rt->rt_gateway)->sin6_addr)) {
return (0);
}
/*
* Do not delete a static route.
* XXX: this seems to be a bit ad-hoc. Should we consider the
* 'cloned' bit instead?
*/
if ((rt->rt_flags & RTF_STATIC) != 0)
return (0);
/*
* We delete only host routes.  This means, in particular, that we
* don't delete the default route.
*/
if ((rt->rt_flags & RTF_HOST) == 0)
return (0);
return (rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
rt_mask(rt), rt->rt_flags, 0));
#undef SIN6
}
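/*
 * A minimal illustrative sketch, not part of the original sources: the
 * filter rt6_deleteroute() applies to each route visited by
 * rnh_walktree() above, as a standalone predicate.
 */
#if 0
#include <sys/types.h>
#include <net/route.h>

static int
should_flush(int gw_matches, u_long rt_flags)
{
	if (!gw_matches)
		return (0);		/* different gateway: keep */
	if (rt_flags & RTF_STATIC)
		return (0);		/* never touch static routes */
	if ((rt_flags & RTF_HOST) == 0)
		return (0);		/* keep prefix and default routes */
	return (1);			/* cloned host route via the gateway */
}
#endif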
int
nd6_setdefaultiface(int ifindex)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_INET6(curvnet);
int error = 0;
if (ifindex < 0 || V_if_index < ifindex)
return (EINVAL);
if (ifindex != 0 && !ifnet_byindex(ifindex))
return (EINVAL);
if (V_nd6_defifindex != ifindex) {
V_nd6_defifindex = ifindex;
if (V_nd6_defifindex > 0)
V_nd6_defifp = ifnet_byindex(V_nd6_defifindex);
else
V_nd6_defifp = NULL;
/*
* Our current implementation assumes one-to-one mapping between
* interfaces and links, so it would be natural to use the
* default interface as the default link.
*/
scope6_setdefault(V_nd6_defifp);
}
return (error);
}
Index: head/sys/netinet6/raw_ip6.c
===================================================================
--- head/sys/netinet6/raw_ip6.c (revision 183549)
+++ head/sys/netinet6/raw_ip6.c (revision 183550)
@@ -1,819 +1,835 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/icmp6.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/raw_ip6.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <machine/stdarg.h>
#define satosin6(sa) ((struct sockaddr_in6 *)(sa))
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
extern struct inpcbhead ripcb;
extern struct inpcbinfo ripcbinfo;
extern u_long rip_sendspace;
extern u_long rip_recvspace;
struct rip6stat rip6stat;
/*
* Hooks for multicast forwarding.
*/
struct socket *ip6_mrouter = NULL;
int (*ip6_mrouter_set)(struct socket *, struct sockopt *);
int (*ip6_mrouter_get)(struct socket *, struct sockopt *);
int (*ip6_mrouter_done)(void);
int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int (*mrt6_ioctl)(int, caddr_t);
/*
* Set up generic address and protocol structures for the raw_input routine,
* then pass them along with the mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_INET6(curvnet);
+#ifdef IPSEC
+ INIT_VNET_IPSEC(curvnet);
+#endif
struct mbuf *m = *mp;
register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
register struct inpcb *in6p;
struct inpcb *last = 0;
struct mbuf *opts = NULL;
struct sockaddr_in6 fromsa;
V_rip6stat.rip6s_ipackets++;
if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) {
/* XXX Send icmp6 host/port unreach? */
m_freem(m);
return (IPPROTO_DONE);
}
init_sin6(&fromsa, m); /* general init */
INP_INFO_RLOCK(&V_ripcbinfo);
LIST_FOREACH(in6p, &V_ripcb, inp_list) {
if ((in6p->in6p_vflag & INP_IPV6) == 0)
continue;
if (in6p->in6p_ip6_nxt &&
in6p->in6p_ip6_nxt != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
continue;
INP_RLOCK(in6p);
if (in6p->in6p_cksum != -1) {
V_rip6stat.rip6s_isum++;
if (in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
INP_RUNLOCK(in6p);
V_rip6stat.rip6s_badsum++;
continue;
}
}
if (last) {
struct mbuf *n = m_copy(m, 0, (int)M_COPYALL);
#ifdef IPSEC
/*
* Check AH/ESP integrity.
*/
if (n && ipsec6_in_reject(n, last)) {
m_freem(n);
V_ipsec6stat.in_polvio++;
/* Do not inject data into pcb. */
} else
#endif /* IPSEC */
if (n) {
if (last->in6p_flags & IN6P_CONTROLOPTS ||
last->in6p_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(last, n, &opts);
/* strip intermediate headers */
m_adj(n, *offp);
if (sbappendaddr(&last->in6p_socket->so_rcv,
(struct sockaddr *)&fromsa,
n, opts) == 0) {
m_freem(n);
if (opts)
m_freem(opts);
V_rip6stat.rip6s_fullsock++;
} else
sorwakeup(last->in6p_socket);
opts = NULL;
}
INP_RUNLOCK(last);
}
last = in6p;
}
INP_INFO_RUNLOCK(&V_ripcbinfo);
#ifdef IPSEC
/*
* Check AH/ESP integrity.
*/
if (last && ipsec6_in_reject(m, last)) {
m_freem(m);
V_ipsec6stat.in_polvio++;
V_ip6stat.ip6s_delivered--;
/* Do not inject data into pcb. */
INP_RUNLOCK(last);
} else
#endif /* IPSEC */
if (last) {
if (last->in6p_flags & IN6P_CONTROLOPTS ||
last->in6p_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(last, m, &opts);
/* Strip intermediate headers. */
m_adj(m, *offp);
if (sbappendaddr(&last->in6p_socket->so_rcv,
(struct sockaddr *)&fromsa, m, opts) == 0) {
m_freem(m);
if (opts)
m_freem(opts);
V_rip6stat.rip6s_fullsock++;
} else
sorwakeup(last->in6p_socket);
INP_RUNLOCK(last);
} else {
V_rip6stat.rip6s_nosock++;
if (m->m_flags & M_MCAST)
V_rip6stat.rip6s_nosockmcast++;
if (proto == IPPROTO_NONE)
m_freem(m);
else {
char *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER,
prvnxtp - mtod(m, char *));
}
V_ip6stat.ip6s_delivered--;
}
return (IPPROTO_DONE);
}
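/*
 * A minimal illustrative sketch, not part of the original sources: the
 * per-PCB matching rule used by the delivery loop above.  A raw socket
 * receives a copy iff its bound protocol and local/foreign addresses,
 * when set, match the packet.
 */
#if 0
#include <netinet/in.h>

static int
rip6_pcb_matches(int pcb_proto, const struct in6_addr *laddr,
    const struct in6_addr *faddr, int pkt_proto,
    const struct in6_addr *dst, const struct in6_addr *src)
{
	if (pcb_proto != 0 && pcb_proto != pkt_proto)
		return (0);
	if (!IN6_IS_ADDR_UNSPECIFIED(laddr) &&
	    !IN6_ARE_ADDR_EQUAL(laddr, dst))
		return (0);
	if (!IN6_IS_ADDR_UNSPECIFIED(faddr) &&
	    !IN6_ARE_ADDR_EQUAL(faddr, src))
		return (0);
	return (1);
}
#endif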
void
rip6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
+ INIT_VNET_INET(curvnet);
struct ip6_hdr *ip6;
struct mbuf *m;
int off = 0;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/*
* If the parameter is from icmp6, decode it.
*/
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
}
(void) in6_pcbnotify(&V_ripcbinfo, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
}
/*
* Generate the IPv6 header and pass the packet to ip6_output.  Tack on any
* options the user may have set up with a control call.
*/
int
#if __STDC__
rip6_output(struct mbuf *m, ...)
#else
rip6_output(m, va_alist)
struct mbuf *m;
va_dcl
#endif
{
+ INIT_VNET_INET6(curvnet);
struct mbuf *control;
struct socket *so;
struct sockaddr_in6 *dstsock;
struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *in6p;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp;
struct ifnet *oifp = NULL;
int type = 0, code = 0; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
struct in6_addr *in6a;
va_list ap;
va_start(ap, m);
so = va_arg(ap, struct socket *);
dstsock = va_arg(ap, struct sockaddr_in6 *);
control = va_arg(ap, struct mbuf *);
va_end(ap);
in6p = sotoin6pcb(so);
INP_WLOCK(in6p);
dst = &dstsock->sin6_addr;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
in6p->in6p_outputopts, so->so_cred,
so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p->in6p_outputopts;
/*
* Check and convert scope zone ID into internal form.
*
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (dstsock->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, V_ip6_use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code to update
* statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
}
M_PREPEND(m, sizeof(*ip6), M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Source address selection.
*/
if ((in6a = in6_selectsrc(dstsock, optp, in6p, NULL, so->so_cred,
&oifp, &error)) == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
goto bad;
}
ip6->ip6_src = *in6a;
if (oifp && scope_ambiguous) {
/*
* The application should provide a proper zone ID, or the use of
* default zone IDs should be enabled.  Unfortunately, some
* applications do not behave as they should, so we need a
* workaround.  Even when an appropriate ID has not been
* determined (although it is required), if we can determine the
* outgoing interface, we derive the zone ID from that interface.
*/
error = in6_setscope(&dstsock->sin6_addr, oifp, NULL);
if (error != 0)
goto bad;
}
ip6->ip6_dst = dstsock->sin6_addr;
/*
* Fill in the rest of the IPv6 header fields.
*/
ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
(in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK);
ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
(IPV6_VERSION & IPV6_VERSION_MASK);
/*
* ip6_plen will be filled in by ip6_output, so we do not fill it in here.
*/
ip6->ip6_nxt = in6p->in6p_ip6_nxt;
ip6->ip6_hlim = in6_selecthlim(in6p, oifp);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p->in6p_cksum != -1) {
struct mbuf *n;
int off;
u_int16_t *p;
/* Compute checksum. */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p->in6p_cksum;
if (plen < off + 1) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
n = m;
while (n && n->m_len <= off) {
off -= n->m_len;
n = n->m_next;
}
if (!n)
goto bad;
p = (u_int16_t *)(mtod(n, caddr_t) + off);
*p = 0;
*p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
}
error = ip6_output(m, optp, NULL, 0, in6p->in6p_moptions, &oifp, in6p);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
if (oifp)
icmp6_ifoutstat_inc(oifp, type, code);
V_icmp6stat.icp6s_outhist[type]++;
} else
V_rip6stat.rip6s_opackets++;
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
INP_WUNLOCK(in6p);
return (error);
}
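/*
 * A minimal illustrative sketch, not part of the original sources: the
 * in6p_cksum offset honored above is set from userland with the
 * IPV6_CHECKSUM socket option (RFC 3542).  The offset must be even; -1
 * disables checksumming, and ICMPv6 sockets always checksum in the
 * kernel.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>

static int
enable_raw_cksum(int s, int off)
{
	/* The kernel computes/inserts the 16-bit checksum at 'off'. */
	return (setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM,
	    &off, sizeof(off)));
}
#endif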
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(struct socket *so, struct sockopt *sopt)
{
int error;
if (sopt->sopt_level == IPPROTO_ICMPV6)
/*
* XXX: is it better to call icmp6_ctloutput() directly
* from protosw?
*/
return (icmp6_ctloutput(so, sopt));
else if (sopt->sopt_level != IPPROTO_IPV6)
return (EINVAL);
error = 0;
switch (sopt->sopt_dir) {
case SOPT_GET:
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) :
EOPNOTSUPP;
break;
case IPV6_CHECKSUM:
error = ip6_raw_ctloutput(so, sopt);
break;
default:
error = ip6_ctloutput(so, sopt);
break;
}
break;
case SOPT_SET:
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
error = ip6_mrouter_set ? ip6_mrouter_set(so, sopt) :
EOPNOTSUPP;
break;
case IPV6_CHECKSUM:
error = ip6_raw_ctloutput(so, sopt);
break;
default:
error = ip6_ctloutput(so, sopt);
break;
}
break;
}
return (error);
}
static int
rip6_attach(struct socket *so, int proto, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
struct icmp6_filter *filter;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("rip6_attach: inp != NULL"));
error = priv_check(td, PRIV_NETINET_RAW);
if (error)
return (error);
error = soreserve(so, rip_sendspace, rip_recvspace);
if (error)
return (error);
MALLOC(filter, struct icmp6_filter *,
sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
if (filter == NULL)
return (ENOMEM);
INP_INFO_WLOCK(&V_ripcbinfo);
error = in_pcballoc(so, &V_ripcbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_ripcbinfo);
FREE(filter, M_PCB);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_ripcbinfo);
inp->inp_vflag |= INP_IPV6;
inp->in6p_ip6_nxt = (long)proto;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1;
inp->in6p_icmp6filt = filter;
ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt);
INP_WUNLOCK(inp);
return (0);
}
static void
rip6_detach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_detach: inp == NULL"));
if (so == ip6_mrouter && ip6_mrouter_done)
ip6_mrouter_done();
/* xxx: RSVP */
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
FREE(inp->in6p_icmp6filt, M_PCB);
in6_pcbdetach(inp);
in6_pcbfree(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
}
/* XXXRW: This can't ever be called. */
static void
rip6_abort(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_abort: inp == NULL"));
soisdisconnected(so);
}
static void
rip6_close(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_close: inp == NULL"));
soisdisconnected(so);
}
static int
rip6_disconnect(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL"));
if ((so->so_state & SS_ISCONNECTED) == 0)
return (ENOTCONN);
inp->in6p_faddr = in6addr_any;
rip6_abort(so);
return (0);
}
static int
rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_NET(so->so_vnet);
+ INIT_VNET_INET(so->so_vnet);
+ INIT_VNET_INET6(so->so_vnet);
struct inpcb *inp;
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct ifaddr *ia = NULL;
int error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_bind: inp == NULL"));
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if (TAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6)
return (EADDRNOTAVAIL);
if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
return (error);
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0)
return (EADDRNOTAVAIL);
if (ia &&
((struct in6_ifaddr *)ia)->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
return (EADDRNOTAVAIL);
}
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
inp->in6p_laddr = addr->sin6_addr;
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_NET(so->so_vnet);
+ INIT_VNET_INET(so->so_vnet);
+ INIT_VNET_INET6(so->so_vnet);
struct inpcb *inp;
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct in6_addr *in6a = NULL;
struct ifnet *ifp = NULL;
int error = 0, scope_ambiguous = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_connect: inp == NULL"));
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
if (TAILQ_EMPTY(&V_ifnet))
return (EADDRNOTAVAIL);
if (addr->sin6_family != AF_INET6)
return (EAFNOSUPPORT);
/*
* The application should provide a proper zone ID, or the use of default
* zone IDs should be enabled.  Unfortunately, some applications do
* not behave as they should, so we need a workaround.  Even if an
* appropriate ID is not determined, we'll see if we can determine
* the outgoing interface.  If we can, the zone ID is derived from
* that interface below.
*/
if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
return (error);
INP_INFO_WLOCK(&V_ripcbinfo);
INP_WLOCK(inp);
/* Source address selection. XXX: need pcblookup? */
in6a = in6_selectsrc(addr, inp->in6p_outputopts,
inp, NULL, so->so_cred,
&ifp, &error);
if (in6a == NULL) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (error ? error : EADDRNOTAVAIL);
}
/* XXX: see above */
if (ifp && scope_ambiguous &&
(error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (error);
}
inp->in6p_faddr = addr->sin6_addr;
inp->in6p_laddr = *in6a;
soisconnected(so);
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
return (0);
}
static int
rip6_shutdown(struct socket *so)
{
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL"));
INP_WLOCK(inp);
socantsendmore(so);
INP_WUNLOCK(inp);
return (0);
}
static int
rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
int ret;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip6_send: inp == NULL"));
/* Always copy sockaddr to avoid overwrites. */
/* Unlocked read. */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
m_freem(m);
return (EISCONN);
}
/* XXX */
bzero(&tmp, sizeof(tmp));
tmp.sin6_family = AF_INET6;
tmp.sin6_len = sizeof(struct sockaddr_in6);
INP_RLOCK(inp);
bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
sizeof(struct in6_addr));
INP_RUNLOCK(inp);
dst = &tmp;
} else {
if (nam == NULL) {
m_freem(m);
return (ENOTCONN);
}
if (nam->sa_len != sizeof(struct sockaddr_in6)) {
m_freem(m);
return (EINVAL);
}
tmp = *(struct sockaddr_in6 *)nam;
dst = &tmp;
if (dst->sin6_family == AF_UNSPEC) {
/*
* XXX: we allow this case for backward
* compatibility with buggy applications that
* rely on old (and wrong) kernel behavior.
*/
log(LOG_INFO, "rip6 SEND: address family is "
"unspec. Assume AF_INET6\n");
dst->sin6_family = AF_INET6;
} else if (dst->sin6_family != AF_INET6) {
m_freem(m);
return(EAFNOSUPPORT);
}
}
ret = rip6_output(m, so, dst, control);
return (ret);
}
struct pr_usrreqs rip6_usrreqs = {
.pru_abort = rip6_abort,
.pru_attach = rip6_attach,
.pru_bind = rip6_bind,
.pru_connect = rip6_connect,
.pru_control = in6_control,
.pru_detach = rip6_detach,
.pru_disconnect = rip6_disconnect,
.pru_peeraddr = in6_getpeeraddr,
.pru_send = rip6_send,
.pru_shutdown = rip6_shutdown,
.pru_sockaddr = in6_getsockaddr,
.pru_close = rip6_close,
};
Index: head/sys/netinet6/route6.c
===================================================================
--- head/sys/netinet6/route6.c (revision 183549)
+++ head/sys/netinet6/route6.c (revision 183550)
@@ -1,244 +1,246 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: route6.c,v 1.24 2001/03/14 03:07:05 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#if 0
static int ip6_rthdr0 __P((struct mbuf *, struct ip6_hdr *,
struct ip6_rthdr0 *));
#endif /* Disable route header processing. */
/*
* proto - is unused
*/
int
route6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET6(curvnet);
struct ip6_hdr *ip6;
struct mbuf *m = *mp;
struct ip6_rthdr *rh;
int off = *offp, rhlen;
struct ip6aux *ip6a;
ip6a = ip6_findaux(m);
if (ip6a) {
/* XXX reject home-address option before rthdr */
if (ip6a->ip6a_flags & IP6A_SWAP) {
V_ip6stat.ip6s_badoptions++;
m_freem(m);
return IPPROTO_DONE;
}
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(*rh), IPPROTO_DONE);
ip6 = mtod(m, struct ip6_hdr *);
rh = (struct ip6_rthdr *)((caddr_t)ip6 + off);
#else
ip6 = mtod(m, struct ip6_hdr *);
IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, sizeof(*rh));
if (rh == NULL) {
V_ip6stat.ip6s_tooshort++;
return IPPROTO_DONE;
}
#endif
switch (rh->ip6r_type) {
#if 0
case IPV6_RTHDR_TYPE_0:
rhlen = (rh->ip6r_len + 1) << 3;
#ifndef PULLDOWN_TEST
/*
* note on option length:
* due to IP6_EXTHDR_CHECK assumption, we cannot handle
* very big routing header (max rhlen == 2048).
*/
IP6_EXTHDR_CHECK(m, off, rhlen, IPPROTO_DONE);
#else
/*
* note on option length:
* maximum rhlen: 2048
* max mbuf m_pulldown can handle: MCLBYTES == usually 2048
* so, here we are assuming that m_pulldown can handle the
* rhlen == 2048 case.  This may not be a good thing to
* assume - we may want to avoid pulling it up altogether.
*/
IP6_EXTHDR_GET(rh, struct ip6_rthdr *, m, off, rhlen);
if (rh == NULL) {
V_ip6stat.ip6s_tooshort++;
return IPPROTO_DONE;
}
#endif
if (ip6_rthdr0(m, ip6, (struct ip6_rthdr0 *)rh))
return (IPPROTO_DONE);
break;
#endif /* Disable route header 0 */
default:
/* unknown routing type */
if (rh->ip6r_segleft == 0) {
rhlen = (rh->ip6r_len + 1) << 3;
break; /* Final dst. Just ignore the header. */
}
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
(caddr_t)&rh->ip6r_type - (caddr_t)ip6);
return (IPPROTO_DONE);
}
*offp += rhlen;
return (rh->ip6r_nxt);
}
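/*
 * A minimal illustrative sketch, not part of the original sources: the
 * extension-header length arithmetic used above.  ip6r_len counts
 * 8-octet units beyond the first 8 octets (RFC 2460), hence the
 * (len + 1) << 3.
 */
#if 0
static int
rthdr_bytes(unsigned char ip6r_len)
{
	return ((ip6r_len + 1) << 3);	/* e.g. ip6r_len == 4 -> 40 bytes */
}
#endif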
/*
* Type 0 routing header processing
*
* RFC2292 backward compatibility warning: no support for strict/loose bitmap,
* as it was dropped between RFC1883 and RFC2460.
*/
#if 0
static int
ip6_rthdr0(struct mbuf *m, struct ip6_hdr *ip6, struct ip6_rthdr0 *rh0)
{
+ INIT_VNET_INET6(curvnet);
int addrs, index;
struct in6_addr *nextaddr, tmpaddr;
struct in6_ifaddr *ifa;
if (rh0->ip6r0_segleft == 0)
return (0);
if (rh0->ip6r0_len % 2
#ifdef COMPAT_RFC1883
|| rh0->ip6r0_len > 46
#endif
) {
/*
* Type 0 routing header can't contain more than 23 addresses.
* RFC 2460: this limitation was removed since the strict/loose
* bitmap field was deleted.
*/
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
(caddr_t)&rh0->ip6r0_len - (caddr_t)ip6);
return (-1);
}
if ((addrs = rh0->ip6r0_len / 2) < rh0->ip6r0_segleft) {
V_ip6stat.ip6s_badoptions++;
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
(caddr_t)&rh0->ip6r0_segleft - (caddr_t)ip6);
return (-1);
}
index = addrs - rh0->ip6r0_segleft;
rh0->ip6r0_segleft--;
nextaddr = ((struct in6_addr *)(rh0 + 1)) + index;
/*
* Reject invalid addresses.  Be proactive about malicious use of
* IPv4-mapped/compat addresses.
* XXX need more checks?
*/
if (IN6_IS_ADDR_MULTICAST(nextaddr) ||
IN6_IS_ADDR_UNSPECIFIED(nextaddr) ||
IN6_IS_ADDR_V4MAPPED(nextaddr) ||
IN6_IS_ADDR_V4COMPAT(nextaddr)) {
V_ip6stat.ip6s_badoptions++;
m_freem(m);
return (-1);
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst) ||
IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
V_ip6stat.ip6s_badoptions++;
m_freem(m);
return (-1);
}
/*
* Determine the scope zone of the next hop, based on the interface
* of the current hop. [RFC4007, Section 9]
* Then disambiguate the scope zone for the next hop (if necessary).
*/
if ((ifa = ip6_getdstifaddr(m)) == NULL)
goto bad;
if (in6_setscope(nextaddr, ifa->ia_ifp, NULL) != 0) {
V_ip6stat.ip6s_badscope++;
goto bad;
}
/*
* Swap the IPv6 destination address and nextaddr. Forward the packet.
*/
tmpaddr = *nextaddr;
*nextaddr = ip6->ip6_dst;
in6_clearscope(nextaddr); /* XXX */
ip6->ip6_dst = tmpaddr;
#ifdef COMPAT_RFC1883
if (rh0->ip6r0_slmap[index / 8] & (1 << (7 - (index % 8))))
ip6_forward(m, IPV6_SRCRT_NEIGHBOR);
else
ip6_forward(m, IPV6_SRCRT_NOTNEIGHBOR);
#else
ip6_forward(m, 1);
#endif
return (-1); /* m would be freed in ip6_forward() */
bad:
m_freem(m);
return (-1);
}
#endif
Index: head/sys/netinet6/scope6.c
===================================================================
--- head/sys/netinet6/scope6.c (revision 183549)
+++ head/sys/netinet6/scope6.c (revision 183550)
@@ -1,488 +1,497 @@
/*-
* Copyright (C) 2000 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/route.h>
#include <net/if.h>
#include <netinet/in.h>
+#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/scope6_var.h>
#ifdef ENABLE_DEFAULT_SCOPE
int ip6_use_defzone = 1;
#else
int ip6_use_defzone = 0;
#endif
/*
* The scope6_lock protects the global sid default stored in
* sid_default below.
*/
static struct mtx scope6_lock;
#define SCOPE6_LOCK_INIT() mtx_init(&scope6_lock, "scope6_lock", NULL, MTX_DEF)
#define SCOPE6_LOCK() mtx_lock(&scope6_lock)
#define SCOPE6_UNLOCK() mtx_unlock(&scope6_lock)
#define SCOPE6_LOCK_ASSERT() mtx_assert(&scope6_lock, MA_OWNED)
static struct scope6_id sid_default;
#define SID(ifp) \
(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id)
void
scope6_init(void)
{
+ INIT_VNET_INET6(curvnet);
SCOPE6_LOCK_INIT();
bzero(&V_sid_default, sizeof(V_sid_default));
}
struct scope6_id *
scope6_ifattach(struct ifnet *ifp)
{
struct scope6_id *sid;
sid = (struct scope6_id *)malloc(sizeof(*sid), M_IFADDR, M_WAITOK);
bzero(sid, sizeof(*sid));
/*
* XXX: IPV6_ADDR_SCOPE_xxx macros are not standard.
* Should we rather hardcode here?
*/
sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index;
sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index;
#ifdef MULTI_SCOPE
/* by default, we don't care about scope boundary for these scopes. */
sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL] = 1;
sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL] = 1;
#endif
return sid;
}
void
scope6_ifdetach(struct scope6_id *sid)
{
free(sid, M_IFADDR);
}
int
scope6_set(struct ifnet *ifp, struct scope6_id *idlist)
{
+ INIT_VNET_NET(ifp->if_vnet);
int i;
int error = 0;
struct scope6_id *sid = NULL;
IF_AFDATA_LOCK(ifp);
sid = SID(ifp);
if (!sid) { /* paranoid? */
IF_AFDATA_UNLOCK(ifp);
return (EINVAL);
}
/*
* XXX: We need more consistency checks of the relationship among
* scopes (e.g. an organization should be larger than a site).
*/
/*
* TODO(XXX): after setting, we should reflect the changes to
* interface addresses, routing table entries, PCB entries...
*/
SCOPE6_LOCK();
for (i = 0; i < 16; i++) {
if (idlist->s6id_list[i] &&
idlist->s6id_list[i] != sid->s6id_list[i]) {
/*
* An interface zone ID must be the corresponding
* interface index by definition.
*/
if (i == IPV6_ADDR_SCOPE_INTFACELOCAL &&
idlist->s6id_list[i] != ifp->if_index) {
IF_AFDATA_UNLOCK(ifp);
SCOPE6_UNLOCK();
return (EINVAL);
}
if (i == IPV6_ADDR_SCOPE_LINKLOCAL &&
idlist->s6id_list[i] > V_if_index) {
/*
* XXX: theoretically, there should be no
* relationship between link IDs and interface
* IDs, but we check the consistency for
* safety in later use.
*/
IF_AFDATA_UNLOCK(ifp);
SCOPE6_UNLOCK();
return (EINVAL);
}
/*
* XXX: we still need lots of work in this case,
* but we simply set the new value in this initial
* implementation.
*/
sid->s6id_list[i] = idlist->s6id_list[i];
}
}
SCOPE6_UNLOCK();
IF_AFDATA_UNLOCK(ifp);
return (error);
}
int
scope6_get(struct ifnet *ifp, struct scope6_id *idlist)
{
/* We only need to lock the interface's afdata for SID() to work. */
IF_AFDATA_LOCK(ifp);
struct scope6_id *sid = SID(ifp);
if (sid == NULL) { /* paranoid? */
IF_AFDATA_UNLOCK(ifp);
return (EINVAL);
}
SCOPE6_LOCK();
*idlist = *sid;
SCOPE6_UNLOCK();
IF_AFDATA_UNLOCK(ifp);
return (0);
}
/*
* Get the scope of the address: node-local, link-local, site-local, or
* global (worked examples follow this function).
*/
int
in6_addrscope(struct in6_addr *addr)
{
int scope;
if (addr->s6_addr[0] == 0xfe) {
scope = addr->s6_addr[1] & 0xc0;
switch (scope) {
case 0x80:
return IPV6_ADDR_SCOPE_LINKLOCAL;
break;
case 0xc0:
return IPV6_ADDR_SCOPE_SITELOCAL;
break;
default:
return IPV6_ADDR_SCOPE_GLOBAL; /* just in case */
break;
}
}
if (addr->s6_addr[0] == 0xff) {
scope = addr->s6_addr[1] & 0x0f;
/*
* Because of other scope values, such as reserved,
* simply returning the scope field doesn't work.
*/
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return IPV6_ADDR_SCOPE_INTFACELOCAL;
break;
case IPV6_ADDR_SCOPE_LINKLOCAL:
return IPV6_ADDR_SCOPE_LINKLOCAL;
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
return IPV6_ADDR_SCOPE_SITELOCAL;
break;
default:
return IPV6_ADDR_SCOPE_GLOBAL;
break;
}
}
/*
* Regard loopback and unspecified addresses as global, since
* they have no ambiguity.
*/
if (bcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) {
if (addr->s6_addr[15] == 1) /* loopback */
return IPV6_ADDR_SCOPE_LINKLOCAL;
if (addr->s6_addr[15] == 0) /* unspecified */
return IPV6_ADDR_SCOPE_GLOBAL; /* XXX: correct? */
}
return IPV6_ADDR_SCOPE_GLOBAL;
}
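/*
 * A minimal illustrative sketch, not part of the original sources:
 * worked examples of the classification above, written as a userland
 * test (assumes the IPV6_ADDR_SCOPE_* constants and in6_addrscope()
 * are visible).
 */
#if 0
#include <arpa/inet.h>
#include <assert.h>
#include <netinet/in.h>

static void
addrscope_examples(void)
{
	struct in6_addr a;

	inet_pton(AF_INET6, "fe80::1", &a);	/* fe80::/10 */
	assert(in6_addrscope(&a) == IPV6_ADDR_SCOPE_LINKLOCAL);
	inet_pton(AF_INET6, "ff05::2", &a);	/* mcast, scope field 0x5 */
	assert(in6_addrscope(&a) == IPV6_ADDR_SCOPE_SITELOCAL);
	inet_pton(AF_INET6, "2001:db8::1", &a);	/* ordinary unicast */
	assert(in6_addrscope(&a) == IPV6_ADDR_SCOPE_GLOBAL);
	inet_pton(AF_INET6, "::1", &a);		/* loopback special case */
	assert(in6_addrscope(&a) == IPV6_ADDR_SCOPE_LINKLOCAL);
}
#endif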
/*
* ifp - note that this might be NULL
*/
void
scope6_setdefault(struct ifnet *ifp)
{
+ INIT_VNET_INET6(ifp->if_vnet);
+
/*
* Currently, this function just sets the default "interfaces"
* and "links" according to the given interface.
* We might eventually have to separate the notion of "link" from
* "interface" and provide a user interface to set the default.
*/
SCOPE6_LOCK();
if (ifp) {
V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] =
ifp->if_index;
V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] =
ifp->if_index;
} else {
V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0;
V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0;
}
SCOPE6_UNLOCK();
}
int
scope6_get_default(struct scope6_id *idlist)
{
+ INIT_VNET_INET6(curvnet);
SCOPE6_LOCK();
*idlist = V_sid_default;
SCOPE6_UNLOCK();
return (0);
}
u_int32_t
scope6_addr2default(struct in6_addr *addr)
{
+ INIT_VNET_INET6(curvnet);
u_int32_t id;
/*
* Special case: the loopback address should be considered
* link-local, but there's no ambiguity in the syntax.
*/
if (IN6_IS_ADDR_LOOPBACK(addr))
return (0);
/*
* XXX: a 32-bit read is atomic on all our platforms; is it OK
* not to lock here?
*/
SCOPE6_LOCK();
id = V_sid_default.s6id_list[in6_addrscope(addr)];
SCOPE6_UNLOCK();
return (id);
}
/*
* Validate the specified scope zone ID in the sin6_scope_id field.  If the ID
* is unspecified (=0) but needs to be specified, and the default zone ID can
* be used, the default value will be used.
* This routine then generates the kernel-internal form: if the address scope
* is interface-local or link-local, embed the interface index in the
* address.
*/
int
sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok)
{
+ INIT_VNET_NET(curvnet);
struct ifnet *ifp;
u_int32_t zoneid;
if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok)
zoneid = scope6_addr2default(&sin6->sin6_addr);
if (zoneid != 0 &&
(IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) {
/*
* At this moment, we only check interface-local and
* link-local scope IDs, and use interface indices as the
* zone IDs assuming a one-to-one mapping between interfaces
* and links.
*/
if (V_if_index < zoneid)
return (ENXIO);
ifp = ifnet_byindex(zoneid);
if (ifp == NULL) /* XXX: this can happen for some OS */
return (ENXIO);
/* XXX assignment to 16bit from 32bit variable */
sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
sin6->sin6_scope_id = 0;
}
return 0;
}
/*
* Generate a standard sockaddr_in6 from the embedded form.
*/
int
sa6_recoverscope(struct sockaddr_in6 *sin6)
{
+ INIT_VNET_NET(curvnet);
char ip6buf[INET6_ADDRSTRLEN];
u_int32_t zoneid;
if (sin6->sin6_scope_id != 0) {
log(LOG_NOTICE,
"sa6_recoverscope: assumption failure (non 0 ID): %s%%%d\n",
ip6_sprintf(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id);
/* XXX: proceed anyway... */
}
if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) {
/*
* KAME assumption: link id == interface id
*/
zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]);
if (zoneid) {
/* sanity check */
if (zoneid < 0 || V_if_index < zoneid)
return (ENXIO);
if (!ifnet_byindex(zoneid))
return (ENXIO);
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = zoneid;
}
}
return 0;
}
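/*
 * A minimal illustrative sketch, not part of the original sources: the
 * embed/recover round trip performed by sa6_embedscope() and
 * sa6_recoverscope() above.  Internally, fe80::1%2 is stored as
 * fe80:2::1; s6_addr16 is the kernel-only accessor, so this is a
 * kernel-context sketch.
 */
#if 0
static void
embed_recover_example(struct sockaddr_in6 *sin6, u_int32_t zoneid)
{
	/* Embed: stash the zone ID in the second 16-bit word. */
	sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
	sin6->sin6_scope_id = 0;

	/* Recover: pull it back out into sin6_scope_id. */
	sin6->sin6_scope_id = ntohs(sin6->sin6_addr.s6_addr16[1]);
	sin6->sin6_addr.s6_addr16[1] = 0;
}
#endif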
/*
* Determine the appropriate scope zone ID for in6 and ifp. If ret_id is
* non-NULL, it is set to the zone ID.  If the zone ID needs to be embedded
* in the in6_addr structure, in6 will be modified.
*
* ret_id - unnecessary?
*/
int
in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id)
{
int scope;
u_int32_t zoneid = 0;
struct scope6_id *sid;
IF_AFDATA_LOCK(ifp);
sid = SID(ifp);
#ifdef DIAGNOSTIC
if (sid == NULL) { /* should not happen */
panic("in6_setscope: scope array is NULL");
/* NOTREACHED */
}
#endif
/*
* special case: the loopback address can only belong to a loopback
* interface.
*/
if (IN6_IS_ADDR_LOOPBACK(in6)) {
if (!(ifp->if_flags & IFF_LOOPBACK)) {
IF_AFDATA_UNLOCK(ifp);
return (EINVAL);
} else {
if (ret_id != NULL)
*ret_id = 0; /* there's no ambiguity */
IF_AFDATA_UNLOCK(ifp);
return (0);
}
}
scope = in6_addrscope(in6);
SCOPE6_LOCK();
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL];
break;
case IPV6_ADDR_SCOPE_LINKLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL];
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL];
break;
case IPV6_ADDR_SCOPE_ORGLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL];
break;
default:
zoneid = 0; /* XXX: treat as global. */
break;
}
SCOPE6_UNLOCK();
IF_AFDATA_UNLOCK(ifp);
if (ret_id != NULL)
*ret_id = zoneid;
if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6))
in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */
return (0);
}
/*
* Just clear the embedded scope identifier. Return 0 if the original address
* is intact; return non-zero if the address was modified.
*/
int
in6_clearscope(struct in6_addr *in6)
{
int modified = 0;
if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) {
if (in6->s6_addr16[1] != 0)
modified = 1;
in6->s6_addr16[1] = 0;
}
return (modified);
}
Index: head/sys/netinet6/udp6_usrreq.c
===================================================================
--- head/sys/netinet6/udp6_usrreq.c (revision 183549)
+++ head/sys/netinet6/udp6_usrreq.c (revision 183550)
@@ -1,1031 +1,1048 @@
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $
* $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $
*/
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip6.h>
#include <netinet/icmp_var.h>
#include <netinet/icmp6.h>
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/udp6_var.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */
#include <security/mac/mac_framework.h>
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
extern struct protosw inetsw[];
static void udp6_detach(struct socket *so);
static void
udp6_append(struct inpcb *inp, struct mbuf *n, int off,
struct sockaddr_in6 *fromsa)
{
+ INIT_VNET_INET(inp->inp_vnet);
struct socket *so;
struct mbuf *opts;
INP_LOCK_ASSERT(inp);
#ifdef IPSEC
/* Check AH/ESP integrity. */
if (ipsec6_in_reject(n, inp)) {
+ INIT_VNET_IPSEC(inp->inp_vnet);
m_freem(n);
V_ipsec6stat.in_polvio++;
return;
}
#endif /* IPSEC */
#ifdef MAC
if (mac_inpcb_check_deliver(inp, n) != 0) {
m_freem(n);
return;
}
#endif
opts = NULL;
if (inp->in6p_flags & IN6P_CONTROLOPTS ||
inp->inp_socket->so_options & SO_TIMESTAMP)
ip6_savecontrol(inp, n, &opts);
m_adj(n, off + sizeof(struct udphdr));
so = inp->inp_socket;
SOCKBUF_LOCK(&so->so_rcv);
if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n,
opts) == 0) {
SOCKBUF_UNLOCK(&so->so_rcv);
m_freem(n);
if (opts)
m_freem(opts);
V_udpstat.udps_fullsock++;
} else
sorwakeup_locked(so);
}
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct mbuf *m = *mp;
struct ip6_hdr *ip6;
struct udphdr *uh;
struct inpcb *inp;
int off = *offp;
int plen, ulen;
struct sockaddr_in6 fromsa;
ip6 = mtod(m, struct ip6_hdr *);
if (faithprefix_p != NULL && (*faithprefix_p)(&ip6->ip6_dst)) {
/* XXX send icmp6 host/port unreach? */
m_freem(m);
return (IPPROTO_DONE);
}
#ifndef PULLDOWN_TEST
IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE);
ip6 = mtod(m, struct ip6_hdr *);
uh = (struct udphdr *)((caddr_t)ip6 + off);
#else
IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh));
if (!uh)
return (IPPROTO_DONE);
#endif
V_udpstat.udps_ipackets++;
/*
* Destination port of 0 is illegal, based on RFC768.
*/
if (uh->uh_dport == 0)
goto badunlocked;
plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6);
ulen = ntohs((u_short)uh->uh_ulen);
if (plen != ulen) {
V_udpstat.udps_badlen++;
goto badunlocked;
}
/*
* Checksum extended UDP header and data.
*/
if (uh->uh_sum == 0) {
V_udpstat.udps_nosum++;
goto badunlocked;
}
if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) {
V_udpstat.udps_badsum++;
goto badunlocked;
}
/*
* Construct sockaddr format source address.
*/
init_sin6(&fromsa, m);
fromsa.sin6_port = uh->uh_sport;
INP_INFO_RLOCK(&V_udbinfo);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct inpcb *last;
/*
* In the event that laddr should be set to the link-local
* address (this happens in RIPng), the multicast address
* specified in the received packet will not match laddr. To
* handle this situation, matching is relaxed if the
* receiving interface is the same as one specified in the
* socket and if the destination multicast address matches
* one of the multicast groups specified in the socket.
*/
/*
* KAME note: traditionally we dropped udpiphdr from mbuf
* here. We need udphdr for IPsec processing so we do that
* later.
*/
last = NULL;
LIST_FOREACH(inp, &V_udb, inp_list) {
if ((inp->inp_vflag & INP_IPV6) == 0)
continue;
if (inp->in6p_lport != uh->uh_dport)
continue;
/*
* XXX: Do not check source port of incoming datagram
* unless inp_connect() has been called to bind the
* fport part of the 4-tuple; the source could be
* trying to talk to us with an ephemeral port.
*/
if (inp->inp_fport != 0 &&
inp->inp_fport != uh->uh_sport)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
&ip6->ip6_dst))
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
&ip6->ip6_src) ||
inp->in6p_fport != uh->uh_sport)
continue;
}
if (last != NULL) {
struct mbuf *n;
if ((n = m_copy(m, 0, M_COPYALL)) != NULL) {
INP_RLOCK(last);
udp6_append(last, n, off, &fromsa);
INP_RUNLOCK(last);
}
}
last = inp;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids
* searching through all pcbs in the common case of a
* non-shared port. It assumes that an application
* will never clear these options after setting them.
*/
if ((last->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
if (last == NULL) {
/*
* No matching pcb found; discard datagram. (No need
* to send an ICMP Port Unreachable for a broadcast
* or multicast datagram.)
*/
V_udpstat.udps_noport++;
V_udpstat.udps_noportmcast++;
goto badheadlocked;
}
INP_RLOCK(last);
INP_INFO_RUNLOCK(&V_udbinfo);
udp6_append(last, m, off, &fromsa);
INP_RUNLOCK(last);
return (IPPROTO_DONE);
}
/*
* Locate pcb for datagram.
*/
inp = in6_pcblookup_hash(&V_udbinfo, &ip6->ip6_src, uh->uh_sport,
&ip6->ip6_dst, uh->uh_dport, 1, m->m_pkthdr.rcvif);
if (inp == NULL) {
if (udp_log_in_vain) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_INFO,
"Connection attempt to UDP [%s]:%d from [%s]:%d\n",
ip6_sprintf(ip6bufd, &ip6->ip6_dst),
ntohs(uh->uh_dport),
ip6_sprintf(ip6bufs, &ip6->ip6_src),
ntohs(uh->uh_sport));
}
V_udpstat.udps_noport++;
if (m->m_flags & M_MCAST) {
printf("UDP6: M_MCAST is set in a unicast packet.\n");
V_udpstat.udps_noportmcast++;
goto badheadlocked;
}
INP_INFO_RUNLOCK(&V_udbinfo);
if (V_udp_blackhole)
goto badunlocked;
if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0)
goto badunlocked;
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
return (IPPROTO_DONE);
}
INP_RLOCK(inp);
INP_INFO_RUNLOCK(&V_udbinfo);
udp6_append(inp, m, off, &fromsa);
INP_RUNLOCK(inp);
return (IPPROTO_DONE);
badheadlocked:
INP_INFO_RUNLOCK(&V_udbinfo);
badunlocked:
if (m)
m_freem(m);
return (IPPROTO_DONE);
}
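The SO_REUSEPORT/SO_REUSEADDR heuristic above matters chiefly to multicast
listeners: once two sockets share a port this way, udp6_input() must
duplicate each datagram with m_copy() and append a copy to every matching
PCB, which is what the last/n dance implements. A hedged userland sketch of
the case being served (group, port, and interface name are illustrative
only, and error handling is abbreviated):

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <net/if.h>
	#include <string.h>

	static int
	open_shared_listener(const char *ifname)
	{
		struct sockaddr_in6 sin6;
		struct ipv6_mreq mreq;
		int s, on = 1;

		if ((s = socket(AF_INET6, SOCK_DGRAM, 0)) == -1)
			return (-1);
		/* Allow a second process to bind the same port. */
		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));
		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_family = AF_INET6;
		sin6.sin6_len = sizeof(sin6);
		sin6.sin6_port = htons(5353);
		if (bind(s, (struct sockaddr *)&sin6, sizeof(sin6)) == -1)
			return (-1);
		/* Join the group on the given interface. */
		inet_pton(AF_INET6, "ff02::123", &mreq.ipv6mr_multiaddr);
		mreq.ipv6mr_interface = if_nametoindex(ifname);
		setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
		    &mreq, sizeof(mreq));
		return (s);
	}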
void
udp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
+ INIT_VNET_INET(curvnet);
struct udphdr uh;
struct ip6_hdr *ip6;
struct mbuf *m;
int off = 0;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
struct udp_portonly {
u_int16_t uh_sport;
u_int16_t uh_dport;
} *uhp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
}
if (ip6) {
/*
* XXX: We assume that when IPV6 is non-NULL,
* M and OFF are valid.
*/
/* Check if we can safely examine src and dst ports. */
if (m->m_pkthdr.len < off + sizeof(*uhp))
return;
bzero(&uh, sizeof(uh));
m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh);
(void) in6_pcbnotify(&V_udbinfo, sa, uh.uh_dport,
(struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd,
cmdarg, notify);
} else
(void) in6_pcbnotify(&V_udbinfo, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
}
static int
udp6_getcred(SYSCTL_HANDLER_ARGS)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_INET6(curvnet);
struct xucred xuc;
struct sockaddr_in6 addrs[2];
struct inpcb *inp;
int error;
error = priv_check(req->td, PRIV_NETINET_GETCRED);
if (error)
return (error);
if (req->newlen != sizeof(addrs))
return (EINVAL);
if (req->oldlen != sizeof(struct xucred))
return (EINVAL);
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
(error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
return (error);
}
INP_INFO_RLOCK(&V_udbinfo);
inp = in6_pcblookup_hash(&V_udbinfo, &addrs[1].sin6_addr,
addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, 1,
NULL);
if (inp != NULL) {
INP_RLOCK(inp);
INP_INFO_RUNLOCK(&V_udbinfo);
if (inp->inp_socket == NULL)
error = ENOENT;
if (error == 0)
error = cr_canseesocket(req->td->td_ucred,
inp->inp_socket);
if (error == 0)
cru2x(inp->inp_socket->so_cred, &xuc);
INP_RUNLOCK(inp);
} else {
INP_INFO_RUNLOCK(&V_udbinfo);
error = ENOENT;
}
if (error == 0)
error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
return (error);
}
SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
static int
udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6,
struct mbuf *control, struct thread *td)
{
+ INIT_VNET_INET(curvnet);
+ INIT_VNET_INET6(curvnet);
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr *laddr, *faddr;
struct sockaddr_in6 *sin6 = NULL;
struct ifnet *oifp = NULL;
int scope_ambiguous = 0;
u_short fport;
int error = 0;
struct ip6_pktopts *optp, opt;
int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
int flags;
struct sockaddr_in6 tmp;
INP_WLOCK_ASSERT(inp);
if (addr6) {
/* addr6 has been validated in udp6_send(). */
sin6 = (struct sockaddr_in6 *)addr6;
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/*
* Application should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
return (error);
}
if (control) {
if ((error = ip6_setpktopts(control, &opt,
inp->in6p_outputopts, td->td_ucred, IPPROTO_UDP)) != 0)
goto release;
optp = &opt;
} else
optp = inp->in6p_outputopts;
if (sin6) {
faddr = &sin6->sin6_addr;
/*
* IPv4 version of udp_output calls in_pcbconnect in this case,
* which needs splnet and affects performance.
* Since we saw no essential reason for calling in_pcbconnect,
* we get rid of such kind of logic, and call in6_selectsrc
* and in6_pcbsetport in order to fill in the local address
* and the local port.
*/
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
/* how about ::ffff:0.0.0.0 case? */
error = EISCONN;
goto release;
}
fport = sin6->sin6_port; /* allow 0 port */
if (IN6_IS_ADDR_V4MAPPED(faddr)) {
if ((inp->in6p_flags & IN6P_IPV6_V6ONLY)) {
/*
* I believe we should explicitly discard the
* packet when mapped addresses are disabled,
* rather than send the packet as an IPv6 one.
* If we chose the latter approach, the packet
* might be sent out on the wire based on the
* default route, a situation which we'd
* probably want to avoid.
* (20010421 jinmei@kame.net)
*/
error = EINVAL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
!IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
/*
* When the remote addr is an IPv4-mapped address,
* the local addr should not be an IPv6 address,
* since you cannot determine how to map an IPv6
* source address to IPv4.
*/
error = EINVAL;
goto release;
}
af = AF_INET;
}
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
laddr = in6_selectsrc(sin6, optp, inp, NULL,
td->td_ucred, &oifp, &error);
if (oifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr,
oifp, NULL))) {
goto release;
}
} else
laddr = &inp->in6p_laddr; /* XXX */
if (laddr == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
if (inp->in6p_lport == 0 &&
(error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0)
goto release;
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = ENOTCONN;
goto release;
}
if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) {
if ((inp->in6p_flags & IN6P_IPV6_V6ONLY)) {
/*
* XXX: this case would happen when the
* application sets the V6ONLY flag after
* connecting to the foreign address.
* Such applications should be fixed,
* so we bark here.
*/
log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
"option was set for a connected socket\n");
error = EINVAL;
goto release;
} else
af = AF_INET;
}
laddr = &inp->in6p_laddr;
faddr = &inp->in6p_faddr;
fport = inp->in6p_fport;
}
if (af == AF_INET)
hlen = sizeof(struct ip);
/*
* Calculate data length and get an mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT);
if (m == 0) {
error = ENOBUFS;
goto release;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
udp6->uh_sport = inp->in6p_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
if (plen <= 0xffff)
udp6->uh_ulen = htons((u_short)plen);
else
udp6->uh_ulen = 0;
udp6->uh_sum = 0;
switch (af) {
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = inp->in6p_flowinfo & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
#if 0 /* ip6_plen will be filled in ip6_output. */
ip6->ip6_plen = htons((u_short)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_hlim = in6_selecthlim(inp, NULL);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
if ((udp6->uh_sum = in6_cksum(m, IPPROTO_UDP,
sizeof(struct ip6_hdr), plen)) == 0) {
udp6->uh_sum = 0xffff;
}
flags = 0;
V_udpstat.udps_opackets++;
error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,
NULL, inp);
break;
case AF_INET:
error = EAFNOSUPPORT;
goto release;
}
goto releaseopt;
release:
m_freem(m);
releaseopt:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
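The zone-ID workaround at the top of udp6_output() exists because many
applications leave sin6_scope_id zero. The portable way for an application
to sidestep the ambiguity is to carry the zone in the address literal and
let getaddrinfo(3) fill in sin6_scope_id; a minimal sketch (host, service,
and interface name are illustrative only):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netdb.h>
	#include <string.h>

	static struct addrinfo *
	resolve_scoped(void)
	{
		struct addrinfo hints, *res;

		memset(&hints, 0, sizeof(hints));
		hints.ai_family = AF_INET6;
		hints.ai_socktype = SOCK_DGRAM;
		/*
		 * "%em0" names the link-local zone explicitly, so the
		 * resulting sockaddr_in6 carries a non-zero sin6_scope_id
		 * and the kernel never has to guess the outgoing zone.
		 */
		if (getaddrinfo("fe80::1%em0", "9999", &hints, &res) != 0)
			return (NULL);
		return (res);		/* caller must freeaddrinfo(res) */
	}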
static void
udp6_abort(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_abort: inp == NULL"));
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
(*pru->pru_abort)(so);
return;
}
#endif
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
soisdisconnected(so);
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
udp6_attach(struct socket *so, int proto, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp == NULL, ("udp6_attach: inp != NULL"));
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, udp_sendspace, udp_recvspace);
if (error)
return (error);
}
INP_INFO_WLOCK(&V_udbinfo);
error = in_pcballoc(so, &V_udbinfo);
if (error) {
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
inp = (struct inpcb *)so->so_pcb;
INP_INFO_WUNLOCK(&V_udbinfo);
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
inp->in6p_hops = -1; /* use kernel default */
inp->in6p_cksum = -1; /* just to be sure */
/*
* XXX: ugly!!
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
* which may match an IPv4-mapped IPv6 address.
*/
inp->inp_ip_ttl = V_ip_defttl;
INP_WUNLOCK(inp);
return (0);
}
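Note how udp6_attach() marks the new PCB INP_IPV4 as well unless
IN6P_IPV6_V6ONLY is set; that is what makes an unbound IPv6 UDP socket
reachable through IPv4-mapped addresses, and why the IPv4 TTL must be
initialized here too. An application wanting a pure IPv6 socket flips the
knob before bind(2); a minimal sketch:

	int s = socket(AF_INET6, SOCK_DGRAM, 0);
	int on = 1;

	/* Refuse IPv4-mapped traffic on this socket. */
	setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));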
static int
udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_bind: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
struct sockaddr_in6 *sin6_p;
sin6_p = (struct sockaddr_in6 *)nam;
if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr))
inp->inp_vflag |= INP_IPV4;
else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
struct sockaddr_in sin;
in6_sin6_2_sin(&sin, sin6_p);
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
error = in_pcbbind(inp, (struct sockaddr *)&sin,
td->td_ucred);
goto out;
}
}
error = in6_pcbbind(inp, nam, td->td_ucred);
out:
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
static void
udp6_close(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_close: inp == NULL"));
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
(*pru->pru_disconnect)(so);
return;
}
#endif
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
soisdisconnected(so);
}
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_connect: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
struct sockaddr_in6 *sin6_p;
sin6_p = (struct sockaddr_in6 *)nam;
if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
struct sockaddr_in sin;
if (inp->inp_faddr.s_addr != INADDR_ANY) {
error = EISCONN;
goto out;
}
in6_sin6_2_sin(&sin, sin6_p);
error = in_pcbconnect(inp, (struct sockaddr *)&sin,
td->td_ucred);
if (error == 0) {
inp->inp_vflag |= INP_IPV4;
inp->inp_vflag &= ~INP_IPV6;
soisconnected(so);
}
goto out;
}
}
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = EISCONN;
goto out;
}
error = in6_pcbconnect(inp, nam, td->td_ucred);
if (error == 0) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
/* should be a non-mapped addr */
inp->inp_vflag &= ~INP_IPV4;
inp->inp_vflag |= INP_IPV6;
}
soisconnected(so);
}
out:
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
}
static void
udp6_detach(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
in6_pcbdetach(inp);
in6_pcbfree(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
}
static int
udp6_disconnect(struct socket *so)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
#ifdef INET
if (inp->inp_vflag & INP_IPV4) {
struct pr_usrreqs *pru;
pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
error = (*pru->pru_disconnect)(so);
goto out;
}
#endif
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
error = ENOTCONN;
goto out;
}
in6_pcbdisconnect(inp);
inp->in6p_laddr = in6addr_any;
/* XXXRW: so_state locking? */
so->so_state &= ~SS_ISCONNECTED; /* XXX */
out:
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (0);
}
static int
udp6_send(struct socket *so, int flags, struct mbuf *m,
struct sockaddr *addr, struct mbuf *control, struct thread *td)
{
+ INIT_VNET_INET(so->so_vnet);
struct inpcb *inp;
int error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("udp6_send: inp == NULL"));
INP_INFO_WLOCK(&V_udbinfo);
INP_WLOCK(inp);
if (addr) {
if (addr->sa_len != sizeof(struct sockaddr_in6)) {
error = EINVAL;
goto bad;
}
if (addr->sa_family != AF_INET6) {
error = EAFNOSUPPORT;
goto bad;
}
}
#ifdef INET
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
int hasv4addr;
struct sockaddr_in6 *sin6 = 0;
if (addr == 0)
hasv4addr = (inp->inp_vflag & INP_IPV4);
else {
sin6 = (struct sockaddr_in6 *)addr;
hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
? 1 : 0;
}
if (hasv4addr) {
struct pr_usrreqs *pru;
if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
!IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) {
/*
* When the remote addr is an IPv4-mapped address,
* the local addr should not be an IPv6 address,
* since you cannot determine how to map an IPv6
* source address to IPv4.
*/
error = EINVAL;
goto out;
}
/*
* XXXRW: We release UDP-layer locks before calling
* udp_send() in order to avoid recursion. However,
* this does mean there is a short window where inp's
* fields are unstable. Could this lead to a
* potential race in which the factors causing us to
* select the UDPv4 output routine are invalidated?
*/
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
if (sin6)
in6_sin6_2_sin_in_sock(addr);
pru = inetsw[ip_protox[IPPROTO_UDP]].pr_usrreqs;
/* addr will just be freed in sendit(). */
return ((*pru->pru_send)(so, flags, m, addr, control,
td));
}
}
#endif
#ifdef MAC
mac_inpcb_create_mbuf(inp, m);
#endif
error = udp6_output(inp, m, addr, control, td);
out:
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
return (error);
bad:
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&V_udbinfo);
m_freem(m);
return (error);
}
struct pr_usrreqs udp6_usrreqs = {
.pru_abort = udp6_abort,
.pru_attach = udp6_attach,
.pru_bind = udp6_bind,
.pru_connect = udp6_connect,
.pru_control = in6_control,
.pru_detach = udp6_detach,
.pru_disconnect = udp6_disconnect,
.pru_peeraddr = in6_mapped_peeraddr,
.pru_send = udp6_send,
.pru_shutdown = udp_shutdown,
.pru_sockaddr = in6_mapped_sockaddr,
.pru_soreceive = soreceive_dgram,
.pru_sosend = sosend_dgram,
.pru_sosetlabel = in_pcbsosetlabel,
.pru_close = udp6_close
};
Index: head/sys/netinet6/vinet6.h
===================================================================
--- head/sys/netinet6/vinet6.h (nonexistent)
+++ head/sys/netinet6/vinet6.h (revision 183550)
@@ -0,0 +1,259 @@
+/*-
+ * Copyright (c) 2006-2008 University of Zagreb
+ * Copyright (c) 2006-2008 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET6_VINET6_H_
+#define _NETINET6_VINET6_H_
+
+#ifdef VIMAGE
+#include <sys/socket.h>
+#include <netinet/ip6.h>
+#include <net/if.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/raw_ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/in6_ifattach.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/in_pcb.h>
+
+struct vnet_inet6 {
+ struct in6_ifaddr * _in6_ifaddr;
+
+ u_int _frag6_nfragpackets;
+ u_int _frag6_nfrags;
+ struct ip6q _ip6q;
+
+ struct route_in6 _ip6_forward_rt;
+
+ struct in6_addrpolicy _defaultaddrpolicy;
+ TAILQ_HEAD(, addrsel_policyent) _addrsel_policytab;
+ u_int _in6_maxmtu;
+ int _ip6_auto_linklocal;
+ int _rtq_minreallyold6;
+ int _rtq_reallyold6;
+ int _rtq_toomany6;
+
+ struct ip6stat _ip6stat;
+ struct rip6stat _rip6stat;
+ struct icmp6stat _icmp6stat;
+
+ int _rtq_timeout6;
+ struct callout _rtq_timer6;
+ struct callout _rtq_mtutimer;
+ struct callout _nd6_slowtimo_ch;
+ struct callout _nd6_timer_ch;
+ struct callout _in6_tmpaddrtimer_ch;
+
+ int _nd6_inuse;
+ int _nd6_allocated;
+ struct llinfo_nd6 _llinfo_nd6;
+ struct nd_drhead _nd_defrouter;
+ struct nd_prhead _nd_prefix;
+ struct ifnet * _nd6_defifp;
+ int _nd6_defifindex;
+
+ struct scope6_id _sid_default;
+
+ TAILQ_HEAD(, dadq) _dadq;
+ int _dad_init;
+
+ int _icmp6errpps_count;
+ int _icmp6errppslim_last;
+
+ int _ip6_forwarding;
+ int _ip6_sendredirects;
+ int _ip6_defhlim;
+ int _ip6_defmcasthlim;
+ int _ip6_accept_rtadv;
+ int _ip6_maxfragpackets;
+ int _ip6_maxfrags;
+ int _ip6_log_interval;
+ int _ip6_hdrnestlimit;
+ int _ip6_dad_count;
+ int _ip6_auto_flowlabel;
+ int _ip6_use_deprecated;
+ int _ip6_rr_prune;
+ int _ip6_mcast_pmtu;
+ int _ip6_v6only;
+ int _ip6_keepfaith;
+ int _ip6stealth;
+ time_t _ip6_log_time;
+
+ int _pmtu_expire;
+ int _pmtu_probe;
+ u_long _rip6_sendspace;
+ u_long _rip6_recvspace;
+ int _icmp6_rediraccept;
+ int _icmp6_redirtimeout;
+ int _icmp6errppslim;
+ int _icmp6_nodeinfo;
+ int _udp6_sendspace;
+ int _udp6_recvspace;
+ int _ip6qmaxlen;
+ int _ip6_prefer_tempaddr;
+ int _ip6_forward_srcrt;
+ int _ip6_sourcecheck;
+ int _ip6_sourcecheck_interval;
+ int _ip6_ours_check_algorithm;
+
+ int _nd6_prune;
+ int _nd6_delay;
+ int _nd6_umaxtries;
+ int _nd6_mmaxtries;
+ int _nd6_useloopback;
+ int _nd6_gctimer;
+ int _nd6_maxndopt;
+ int _nd6_maxnudhint;
+ int _nd6_maxqueuelen;
+ int _nd6_debug;
+ int _nd6_recalc_reachtm_interval;
+ int _dad_ignore_ns;
+ int _dad_maxtry;
+ int _ip6_use_tempaddr;
+ int _ip6_desync_factor;
+ u_int32_t _ip6_temp_preferred_lifetime;
+ u_int32_t _ip6_temp_valid_lifetime;
+
+ int _ip6_mrouter_ver;
+ int _pim6;
+ u_int _mrt6debug;
+
+ int _ip6_temp_regen_advance;
+ int _ip6_use_defzone;
+
+ struct ip6_pktopts _ip6_opts;
+};
+#endif
+
+
+#define INIT_VNET_INET6(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_INET6, struct vnet_inet6, vnet_inet6)
+
+#define VNET_INET6(sym) VSYM(vnet_inet6, sym)
+
+
+/*
+ * Symbol translation macros
+ */
+#define V_addrsel_policytab VNET_INET6(addrsel_policytab)
+#define V_dad_ignore_ns VNET_INET6(dad_ignore_ns)
+#define V_dad_init VNET_INET6(dad_init)
+#define V_dad_maxtry VNET_INET6(dad_maxtry)
+#define V_dadq VNET_INET6(dadq)
+#define V_defaultaddrpolicy VNET_INET6(defaultaddrpolicy)
+#define V_frag6_nfragpackets VNET_INET6(frag6_nfragpackets)
+#define V_frag6_nfrags VNET_INET6(frag6_nfrags)
+#define V_icmp6_nodeinfo VNET_INET6(icmp6_nodeinfo)
+#define V_icmp6_rediraccept VNET_INET6(icmp6_rediraccept)
+#define V_icmp6_redirtimeout VNET_INET6(icmp6_redirtimeout)
+#define V_icmp6errpps_count VNET_INET6(icmp6errpps_count)
+#define V_icmp6errppslim VNET_INET6(icmp6errppslim)
+#define V_icmp6errppslim_last VNET_INET6(icmp6errppslim_last)
+#define V_icmp6stat VNET_INET6(icmp6stat)
+#define V_in6_ifaddr VNET_INET6(in6_ifaddr)
+#define V_in6_maxmtu VNET_INET6(in6_maxmtu)
+#define V_in6_tmpaddrtimer_ch VNET_INET6(in6_tmpaddrtimer_ch)
+#define V_ip6_accept_rtadv VNET_INET6(ip6_accept_rtadv)
+#define V_ip6_auto_flowlabel VNET_INET6(ip6_auto_flowlabel)
+#define V_ip6_auto_linklocal VNET_INET6(ip6_auto_linklocal)
+#define V_ip6_dad_count VNET_INET6(ip6_dad_count)
+#define V_ip6_defhlim VNET_INET6(ip6_defhlim)
+#define V_ip6_defmcasthlim VNET_INET6(ip6_defmcasthlim)
+#define V_ip6_desync_factor VNET_INET6(ip6_desync_factor)
+#define V_ip6_forward_rt VNET_INET6(ip6_forward_rt)
+#define V_ip6_forward_srcrt VNET_INET6(ip6_forward_srcrt)
+#define V_ip6_forwarding VNET_INET6(ip6_forwarding)
+#define V_ip6_hdrnestlimit VNET_INET6(ip6_hdrnestlimit)
+#define V_ip6_keepfaith VNET_INET6(ip6_keepfaith)
+#define V_ip6_log_interval VNET_INET6(ip6_log_interval)
+#define V_ip6_log_time VNET_INET6(ip6_log_time)
+#define V_ip6_maxfragpackets VNET_INET6(ip6_maxfragpackets)
+#define V_ip6_maxfrags VNET_INET6(ip6_maxfrags)
+#define V_ip6_mcast_pmtu VNET_INET6(ip6_mcast_pmtu)
+#define V_ip6_mrouter_ver VNET_INET6(ip6_mrouter_ver)
+#define V_ip6_opts VNET_INET6(ip6_opts)
+#define V_ip6_ours_check_algorithm VNET_INET6(ip6_ours_check_algorithm)
+#define V_ip6_prefer_tempaddr VNET_INET6(ip6_prefer_tempaddr)
+#define V_ip6_rr_prune VNET_INET6(ip6_rr_prune)
+#define V_ip6_sendredirects VNET_INET6(ip6_sendredirects)
+#define V_ip6_sourcecheck VNET_INET6(ip6_sourcecheck)
+#define V_ip6_sourcecheck_interval VNET_INET6(ip6_sourcecheck_interval)
+#define V_ip6_temp_preferred_lifetime VNET_INET6(ip6_temp_preferred_lifetime)
+#define V_ip6_temp_regen_advance VNET_INET6(ip6_temp_regen_advance)
+#define V_ip6_temp_valid_lifetime VNET_INET6(ip6_temp_valid_lifetime)
+#define V_ip6_use_defzone VNET_INET6(ip6_use_defzone)
+#define V_ip6_use_deprecated VNET_INET6(ip6_use_deprecated)
+#define V_ip6_use_tempaddr VNET_INET6(ip6_use_tempaddr)
+#define V_ip6_v6only VNET_INET6(ip6_v6only)
+#define V_ip6q VNET_INET6(ip6q)
+#define V_ip6qmaxlen VNET_INET6(ip6qmaxlen)
+#define V_ip6stat VNET_INET6(ip6stat)
+#define V_ip6stealth VNET_INET6(ip6stealth)
+#define V_llinfo_nd6 VNET_INET6(llinfo_nd6)
+#define V_mrt6debug VNET_INET6(mrt6debug)
+#define V_nd6_allocated VNET_INET6(nd6_allocated)
+#define V_nd6_debug VNET_INET6(nd6_debug)
+#define V_nd6_defifindex VNET_INET6(nd6_defifindex)
+#define V_nd6_defifp VNET_INET6(nd6_defifp)
+#define V_nd6_delay VNET_INET6(nd6_delay)
+#define V_nd6_gctimer VNET_INET6(nd6_gctimer)
+#define V_nd6_inuse VNET_INET6(nd6_inuse)
+#define V_nd6_maxndopt VNET_INET6(nd6_maxndopt)
+#define V_nd6_maxnudhint VNET_INET6(nd6_maxnudhint)
+#define V_nd6_maxqueuelen VNET_INET6(nd6_maxqueuelen)
+#define V_nd6_mmaxtries VNET_INET6(nd6_mmaxtries)
+#define V_nd6_prune VNET_INET6(nd6_prune)
+#define V_nd6_recalc_reachtm_interval VNET_INET6(nd6_recalc_reachtm_interval)
+#define V_nd6_slowtimo_ch VNET_INET6(nd6_slowtimo_ch)
+#define V_nd6_timer_ch VNET_INET6(nd6_timer_ch)
+#define V_nd6_umaxtries VNET_INET6(nd6_umaxtries)
+#define V_nd6_useloopback VNET_INET6(nd6_useloopback)
+#define V_nd_defrouter VNET_INET6(nd_defrouter)
+#define V_nd_prefix VNET_INET6(nd_prefix)
+#define V_pim6 VNET_INET6(pim6)
+#define V_pmtu_expire VNET_INET6(pmtu_expire)
+#define V_pmtu_probe VNET_INET6(pmtu_probe)
+#define V_rip6_recvspace VNET_INET6(rip6_recvspace)
+#define V_rip6_sendspace VNET_INET6(rip6_sendspace)
+#define V_rip6stat VNET_INET6(rip6stat)
+#define V_rtq_minreallyold6 VNET_INET6(rtq_minreallyold6)
+#define V_rtq_mtutimer VNET_INET6(rtq_mtutimer)
+#define V_rtq_reallyold6 VNET_INET6(rtq_reallyold6)
+#define V_rtq_timeout6 VNET_INET6(rtq_timeout6)
+#define V_rtq_timer6 VNET_INET6(rtq_timer6)
+#define V_rtq_toomany6 VNET_INET6(rtq_toomany6)
+#define V_sid_default VNET_INET6(sid_default)
+#define V_udp6_recvspace VNET_INET6(udp6_recvspace)
+#define V_udp6_sendspace VNET_INET6(udp6_sendspace)
+
+#endif /* !_NETINET6_VINET6_H_ */
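The two macros above are the entire consumer interface of this header: a
function first materializes the per-vnet instance with INIT_VNET_INET6()
and then touches state only through the V_ aliases, exactly as the
udp6_usrreq.c hunks earlier in this change do. A condensed sketch of the
pattern (the function itself is illustrative, not part of the change):

	static int
	example_get_defhlim(void)
	{
		INIT_VNET_INET6(curvnet);	/* bind V_* to this vnet */

		/*
		 * Under VIMAGE this resolves to the _ip6_defhlim field of
		 * this vnet's struct vnet_inet6; in a non-virtualized build
		 * the same macros collapse back to the global symbol.
		 */
		return (V_ip6_defhlim);
	}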
Property changes on: head/sys/netinet6/vinet6.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/sys/netipsec/ipsec.c
===================================================================
--- head/sys/netipsec/ipsec.c (revision 183549)
+++ head/sys/netipsec/ipsec.c (revision 183550)
@@ -1,1974 +1,2008 @@
/* $FreeBSD$ */
/* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* IPsec controller part.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet/icmp6.h>
#endif
#include <sys/types.h>
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/ah_var.h>
#include <netipsec/esp_var.h>
#include <netipsec/ipcomp.h> /*XXX*/
#include <netipsec/ipcomp_var.h>
#include <netipsec/key.h>
#include <netipsec/keydb.h>
#include <netipsec/key_debug.h>
#include <netipsec/xform.h>
#include <machine/in_cksum.h>
#include <opencrypto/cryptodev.h>
#ifdef IPSEC_DEBUG
int ipsec_debug = 1;
#else
int ipsec_debug = 0;
#endif
/* NB: name changed so netstat doesn't use it */
struct ipsecstat ipsec4stat;
int ip4_ah_offsetmask = 0; /* maybe IP_DF? */
int ip4_ipsec_dfbit = 0; /* DF bit on encap. 0: clear 1: set 2: copy */
int ip4_esp_trans_deflev = IPSEC_LEVEL_USE;
int ip4_esp_net_deflev = IPSEC_LEVEL_USE;
int ip4_ah_trans_deflev = IPSEC_LEVEL_USE;
int ip4_ah_net_deflev = IPSEC_LEVEL_USE;
struct secpolicy ip4_def_policy;
int ip4_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */
int ip4_esp_randpad = -1;
/*
* Crypto support requirements:
*
* 1 require hardware support
* -1 require software support
* 0 take anything
*/
int crypto_support = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE;
SYSCTL_DECL(_net_inet_ipsec);
/* net.inet.ipsec */
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY,
- def_policy, CTLFLAG_RW, &ip4_def_policy.policy, 0,
- "IPsec default policy.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev,
- CTLFLAG_RW, &ip4_esp_trans_deflev, 0, "Default ESP transport mode level");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev,
- CTLFLAG_RW, &ip4_esp_net_deflev, 0, "Default ESP tunnel mode level.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev,
- CTLFLAG_RW, &ip4_ah_trans_deflev, 0, "AH transfer mode default level.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev,
- CTLFLAG_RW, &ip4_ah_net_deflev, 0, "AH tunnel mode default level.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS,
- ah_cleartos, CTLFLAG_RW, &ah_cleartos, 0,
- "If set clear type-of-service field when doing AH computation.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK,
- ah_offsetmask, CTLFLAG_RW, &ip4_ah_offsetmask, 0,
- "If not set clear offset field mask when doing AH computation.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT,
- dfbit, CTLFLAG_RW, &ip4_ipsec_dfbit, 0, "Do not fragment bit on encap.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN,
- ecn, CTLFLAG_RW, &ip4_ipsec_ecn, 0,
- "Explicit Congestion Notification handling.");
-SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG,
- debug, CTLFLAG_RW, &ipsec_debug, 0,
- "Enable IPsec debugging output when set.");
-SYSCTL_INT(_net_inet_ipsec, OID_AUTO,
- crypto_support, CTLFLAG_RW, &crypto_support, 0,
- "Crypto driver selection.");
-SYSCTL_STRUCT(_net_inet_ipsec, OID_AUTO,
- ipsecstats, CTLFLAG_RD, &ipsec4stat, ipsecstat, "IPsec IPv4 statistics.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_POLICY,
+ def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0,
+ "IPsec default policy.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV,
+ esp_trans_deflev, CTLFLAG_RW, ip4_esp_trans_deflev, 0,
+ "Default ESP transport mode level");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV,
+ esp_net_deflev, CTLFLAG_RW, ip4_esp_net_deflev, 0,
+ "Default ESP tunnel mode level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV,
+ ah_trans_deflev, CTLFLAG_RW, ip4_ah_trans_deflev, 0,
+ "AH transfer mode default level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV,
+ ah_net_deflev, CTLFLAG_RW, ip4_ah_net_deflev, 0,
+ "AH tunnel mode default level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_CLEARTOS,
+ ah_cleartos, CTLFLAG_RW, ah_cleartos, 0,
+ "If set clear type-of-service field when doing AH computation.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_AH_OFFSETMASK,
+ ah_offsetmask, CTLFLAG_RW, ip4_ah_offsetmask, 0,
+ "If not set clear offset field mask when doing AH computation.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DFBIT,
+ dfbit, CTLFLAG_RW, ip4_ipsec_dfbit, 0,
+ "Do not fragment bit on encap.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_ECN,
+ ecn, CTLFLAG_RW, ip4_ipsec_ecn, 0,
+ "Explicit Congestion Notification handling.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEBUG,
+ debug, CTLFLAG_RW, ipsec_debug, 0,
+ "Enable IPsec debugging output when set.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO,
+ crypto_support, CTLFLAG_RW, crypto_support, 0,
+ "Crypto driver selection.");
+SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO,
+ ipsecstats, CTLFLAG_RD, ipsec4stat, ipsecstat,
+ "IPsec IPv4 statistics.");
#ifdef REGRESSION
/*
* When set to 1, IPsec will send packets with the same sequence number.
* This allows one to verify whether the other side properly detects replay attacks.
*/
int ipsec_replay = 0;
-SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_replay, CTLFLAG_RW, &ipsec_replay, 0,
- "Emulate replay attack");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, test_replay,
+ CTLFLAG_RW, ipsec_replay, 0, "Emulate replay attack");
/*
* When set to 1, IPsec will send packets with a corrupted HMAC.
* This allows one to verify whether the other side properly detects modified packets.
*/
int ipsec_integrity = 0;
-SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_integrity, CTLFLAG_RW,
- &ipsec_integrity, 0, "Emulate man-in-the-middle attack");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, OID_AUTO, test_integrity,
+ CTLFLAG_RW, ipsec_integrity, 0, "Emulate man-in-the-middle attack");
#endif
#ifdef INET6
struct ipsecstat ipsec6stat;
int ip6_esp_trans_deflev = IPSEC_LEVEL_USE;
int ip6_esp_net_deflev = IPSEC_LEVEL_USE;
int ip6_ah_trans_deflev = IPSEC_LEVEL_USE;
int ip6_ah_net_deflev = IPSEC_LEVEL_USE;
int ip6_ipsec_ecn = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */
SYSCTL_DECL(_net_inet6_ipsec6);
/* net.inet6.ipsec6 */
#ifdef COMPAT_KAME
SYSCTL_OID(_net_inet6_ipsec6, IPSECCTL_STATS, stats, CTLFLAG_RD,
0, 0, compat_ipsecstats_sysctl, "S", "IPsec IPv6 statistics.");
#endif /* COMPAT_KAME */
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY,
- def_policy, CTLFLAG_RW, &ip4_def_policy.policy, 0, "IPsec default policy.");
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev,
- CTLFLAG_RW, &ip6_esp_trans_deflev, 0, "Default ESP transport mode level.");
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev,
- CTLFLAG_RW, &ip6_esp_net_deflev, 0, "Default ESP tunnel mode level.");
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev,
- CTLFLAG_RW, &ip6_ah_trans_deflev, 0, "AH transfer mode default level.");
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev,
- CTLFLAG_RW, &ip6_ah_net_deflev, 0, "AH tunnel mode default level.");
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN,
- ecn, CTLFLAG_RW, &ip6_ipsec_ecn, 0,
- "Explicit Congestion Notification handling.");
-SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG,
- debug, CTLFLAG_RW, &ipsec_debug, 0,
- "Enable IPsec debugging output when set.");
-SYSCTL_STRUCT(_net_inet6_ipsec6, IPSECCTL_STATS,
- ipsecstats, CTLFLAG_RD, &ipsec6stat, ipsecstat, "IPsec IPv6 statistics.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_POLICY,
+ def_policy, CTLFLAG_RW, ip4_def_policy.policy, 0,
+ "IPsec default policy.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV,
+ esp_trans_deflev, CTLFLAG_RW, ip6_esp_trans_deflev, 0,
+ "Default ESP transport mode level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV,
+ esp_net_deflev, CTLFLAG_RW, ip6_esp_net_deflev, 0,
+ "Default ESP tunnel mode level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV,
+ ah_trans_deflev, CTLFLAG_RW, ip6_ah_trans_deflev, 0,
+ "AH transfer mode default level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV,
+ ah_net_deflev, CTLFLAG_RW, ip6_ah_net_deflev, 0,
+ "AH tunnel mode default level.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_ECN,
+ ecn, CTLFLAG_RW, ip6_ipsec_ecn, 0,
+ "Explicit Congestion Notification handling.");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_DEBUG,
+ debug, CTLFLAG_RW, ipsec_debug, 0,
+ "Enable IPsec debugging output when set.");
+SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet6_ipsec6, IPSECCTL_STATS,
+ ipsecstats, CTLFLAG_RD, ipsec6stat, ipsecstat,
+ "IPsec IPv6 statistics.");
#endif /* INET6 */
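The mechanical shape of the conversion above: SYSCTL_INT took the address
of a global, while SYSCTL_V_INT takes the container struct name plus the
field name (no leading underscore, no &), letting the macro resolve the
per-vnet instance at access time. One representative knob from this hunk,
before and after:

	/* before: one global for the whole kernel */
	SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG,
	    debug, CTLFLAG_RW, &ipsec_debug, 0,
	    "Enable IPsec debugging output when set.");

	/* after: resolved per-vnet through struct vnet_ipsec */
	SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipsec, IPSECCTL_DEBUG,
	    debug, CTLFLAG_RW, ipsec_debug, 0,
	    "Enable IPsec debugging output when set.");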
static int ipsec4_setspidx_inpcb __P((struct mbuf *, struct inpcb *pcb));
#ifdef INET6
static int ipsec6_setspidx_in6pcb __P((struct mbuf *, struct in6pcb *pcb));
#endif
static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int));
static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int));
static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *));
#ifdef INET6
static void ipsec6_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int));
static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *));
#endif
static void ipsec_delpcbpolicy __P((struct inpcbpolicy *));
static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src));
static int ipsec_set_policy __P((struct secpolicy **pcb_sp,
int optname, caddr_t request, size_t len, struct ucred *cred));
static int ipsec_get_policy __P((struct secpolicy *pcb_sp, struct mbuf **mp));
static void vshiftl __P((unsigned char *, int, int));
static size_t ipsec_hdrsiz __P((struct secpolicy *));
MALLOC_DEFINE(M_IPSEC_INPCB, "inpcbpolicy", "inpcb-resident ipsec policy");
/*
* Return a held reference to the default SP.
*/
static struct secpolicy *
key_allocsp_default(const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP key_allocsp_default from %s:%u\n", where, tag));
sp = &V_ip4_def_policy;
if (sp->policy != IPSEC_POLICY_DISCARD &&
sp->policy != IPSEC_POLICY_NONE) {
ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n",
sp->policy, IPSEC_POLICY_NONE));
sp->policy = IPSEC_POLICY_NONE;
}
key_addref(sp);
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP key_allocsp_default returns SP:%p (%u)\n",
sp, sp->refcnt));
return sp;
}
#define KEY_ALLOCSP_DEFAULT() \
key_allocsp_default(__FILE__, __LINE__)
/*
* For an OUTBOUND packet with a socket. Search the SPD for the packet,
* and return a pointer to the SP.
* OUT: NULL: no appropriate SP found, the following value is set to error.
* 0 : bypass
* EACCES : discard packet.
* ENOENT : ipsec_acquire() in progress, maybe.
* others : error occurred.
* others: a pointer to the SP
*
* NOTE: the IPv6 mapped address concern is implemented here.
*/
struct secpolicy *
ipsec_getpolicy(struct tdb_ident *tdbi, u_int dir)
{
struct secpolicy *sp;
IPSEC_ASSERT(tdbi != NULL, ("null tdbi"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
sp = KEY_ALLOCSP2(tdbi->spi, &tdbi->dst, tdbi->proto, dir);
if (sp == NULL) /*XXX????*/
sp = KEY_ALLOCSP_DEFAULT();
IPSEC_ASSERT(sp != NULL, ("null SP"));
return sp;
}
/*
* For an OUTBOUND packet with a socket. Search the SPD for the packet,
* and return a pointer to the SP.
* OUT: NULL: no appropriate SP found, the following value is set to error.
* 0 : bypass
* EACCES : discard packet.
* ENOENT : ipsec_acquire() in progress, maybe.
* others : error occurred.
* others: a pointer to the SP
*
* NOTE: the IPv6 mapped address concern is implemented here.
*/
struct secpolicy *
ipsec_getpolicybysock(m, dir, inp, error)
struct mbuf *m;
u_int dir;
struct inpcb *inp;
int *error;
{
+ INIT_VNET_IPSEC(curvnet);
struct inpcbpolicy *pcbsp = NULL;
struct secpolicy *currsp = NULL; /* policy on socket */
struct secpolicy *sp;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(inp != NULL, ("null inpcb"));
IPSEC_ASSERT(error != NULL, ("null error"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
/* set spidx in pcb */
if (inp->inp_vflag & INP_IPV6PROTO) {
#ifdef INET6
*error = ipsec6_setspidx_in6pcb(m, inp);
pcbsp = inp->in6p_sp;
#else
*error = EINVAL; /* should not happen */
#endif
} else {
*error = ipsec4_setspidx_inpcb(m, inp);
pcbsp = inp->inp_sp;
}
if (*error)
return NULL;
IPSEC_ASSERT(pcbsp != NULL, ("null pcbsp"));
switch (dir) {
case IPSEC_DIR_INBOUND:
currsp = pcbsp->sp_in;
break;
case IPSEC_DIR_OUTBOUND:
currsp = pcbsp->sp_out;
break;
}
IPSEC_ASSERT(currsp != NULL, ("null currsp"));
if (pcbsp->priv) { /* when privileged socket */
switch (currsp->policy) {
case IPSEC_POLICY_BYPASS:
case IPSEC_POLICY_IPSEC:
key_addref(currsp);
sp = currsp;
break;
case IPSEC_POLICY_ENTRUST:
/* look for a policy in SPD */
sp = KEY_ALLOCSP(&currsp->spidx, dir);
if (sp == NULL) /* no SP found */
sp = KEY_ALLOCSP_DEFAULT();
break;
default:
ipseclog((LOG_ERR, "%s: Invalid policy for PCB %d\n",
__func__, currsp->policy));
*error = EINVAL;
return NULL;
}
} else { /* unpriv, SPD has policy */
sp = KEY_ALLOCSP(&currsp->spidx, dir);
if (sp == NULL) { /* no SP found */
switch (currsp->policy) {
case IPSEC_POLICY_BYPASS:
ipseclog((LOG_ERR, "%s: Illegal policy for "
"non-priviliged defined %d\n",
__func__, currsp->policy));
*error = EINVAL;
return NULL;
case IPSEC_POLICY_ENTRUST:
sp = KEY_ALLOCSP_DEFAULT();
break;
case IPSEC_POLICY_IPSEC:
key_addref(currsp);
sp = currsp;
break;
default:
ipseclog((LOG_ERR, "%s: Invalid policy for "
"PCB %d\n", __func__, currsp->policy));
*error = EINVAL;
return NULL;
}
}
}
IPSEC_ASSERT(sp != NULL,
("null SP (priv %u policy %u", pcbsp->priv, currsp->policy));
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s (priv %u policy %u) allocate SP:%p (refcnt %u)\n",
__func__, pcbsp->priv, currsp->policy, sp, sp->refcnt));
return sp;
}
/*
* For a FORWARDING packet or an OUTBOUND packet without a socket. Search the
* SPD for the packet, and return a pointer to the SP.
* OUT: positive: a pointer to the entry for the matched security policy leaf.
* NULL: no appropriate SP found, the following value is set to error.
* 0 : bypass
* EACCES : discard packet.
* ENOENT : ipsec_acquire() in progress, maybe.
* others : error occurred.
*/
struct secpolicy *
ipsec_getpolicybyaddr(m, dir, flag, error)
struct mbuf *m;
u_int dir;
int flag;
int *error;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicyindex spidx;
struct secpolicy *sp;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(error != NULL, ("null error"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
sp = NULL;
if (key_havesp(dir)) {
/* Make an index to look for a policy. */
*error = ipsec_setspidx(m, &spidx,
(flag & IP_FORWARDING) ? 0 : 1);
if (*error != 0) {
DPRINTF(("%s: setpidx failed, dir %u flag %u\n",
__func__, dir, flag));
return NULL;
}
spidx.dir = dir;
sp = KEY_ALLOCSP(&spidx, dir);
}
if (sp == NULL) /* no SP found, use system default */
sp = KEY_ALLOCSP_DEFAULT();
IPSEC_ASSERT(sp != NULL, ("null SP"));
return sp;
}
struct secpolicy *
ipsec4_checkpolicy(m, dir, flag, error, inp)
struct mbuf *m;
u_int dir, flag;
int *error;
struct inpcb *inp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
*error = 0;
if (inp == NULL)
sp = ipsec_getpolicybyaddr(m, dir, flag, error);
else
sp = ipsec_getpolicybysock(m, dir, inp, error);
if (sp == NULL) {
IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error"));
V_ipsec4stat.ips_out_inval++;
return NULL;
}
IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error));
switch (sp->policy) {
case IPSEC_POLICY_ENTRUST:
default:
printf("%s: invalid policy %u\n", __func__, sp->policy);
/* fall thru... */
case IPSEC_POLICY_DISCARD:
V_ipsec4stat.ips_out_polvio++;
*error = -EINVAL; /* packet is discarded by caller */
break;
case IPSEC_POLICY_BYPASS:
case IPSEC_POLICY_NONE:
KEY_FREESP(&sp);
sp = NULL; /* NB: force NULL result */
break;
case IPSEC_POLICY_IPSEC:
if (sp->req == NULL) /* acquire an SA */
*error = key_spdacquire(sp);
break;
}
if (*error != 0) {
KEY_FREESP(&sp);
sp = NULL;
}
return sp;
}
static int
ipsec4_setspidx_inpcb(m, pcb)
struct mbuf *m;
struct inpcb *pcb;
{
int error;
IPSEC_ASSERT(pcb != NULL, ("null pcb"));
IPSEC_ASSERT(pcb->inp_sp != NULL, ("null inp_sp"));
IPSEC_ASSERT(pcb->inp_sp->sp_out != NULL && pcb->inp_sp->sp_in != NULL,
("null sp_in || sp_out"));
error = ipsec_setspidx(m, &pcb->inp_sp->sp_in->spidx, 1);
if (error == 0) {
pcb->inp_sp->sp_in->spidx.dir = IPSEC_DIR_INBOUND;
pcb->inp_sp->sp_out->spidx = pcb->inp_sp->sp_in->spidx;
pcb->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND;
} else {
bzero(&pcb->inp_sp->sp_in->spidx,
sizeof (pcb->inp_sp->sp_in->spidx));
bzero(&pcb->inp_sp->sp_out->spidx,
sizeof (pcb->inp_sp->sp_in->spidx));
}
return error;
}
#ifdef INET6
static int
ipsec6_setspidx_in6pcb(m, pcb)
struct mbuf *m;
struct in6pcb *pcb;
{
+ //INIT_VNET_IPSEC(curvnet);
struct secpolicyindex *spidx;
int error;
IPSEC_ASSERT(pcb != NULL, ("null pcb"));
IPSEC_ASSERT(pcb->in6p_sp != NULL, ("null inp_sp"));
IPSEC_ASSERT(pcb->in6p_sp->sp_out != NULL && pcb->in6p_sp->sp_in != NULL,
("null sp_in || sp_out"));
bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx));
bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx));
spidx = &pcb->in6p_sp->sp_in->spidx;
error = ipsec_setspidx(m, spidx, 1);
if (error)
goto bad;
spidx->dir = IPSEC_DIR_INBOUND;
spidx = &pcb->in6p_sp->sp_out->spidx;
error = ipsec_setspidx(m, spidx, 1);
if (error)
goto bad;
spidx->dir = IPSEC_DIR_OUTBOUND;
return 0;
bad:
bzero(&pcb->in6p_sp->sp_in->spidx, sizeof(*spidx));
bzero(&pcb->in6p_sp->sp_out->spidx, sizeof(*spidx));
return error;
}
#endif
/*
* configure security policy index (src/dst/proto/sport/dport)
* by looking at the content of mbuf.
* the caller is responsible for error recovery (like clearing up spidx).
*/
static int
ipsec_setspidx(m, spidx, needport)
struct mbuf *m;
struct secpolicyindex *spidx;
int needport;
{
+ INIT_VNET_IPSEC(curvnet);
struct ip *ip = NULL;
struct ip ipbuf;
u_int v;
struct mbuf *n;
int len;
int error;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
/*
* Validate m->m_pkthdr.len. We see an incorrect length if we
* mistakenly call this function with an inconsistent mbuf chain
* (like 4.4BSD tcp/udp processing). XXX should we panic here?
*/
len = 0;
for (n = m; n; n = n->m_next)
len += n->m_len;
if (m->m_pkthdr.len != len) {
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: pkthdr len(%d) mismatch (%d), ignored.\n",
__func__, len, m->m_pkthdr.len));
return EINVAL;
}
if (m->m_pkthdr.len < sizeof(struct ip)) {
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: pkthdr len(%d) too small (v4), ignored.\n",
__func__, m->m_pkthdr.len));
return EINVAL;
}
if (m->m_len >= sizeof(*ip))
ip = mtod(m, struct ip *);
else {
m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf);
ip = &ipbuf;
}
#ifdef _IP_VHL
v = _IP_VHL_V(ip->ip_vhl);
#else
v = ip->ip_v;
#endif
switch (v) {
case 4:
error = ipsec4_setspidx_ipaddr(m, spidx);
if (error)
return error;
ipsec4_get_ulp(m, spidx, needport);
return 0;
#ifdef INET6
case 6:
if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) {
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: pkthdr len(%d) too small (v6), "
"ignored\n", __func__, m->m_pkthdr.len));
return EINVAL;
}
error = ipsec6_setspidx_ipaddr(m, spidx);
if (error)
return error;
ipsec6_get_ulp(m, spidx, needport);
return 0;
#endif
default:
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: " "unknown IP version %u, ignored.\n",
__func__, v));
return EINVAL;
}
}
static void
ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport)
{
u_int8_t nxt;
int off;
/* sanity check */
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(m->m_pkthdr.len >= sizeof(struct ip),("packet too short"));
/* NB: ip_input() flips it into host endian XXX need more checking */
if (m->m_len >= sizeof (struct ip)) {
struct ip *ip = mtod(m, struct ip *);
if (ip->ip_off & (IP_MF | IP_OFFMASK))
goto done;
#ifdef _IP_VHL
off = _IP_VHL_HL(ip->ip_vhl) << 2;
#else
off = ip->ip_hl << 2;
#endif
nxt = ip->ip_p;
} else {
struct ip ih;
m_copydata(m, 0, sizeof (struct ip), (caddr_t) &ih);
if (ih.ip_off & (IP_MF | IP_OFFMASK))
goto done;
#ifdef _IP_VHL
off = _IP_VHL_HL(ih.ip_vhl) << 2;
#else
off = ih.ip_hl << 2;
#endif
nxt = ih.ip_p;
}
while (off < m->m_pkthdr.len) {
struct ip6_ext ip6e;
struct tcphdr th;
struct udphdr uh;
switch (nxt) {
case IPPROTO_TCP:
spidx->ul_proto = nxt;
if (!needport)
goto done_proto;
if (off + sizeof(struct tcphdr) > m->m_pkthdr.len)
goto done;
m_copydata(m, off, sizeof (th), (caddr_t) &th);
spidx->src.sin.sin_port = th.th_sport;
spidx->dst.sin.sin_port = th.th_dport;
return;
case IPPROTO_UDP:
spidx->ul_proto = nxt;
if (!needport)
goto done_proto;
if (off + sizeof(struct udphdr) > m->m_pkthdr.len)
goto done;
m_copydata(m, off, sizeof (uh), (caddr_t) &uh);
spidx->src.sin.sin_port = uh.uh_sport;
spidx->dst.sin.sin_port = uh.uh_dport;
return;
case IPPROTO_AH:
if (off + sizeof(ip6e) > m->m_pkthdr.len)
goto done;
/* XXX sigh, this works but is totally bogus */
m_copydata(m, off, sizeof(ip6e), (caddr_t) &ip6e);
off += (ip6e.ip6e_len + 2) << 2;
nxt = ip6e.ip6e_nxt;
break;
case IPPROTO_ICMP:
default:
/* XXX intermediate headers??? */
spidx->ul_proto = nxt;
goto done_proto;
}
}
done:
spidx->ul_proto = IPSEC_ULPROTO_ANY;
done_proto:
spidx->src.sin.sin_port = IPSEC_PORT_ANY;
spidx->dst.sin.sin_port = IPSEC_PORT_ANY;
}
/* assumes that m is sane */
static int
ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx)
{
static const struct sockaddr_in template = {
sizeof (struct sockaddr_in),
AF_INET,
0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 }
};
spidx->src.sin = template;
spidx->dst.sin = template;
if (m->m_len < sizeof (struct ip)) {
m_copydata(m, offsetof(struct ip, ip_src),
sizeof (struct in_addr),
(caddr_t) &spidx->src.sin.sin_addr);
m_copydata(m, offsetof(struct ip, ip_dst),
sizeof (struct in_addr),
(caddr_t) &spidx->dst.sin.sin_addr);
} else {
struct ip *ip = mtod(m, struct ip *);
spidx->src.sin.sin_addr = ip->ip_src;
spidx->dst.sin.sin_addr = ip->ip_dst;
}
spidx->prefs = sizeof(struct in_addr) << 3;
spidx->prefd = sizeof(struct in_addr) << 3;
return 0;
}
#ifdef INET6
static void
ipsec6_get_ulp(m, spidx, needport)
struct mbuf *m;
struct secpolicyindex *spidx;
int needport;
{
+ INIT_VNET_IPSEC(curvnet);
int off, nxt;
struct tcphdr th;
struct udphdr uh;
struct icmp6_hdr ih;
/* sanity check */
if (m == NULL)
panic("%s: NULL pointer was passed.\n", __func__);
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s:\n", __func__); kdebug_mbuf(m));
/* set default */
spidx->ul_proto = IPSEC_ULPROTO_ANY;
((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY;
((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY;
nxt = -1;
off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
if (off < 0 || m->m_pkthdr.len < off)
return;
switch (nxt) {
case IPPROTO_TCP:
spidx->ul_proto = nxt;
if (!needport)
break;
if (off + sizeof(struct tcphdr) > m->m_pkthdr.len)
break;
m_copydata(m, off, sizeof(th), (caddr_t)&th);
((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport;
((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport;
break;
case IPPROTO_UDP:
spidx->ul_proto = nxt;
if (!needport)
break;
if (off + sizeof(struct udphdr) > m->m_pkthdr.len)
break;
m_copydata(m, off, sizeof(uh), (caddr_t)&uh);
((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport;
((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport;
break;
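/*
 * NB: for ICMPv6 the spidx "port" fields carry the message type
 * and code rather than transport ports; see the secpolicyindex
 * description in ipsec.h.
 */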
case IPPROTO_ICMPV6:
spidx->ul_proto = nxt;
if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len)
break;
m_copydata(m, off, sizeof(ih), (caddr_t)&ih);
((struct sockaddr_in6 *)&spidx->src)->sin6_port =
htons((uint16_t)ih.icmp6_type);
((struct sockaddr_in6 *)&spidx->dst)->sin6_port =
htons((uint16_t)ih.icmp6_code);
break;
default:
/* XXX intermediate headers??? */
spidx->ul_proto = nxt;
break;
}
}
/* assumes that m is sane */
static int
ipsec6_setspidx_ipaddr(m, spidx)
struct mbuf *m;
struct secpolicyindex *spidx;
{
struct ip6_hdr *ip6 = NULL;
struct ip6_hdr ip6buf;
struct sockaddr_in6 *sin6;
if (m->m_len >= sizeof(*ip6))
ip6 = mtod(m, struct ip6_hdr *);
else {
m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf);
ip6 = &ip6buf;
}
sin6 = (struct sockaddr_in6 *)&spidx->src;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src));
if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]);
}
spidx->prefs = sizeof(struct in6_addr) << 3;
sin6 = (struct sockaddr_in6 *)&spidx->dst;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(struct sockaddr_in6);
bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst));
if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) {
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]);
}
spidx->prefd = sizeof(struct in6_addr) << 3;
return 0;
}
#endif
static void
ipsec_delpcbpolicy(p)
struct inpcbpolicy *p;
{
free(p, M_IPSEC_INPCB);
}
/* initialize policy in PCB */
int
ipsec_init_policy(so, pcb_sp)
struct socket *so;
struct inpcbpolicy **pcb_sp;
{
+ INIT_VNET_IPSEC(curvnet);
struct inpcbpolicy *new;
/* sanity check. */
if (so == NULL || pcb_sp == NULL)
panic("%s: NULL pointer was passed.\n", __func__);
new = (struct inpcbpolicy *) malloc(sizeof(struct inpcbpolicy),
M_IPSEC_INPCB, M_NOWAIT|M_ZERO);
if (new == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
new->priv = IPSEC_IS_PRIVILEGED_SO(so);
if ((new->sp_in = KEY_NEWSP()) == NULL) {
ipsec_delpcbpolicy(new);
return ENOBUFS;
}
new->sp_in->state = IPSEC_SPSTATE_ALIVE;
new->sp_in->policy = IPSEC_POLICY_ENTRUST;
if ((new->sp_out = KEY_NEWSP()) == NULL) {
KEY_FREESP(&new->sp_in);
ipsec_delpcbpolicy(new);
return ENOBUFS;
}
new->sp_out->state = IPSEC_SPSTATE_ALIVE;
new->sp_out->policy = IPSEC_POLICY_ENTRUST;
*pcb_sp = new;
return 0;
}
/* copy old ipsec policy into new */
int
ipsec_copy_policy(old, new)
struct inpcbpolicy *old, *new;
{
struct secpolicy *sp;
sp = ipsec_deepcopy_policy(old->sp_in);
if (sp) {
KEY_FREESP(&new->sp_in);
new->sp_in = sp;
} else
return ENOBUFS;
sp = ipsec_deepcopy_policy(old->sp_out);
if (sp) {
KEY_FREESP(&new->sp_out);
new->sp_out = sp;
} else
return ENOBUFS;
new->priv = old->priv;
return 0;
}
struct ipsecrequest *
ipsec_newisr(void)
{
struct ipsecrequest *p;
p = malloc(sizeof(struct ipsecrequest), M_IPSEC_SR, M_NOWAIT|M_ZERO);
if (p != NULL)
IPSECREQUEST_LOCK_INIT(p);
return p;
}
void
ipsec_delisr(struct ipsecrequest *p)
{
IPSECREQUEST_LOCK_DESTROY(p);
free(p, M_IPSEC_SR);
}
/* deep-copy a policy in PCB */
static struct secpolicy *
ipsec_deepcopy_policy(src)
struct secpolicy *src;
{
struct ipsecrequest *newchain = NULL;
struct ipsecrequest *p;
struct ipsecrequest **q;
struct ipsecrequest *r;
struct secpolicy *dst;
if (src == NULL)
return NULL;
dst = KEY_NEWSP();
if (dst == NULL)
return NULL;
/*
* deep-copy IPsec request chain. This is required since struct
* ipsecrequest is not reference counted.
*/
q = &newchain;
for (p = src->req; p; p = p->next) {
*q = ipsec_newisr();
if (*q == NULL)
goto fail;
(*q)->saidx.proto = p->saidx.proto;
(*q)->saidx.mode = p->saidx.mode;
(*q)->level = p->level;
(*q)->saidx.reqid = p->saidx.reqid;
bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src));
bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst));
(*q)->sp = dst;
q = &((*q)->next);
}
dst->req = newchain;
dst->state = src->state;
dst->policy = src->policy;
/* do not touch the refcnt fields */
return dst;
fail:
for (p = newchain; p != NULL; p = r) {
r = p->next;
ipsec_delisr(p);
}
KEY_FREESP(&dst); /* XXX: do not leak the SP allocated above. */
return NULL;
}
/* set policy and ipsec request if present. */
static int
ipsec_set_policy(pcb_sp, optname, request, len, cred)
struct secpolicy **pcb_sp;
int optname;
caddr_t request;
size_t len;
struct ucred *cred;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_x_policy *xpl;
struct secpolicy *newsp = NULL;
int error;
/* sanity check. */
if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL)
return EINVAL;
if (len < sizeof(*xpl))
return EINVAL;
xpl = (struct sadb_x_policy *)request;
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: passed policy\n", __func__);
kdebug_sadb_x_policy((struct sadb_ext *)xpl));
/* check policy type */
/* ipsec_set_policy() accepts IPSEC, ENTRUST and BYPASS. */
if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD
|| xpl->sadb_x_policy_type == IPSEC_POLICY_NONE)
return EINVAL;
/* check privileged socket */
if (cred != NULL && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) {
error = priv_check_cred(cred, PRIV_NETINET_IPSEC, 0);
if (error)
return EACCES;
}
/* allocation new SP entry */
if ((newsp = key_msg2sp(xpl, len, &error)) == NULL)
return error;
newsp->state = IPSEC_SPSTATE_ALIVE;
/* clear old SP and set new SP */
KEY_FREESP(pcb_sp);
*pcb_sp = newsp;
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: new policy\n", __func__);
kdebug_secpolicy(newsp));
return 0;
}
static int
ipsec_get_policy(pcb_sp, mp)
struct secpolicy *pcb_sp;
struct mbuf **mp;
{
+ INIT_VNET_IPSEC(curvnet);
/* sanity check. */
if (pcb_sp == NULL || mp == NULL)
return EINVAL;
*mp = key_sp2msg(pcb_sp);
if (!*mp) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
(*mp)->m_type = MT_DATA;
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s:\n", __func__); kdebug_mbuf(*mp));
return 0;
}
int
ipsec4_set_policy(inp, optname, request, len, cred)
struct inpcb *inp;
int optname;
caddr_t request;
size_t len;
struct ucred *cred;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_x_policy *xpl;
struct secpolicy **pcb_sp;
/* sanity check. */
if (inp == NULL || request == NULL)
return EINVAL;
if (len < sizeof(*xpl))
return EINVAL;
xpl = (struct sadb_x_policy *)request;
/* select direction */
switch (xpl->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
pcb_sp = &inp->inp_sp->sp_in;
break;
case IPSEC_DIR_OUTBOUND:
pcb_sp = &inp->inp_sp->sp_out;
break;
default:
ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__,
xpl->sadb_x_policy_dir));
return EINVAL;
}
return ipsec_set_policy(pcb_sp, optname, request, len, cred);
}
int
ipsec4_get_policy(inp, request, len, mp)
struct inpcb *inp;
caddr_t request;
size_t len;
struct mbuf **mp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_x_policy *xpl;
struct secpolicy *pcb_sp;
/* sanity check. */
if (inp == NULL || request == NULL || mp == NULL)
return EINVAL;
IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp"));
if (len < sizeof(*xpl))
return EINVAL;
xpl = (struct sadb_x_policy *)request;
/* select direction */
switch (xpl->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
pcb_sp = inp->inp_sp->sp_in;
break;
case IPSEC_DIR_OUTBOUND:
pcb_sp = inp->inp_sp->sp_out;
break;
default:
ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__,
xpl->sadb_x_policy_dir));
return EINVAL;
}
return ipsec_get_policy(pcb_sp, mp);
}
/* delete policy in PCB */
int
ipsec4_delete_pcbpolicy(inp)
struct inpcb *inp;
{
IPSEC_ASSERT(inp != NULL, ("null inp"));
if (inp->inp_sp == NULL)
return 0;
if (inp->inp_sp->sp_in != NULL)
KEY_FREESP(&inp->inp_sp->sp_in);
if (inp->inp_sp->sp_out != NULL)
KEY_FREESP(&inp->inp_sp->sp_out);
ipsec_delpcbpolicy(inp->inp_sp);
inp->inp_sp = NULL;
return 0;
}
#ifdef INET6
int
ipsec6_set_policy(in6p, optname, request, len, cred)
struct in6pcb *in6p;
int optname;
caddr_t request;
size_t len;
struct ucred *cred;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_x_policy *xpl;
struct secpolicy **pcb_sp;
/* sanity check. */
if (in6p == NULL || request == NULL)
return EINVAL;
if (len < sizeof(*xpl))
return EINVAL;
xpl = (struct sadb_x_policy *)request;
/* select direction */
switch (xpl->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
pcb_sp = &in6p->in6p_sp->sp_in;
break;
case IPSEC_DIR_OUTBOUND:
pcb_sp = &in6p->in6p_sp->sp_out;
break;
default:
ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__,
xpl->sadb_x_policy_dir));
return EINVAL;
}
return ipsec_set_policy(pcb_sp, optname, request, len, cred);
}
int
ipsec6_get_policy(in6p, request, len, mp)
struct in6pcb *in6p;
caddr_t request;
size_t len;
struct mbuf **mp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_x_policy *xpl;
struct secpolicy *pcb_sp;
/* sanity check. */
if (in6p == NULL || request == NULL || mp == NULL)
return EINVAL;
IPSEC_ASSERT(in6p->in6p_sp != NULL, ("null in6p_sp"));
if (len < sizeof(*xpl))
return EINVAL;
xpl = (struct sadb_x_policy *)request;
/* select direction */
switch (xpl->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
pcb_sp = in6p->in6p_sp->sp_in;
break;
case IPSEC_DIR_OUTBOUND:
pcb_sp = in6p->in6p_sp->sp_out;
break;
default:
ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__,
xpl->sadb_x_policy_dir));
return EINVAL;
}
return ipsec_get_policy(pcb_sp, mp);
}
int
ipsec6_delete_pcbpolicy(in6p)
struct in6pcb *in6p;
{
IPSEC_ASSERT(in6p != NULL, ("null in6p"));
if (in6p->in6p_sp == NULL)
return 0;
if (in6p->in6p_sp->sp_in != NULL)
KEY_FREESP(&in6p->in6p_sp->sp_in);
if (in6p->in6p_sp->sp_out != NULL)
KEY_FREESP(&in6p->in6p_sp->sp_out);
ipsec_delpcbpolicy(in6p->in6p_sp);
in6p->in6p_sp = NULL;
return 0;
}
#endif
/*
* Return the current level.
* Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE is always returned.
*/
u_int
ipsec_get_reqlevel(isr)
struct ipsecrequest *isr;
{
+ INIT_VNET_IPSEC(curvnet);
u_int level = 0;
u_int esp_trans_deflev, esp_net_deflev;
u_int ah_trans_deflev, ah_net_deflev;
IPSEC_ASSERT(isr != NULL && isr->sp != NULL, ("null argument"));
IPSEC_ASSERT(isr->sp->spidx.src.sa.sa_family == isr->sp->spidx.dst.sa.sa_family,
("af family mismatch, src %u, dst %u",
isr->sp->spidx.src.sa.sa_family,
isr->sp->spidx.dst.sa.sa_family));
/* XXX note that we have ipseclog() expanded here - code sync issue */
#define IPSEC_CHECK_DEFAULT(lev) \
(((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \
&& (lev) != IPSEC_LEVEL_UNIQUE) \
? (V_ipsec_debug \
? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\
(lev), IPSEC_LEVEL_REQUIRE) \
: 0), \
(lev) = IPSEC_LEVEL_REQUIRE, \
(lev) \
: (lev))
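/*
 * NB: IPSEC_CHECK_DEFAULT is a single conditional expression: when lev
 * is not one of USE/REQUIRE/UNIQUE it optionally logs, forces lev to
 * IPSEC_LEVEL_REQUIRE via the comma operator and yields the corrected
 * value; otherwise it yields lev unchanged.
 */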
/* set default level */
switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) {
#ifdef INET
case AF_INET:
esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev);
esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev);
ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev);
ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev);
break;
#endif
#ifdef INET6
case AF_INET6:
esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev);
esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev);
ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev);
ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev);
break;
#endif /* INET6 */
default:
panic("%s: unknown af %u",
__func__, isr->sp->spidx.src.sa.sa_family);
}
#undef IPSEC_CHECK_DEFAULT
/* set level */
switch (isr->level) {
case IPSEC_LEVEL_DEFAULT:
switch (isr->saidx.proto) {
case IPPROTO_ESP:
if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
level = esp_net_deflev;
else
level = esp_trans_deflev;
break;
case IPPROTO_AH:
if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
level = ah_net_deflev;
else
level = ah_trans_deflev;
break;
case IPPROTO_IPCOMP:
/*
* We don't really care, since the IPComp document says
* that we shouldn't compress small packets.
*/
level = IPSEC_LEVEL_USE;
break;
default:
panic("%s: Illegal protocol defined %u\n", __func__,
isr->saidx.proto);
}
break;
case IPSEC_LEVEL_USE:
case IPSEC_LEVEL_REQUIRE:
level = isr->level;
break;
case IPSEC_LEVEL_UNIQUE:
level = IPSEC_LEVEL_REQUIRE;
break;
default:
panic("%s: Illegal IPsec level %u\n", __func__, isr->level);
}
return level;
}
/*
* Check security policy requirements against the actual
* packet contents. Return one if the packet should be
* reject as "invalid"; otherwiser return zero to have the
* packet treated as "valid".
*
* OUT:
* 0: valid
* 1: invalid
*/
int
ipsec_in_reject(struct secpolicy *sp, struct mbuf *m)
{
+ INIT_VNET_IPSEC(curvnet);
struct ipsecrequest *isr;
int need_auth;
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("%s: using SP\n", __func__); kdebug_secpolicy(sp));
/* check policy */
switch (sp->policy) {
case IPSEC_POLICY_DISCARD:
return 1;
case IPSEC_POLICY_BYPASS:
case IPSEC_POLICY_NONE:
return 0;
}
IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
("invalid policy %u", sp->policy));
/* XXX should compare policy against ipsec header history */
need_auth = 0;
for (isr = sp->req; isr != NULL; isr = isr->next) {
if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE)
continue;
switch (isr->saidx.proto) {
case IPPROTO_ESP:
if ((m->m_flags & M_DECRYPTED) == 0) {
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: ESP m_flags:%x\n", __func__,
m->m_flags));
return 1;
}
if (!need_auth &&
isr->sav != NULL &&
isr->sav->tdb_authalgxform != NULL &&
(m->m_flags & M_AUTHIPDGM) == 0) {
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: ESP/AH m_flags:%x\n", __func__,
m->m_flags));
return 1;
}
break;
case IPPROTO_AH:
need_auth = 1;
if ((m->m_flags & M_AUTHIPHDR) == 0) {
KEYDEBUG(KEYDEBUG_IPSEC_DUMP,
printf("%s: AH m_flags:%x\n", __func__,
m->m_flags));
return 1;
}
break;
case IPPROTO_IPCOMP:
/*
* We don't really care, since the IPComp document
* says that we shouldn't compress small packets;
* IPComp policy should always be treated as being
* at the "use" level.
*/
break;
}
}
return 0; /* valid */
}
/*
* Check AH/ESP integrity.
* This function is called from tcp_input(), udp_input(),
* and {ah,esp}4_input() for tunnel-mode packets.
*/
int
ipsec4_in_reject(m, inp)
struct mbuf *m;
struct inpcb *inp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
int error;
int result;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
/* get SP for this packet.
* When we are called from ip_forward(), we call
* ipsec_getpolicybyaddr() with IP_FORWARDING flag.
*/
if (inp == NULL)
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error);
else
sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error);
if (sp != NULL) {
result = ipsec_in_reject(sp, m);
if (result)
V_ipsec4stat.ips_in_polvio++;
KEY_FREESP(&sp);
} else {
result = 0; /* XXX should be panic ?
* -> No, there may be error. */
}
return result;
}
#ifdef INET6
/*
* Check AH/ESP integrity.
* This function is called from tcp6_input(), udp6_input(),
* and {ah,esp}6_input() for tunnel-mode packets.
*/
int
ipsec6_in_reject(m, inp)
struct mbuf *m;
struct inpcb *inp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp = NULL;
int error;
int result;
/* sanity check */
if (m == NULL)
return 0; /* XXX should be panic ? */
/* get SP for this packet.
* When we are called from ip_forward(), we call
* ipsec_getpolicybyaddr() with IP_FORWARDING flag.
*/
if (inp == NULL)
sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error);
else
sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error);
if (sp != NULL) {
result = ipsec_in_reject(sp, m);
if (result)
V_ipsec6stat.ips_in_polvio++;
KEY_FREESP(&sp);
} else {
result = 0;
}
return result;
}
#endif
/*
* Compute the byte size to be occupied by the IPsec header.
* In case it is tunnelled, it includes the size of the outer IP header.
* NOTE: the caller remains responsible for freeing the SP passed in.
*/
static size_t
ipsec_hdrsiz(struct secpolicy *sp)
{
+ INIT_VNET_IPSEC(curvnet);
struct ipsecrequest *isr;
size_t siz;
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("%s: using SP\n", __func__); kdebug_secpolicy(sp));
switch (sp->policy) {
case IPSEC_POLICY_DISCARD:
case IPSEC_POLICY_BYPASS:
case IPSEC_POLICY_NONE:
return 0;
}
IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
("invalid policy %u", sp->policy));
siz = 0;
for (isr = sp->req; isr != NULL; isr = isr->next) {
size_t clen = 0;
switch (isr->saidx.proto) {
case IPPROTO_ESP:
clen = esp_hdrsiz(isr->sav);
break;
case IPPROTO_AH:
clen = ah_hdrsiz(isr->sav);
break;
case IPPROTO_IPCOMP:
clen = sizeof(struct ipcomp);
break;
}
if (isr->saidx.mode == IPSEC_MODE_TUNNEL) {
switch (isr->saidx.dst.sa.sa_family) {
case AF_INET:
clen += sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
clen += sizeof(struct ip6_hdr);
break;
#endif
default:
ipseclog((LOG_ERR, "%s: unknown AF %d in "
"IPsec tunnel SA\n", __func__,
((struct sockaddr *)&isr->saidx.dst)->sa_family));
break;
}
}
siz += clen;
}
return siz;
}
/* This function is called from ip_forward() and ipsec4_hdrsize_tcp(). */
size_t
ipsec4_hdrsiz(m, dir, inp)
struct mbuf *m;
u_int dir;
struct inpcb *inp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
int error;
size_t size;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
/* get SP for this packet.
* When we are called from ip_forward(), we call
* ipsec_getpolicybyaddr() with IP_FORWARDING flag.
*/
if (inp == NULL)
sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error);
else
sp = ipsec_getpolicybysock(m, dir, inp, &error);
if (sp != NULL) {
size = ipsec_hdrsiz(sp);
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("%s: size:%lu.\n", __func__,
(unsigned long)size));
KEY_FREESP(&sp);
} else {
size = 0; /* XXX should be panic ?
* -> No, we are called w/o knowing if
* IPsec processing is needed. */
}
return size;
}
#ifdef INET6
/* This function is called from ipsec6_hdrsize_tcp(),
* and maybe from ip6_forward().
*/
size_t
ipsec6_hdrsiz(m, dir, in6p)
struct mbuf *m;
u_int dir;
struct in6pcb *in6p;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
int error;
size_t size;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(in6p == NULL || in6p->in6p_socket != NULL,
("socket w/o inpcb"));
/* get SP for this packet */
/* XXX Is it right to call with IP_FORWARDING? */
if (in6p == NULL)
sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error);
else
sp = ipsec_getpolicybysock(m, dir, in6p, &error);
if (sp == NULL)
return 0;
size = ipsec_hdrsiz(sp);
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("%s: size:%lu.\n", __func__, (unsigned long)size));
KEY_FREESP(&sp);
return size;
}
#endif /*INET6*/
/*
* Check the variable replay window.
* ipsec_chkreplay() performs replay check before ICV verification.
* ipsec_updatereplay() updates replay bitmap. This must be called after
* ICV verification (it also performs replay check, which is usually done
* beforehand).
* ipsec_chkreplay() returns 0 (zero) if the packet is disallowed and 1 if
* it is permitted; note that ipsec_updatereplay() uses the opposite
* convention (see below).
*
* based on RFC 2401.
*/
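/*
 * Worked example (sketch): with wsize = 4 (a 32-bit window, so
 * wsizeb = 32 and frlast = 3) and lastseq = 1000, a packet with
 * seq = 990 gives diff = 10; it maps to bitmap[3 - 10/8] = bitmap[2],
 * bit (1 << (10 % 8)) = 0x04, and is a replay iff that bit is already
 * set.  A packet with seq = 960 gives diff = 40 >= 32 and is rejected
 * as too old.
 */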
int
ipsec_chkreplay(seq, sav)
u_int32_t seq;
struct secasvar *sav;
{
const struct secreplay *replay;
u_int32_t diff;
int fr;
u_int32_t wsizeb; /* constant: bits of window size */
int frlast; /* constant: last frame */
IPSEC_ASSERT(sav != NULL, ("Null SA"));
IPSEC_ASSERT(sav->replay != NULL, ("Null replay state"));
replay = sav->replay;
if (replay->wsize == 0)
return 1; /* no need to check replay. */
/* constant */
frlast = replay->wsize - 1;
wsizeb = replay->wsize << 3;
/* sequence number of 0 is invalid */
if (seq == 0)
return 0;
/* first time is always okay */
if (replay->count == 0)
return 1;
if (seq > replay->lastseq) {
/* larger sequences are okay */
return 1;
} else {
/* seq is equal or less than lastseq. */
diff = replay->lastseq - seq;
/* over range to check, i.e. too old or wrapped */
if (diff >= wsizeb)
return 0;
fr = frlast - diff / 8;
/* this packet already seen ? */
if ((replay->bitmap)[fr] & (1 << (diff % 8)))
return 0;
/* out of order but good */
return 1;
}
}
/*
* Update the replay counter and bitmap for a verified packet.
* OUT: 0: OK (window updated)
* 1: NG (replayed, too old, or counter overflow)
*/
int
ipsec_updatereplay(seq, sav)
u_int32_t seq;
struct secasvar *sav;
{
+ INIT_VNET_IPSEC(curvnet);
struct secreplay *replay;
u_int32_t diff;
int fr;
u_int32_t wsizeb; /* constant: bits of window size */
int frlast; /* constant: last frame */
IPSEC_ASSERT(sav != NULL, ("Null SA"));
IPSEC_ASSERT(sav->replay != NULL, ("Null replay state"));
replay = sav->replay;
if (replay->wsize == 0)
goto ok; /* no need to check replay. */
/* constant */
frlast = replay->wsize - 1;
wsizeb = replay->wsize << 3;
/* sequence number of 0 is invalid */
if (seq == 0)
return 1;
/* first time */
if (replay->count == 0) {
replay->lastseq = seq;
bzero(replay->bitmap, replay->wsize);
(replay->bitmap)[frlast] = 1;
goto ok;
}
if (seq > replay->lastseq) {
/* seq is larger than lastseq. */
diff = seq - replay->lastseq;
/* new larger sequence number */
if (diff < wsizeb) {
/* In window */
/* set bit for this packet */
vshiftl(replay->bitmap, diff, replay->wsize);
(replay->bitmap)[frlast] |= 1;
} else {
/* this packet has a "way larger" */
bzero(replay->bitmap, replay->wsize);
(replay->bitmap)[frlast] = 1;
}
replay->lastseq = seq;
/* larger is good */
} else {
/* seq is equal or less than lastseq. */
diff = replay->lastseq - seq;
/* over range to check, i.e. too old or wrapped */
if (diff >= wsizeb)
return 1;
fr = frlast - diff / 8;
/* this packet already seen ? */
if ((replay->bitmap)[fr] & (1 << (diff % 8)))
return 1;
/* mark as seen */
(replay->bitmap)[fr] |= (1 << (diff % 8));
/* out of order but good */
}
ok:
if (replay->count == ~0) {
/* set overflow flag */
replay->overflow++;
/* don't increment, no more packets accepted */
if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0)
return 1;
ipseclog((LOG_WARNING, "%s: replay counter made %d cycle. %s\n",
__func__, replay->overflow, ipsec_logsastr(sav)));
}
replay->count++;
return 0;
}
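/*
 * Usage sketch (cf. the ESP and AH input paths): a transform first
 * calls ipsec_chkreplay() to reject obvious replays before the
 * expensive ICV computation, then verifies the ICV, and only then
 * calls ipsec_updatereplay() to advance the window.  Note the
 * inverted return conventions of the two functions:
 *
 *	if (sav->replay != NULL && !ipsec_chkreplay(seq, sav))
 *		goto drop;
 *	...verify ICV...
 *	if (sav->replay != NULL && ipsec_updatereplay(seq, sav))
 *		goto drop;
 */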
/*
* shift variable length buffer to left.
* IN: bitmap: pointer to the buffer
* nbit: the number of bits to shift.
* wsize: buffer size (bytes).
*/
static void
vshiftl(bitmap, nbit, wsize)
unsigned char *bitmap;
int nbit, wsize;
{
int s, j, i;
unsigned char over;
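/*
 * Shift at most 8 bits per pass; "over" carries the bits that spill
 * from byte i into byte i-1.  Bits shifted out of bitmap[0] (the
 * oldest end of the window) are discarded, while bitmap[wsize - 1]
 * tracks the newest sequence numbers.
 */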
for (j = 0; j < nbit; j += 8) {
s = (nbit - j < 8) ? (nbit - j): 8;
bitmap[0] <<= s;
for (i = 1; i < wsize; i++) {
over = (bitmap[i] >> (8 - s));
bitmap[i] <<= s;
bitmap[i-1] |= over;
}
}
return;
}
/* Return a printable string for the IPv4 address. */
static char *
inet_ntoa4(struct in_addr ina)
{
static char buf[4][4 * sizeof "123" + 4];
unsigned char *ucp = (unsigned char *) &ina;
static int i = 3;
/* XXX-BZ returns static buffer. */
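/*
 * Rotate through four slots so that up to four addresses can be
 * formatted within a single printf() before a buffer is reused.
 */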
i = (i + 1) % 4;
sprintf(buf[i], "%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff,
ucp[2] & 0xff, ucp[3] & 0xff);
return (buf[i]);
}
/* Return a printable string for the address. */
char *
ipsec_address(union sockaddr_union* sa)
{
#ifdef INET6
static char ip6buf[INET6_ADDRSTRLEN]; /* XXX: static so the result stays valid after return, like inet_ntoa4(). */
#endif
switch (sa->sa.sa_family) {
#ifdef INET
case AF_INET:
return inet_ntoa4(sa->sin.sin_addr);
#endif /* INET */
#ifdef INET6
case AF_INET6:
return ip6_sprintf(ip6buf, &sa->sin6.sin6_addr);
#endif /* INET6 */
default:
return "(unknown address family)";
}
}
const char *
ipsec_logsastr(sav)
struct secasvar *sav;
{
static char buf[256];
char *p;
struct secasindex *saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->src.sa.sa_family == saidx->dst.sa.sa_family,
("address family mismatch"));
p = buf;
snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi));
while (p && *p)
p++;
/* NB: only use ipsec_address on one address at a time */
snprintf(p, sizeof (buf) - (p - buf), "src=%s ",
ipsec_address(&saidx->src));
while (p && *p)
p++;
snprintf(p, sizeof (buf) - (p - buf), "dst=%s)",
ipsec_address(&saidx->dst));
return buf;
}
void
ipsec_dumpmbuf(m)
struct mbuf *m;
{
int totlen;
int i;
u_char *p;
totlen = 0;
printf("---\n");
while (m) {
p = mtod(m, u_char *);
for (i = 0; i < m->m_len; i++) {
printf("%02x ", p[i]);
totlen++;
if (totlen % 16 == 0)
printf("\n");
}
m = m->m_next;
}
if (totlen % 16 != 0)
printf("\n");
printf("---\n");
}
static void
ipsec_attach(void)
{
SECPOLICY_LOCK_INIT(&V_ip4_def_policy);
V_ip4_def_policy.refcnt = 1; /* NB: disallow free */
}
SYSINIT(ipsec, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, ipsec_attach, NULL);
/* XXX this stuff doesn't belong here... */
static struct xformsw* xforms = NULL;
/*
* Register a transform; typically at system startup.
*/
void
xform_register(struct xformsw* xsp)
{
xsp->xf_next = xforms;
xforms = xsp;
}
/*
* Initialize transform support in an sav.
*/
int
xform_init(struct secasvar *sav, int xftype)
{
struct xformsw *xsp;
if (sav->tdb_xform != NULL) /* previously initialized */
return 0;
for (xsp = xforms; xsp; xsp = xsp->xf_next)
if (xsp->xf_type == xftype)
return (*xsp->xf_init)(sav, xsp);
return EINVAL;
}
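/*
 * Registration sketch for a hypothetical transform (field order per
 * struct xformsw in netipsec/xform.h; the myxform_* callbacks are
 * placeholders):
 *
 *	static struct xformsw myxformsw = {
 *		XF_ESP, XFT_CONF, "my-esp",
 *		myxform_init, myxform_zeroize,
 *		myxform_input, myxform_output
 *	};
 *
 *	static void
 *	myxform_attach(void)
 *	{
 *		xform_register(&myxformsw);
 *	}
 *	SYSINIT(myxform, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
 *	    myxform_attach, NULL);
 */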
Index: head/sys/netipsec/ipsec.h
===================================================================
--- head/sys/netipsec/ipsec.h (revision 183549)
+++ head/sys/netipsec/ipsec.h (revision 183550)
@@ -1,438 +1,441 @@
/* $FreeBSD$ */
/* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* IPsec controller part.
*/
#ifndef _NETIPSEC_IPSEC_H_
#define _NETIPSEC_IPSEC_H_
#if defined(_KERNEL) && !defined(_LKM) && !defined(KLD_MODULE)
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <net/pfkeyv2.h>
#include <netipsec/keydb.h>
#ifdef _KERNEL
#define IPSEC_ASSERT(_c,_m) KASSERT(_c, _m)
#define IPSEC_IS_PRIVILEGED_SO(_so) \
((_so)->so_cred != NULL && \
priv_check_cred((_so)->so_cred, PRIV_NETINET_IPSEC, 0) \
== 0)
/*
* Security Policy Index
* Ensure that both address families in the "src" and "dst" are same.
* When the value of the ul_proto is ICMPv6, the port field in "src"
* specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code.
*/
struct secpolicyindex {
u_int8_t dir; /* direction of packet flow, see below */
union sockaddr_union src; /* IP src address for SP */
union sockaddr_union dst; /* IP dst address for SP */
u_int8_t prefs; /* prefix length in bits for src */
u_int8_t prefd; /* prefix length in bits for dst */
u_int16_t ul_proto; /* upper layer Protocol */
#ifdef notyet
uid_t uids;
uid_t uidd;
gid_t gids;
gid_t gidd;
#endif
};
/* Security Policy Data Base */
struct secpolicy {
LIST_ENTRY(secpolicy) chain;
struct mtx lock;
u_int refcnt; /* reference count */
struct secpolicyindex spidx; /* selector */
u_int32_t id; /* It's unique number on the system. */
u_int state; /* 0: dead, others: alive */
#define IPSEC_SPSTATE_DEAD 0
#define IPSEC_SPSTATE_ALIVE 1
u_int16_t policy; /* policy_type per pfkeyv2.h */
u_int16_t scangen; /* scan generation # */
struct ipsecrequest *req;
/* Pointer to the ipsec request tree if policy == IPSEC; otherwise NULL. */
/*
* lifetime handler.
* the policy can be used without limitation if both lifetime and
* validtime are zero.
* "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime.
* "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime.
*/
time_t created; /* time created the policy */
time_t lastused; /* updated whenever the kernel sends a packet */
long lifetime; /* duration of the lifetime of this policy */
long validtime; /* duration this policy is valid without use */
};
#define SECPOLICY_LOCK_INIT(_sp) \
mtx_init(&(_sp)->lock, "ipsec policy", NULL, MTX_DEF)
#define SECPOLICY_LOCK(_sp) mtx_lock(&(_sp)->lock)
#define SECPOLICY_UNLOCK(_sp) mtx_unlock(&(_sp)->lock)
#define SECPOLICY_LOCK_DESTROY(_sp) mtx_destroy(&(_sp)->lock)
#define SECPOLICY_LOCK_ASSERT(_sp) mtx_assert(&(_sp)->lock, MA_OWNED)
/* Request for IPsec */
struct ipsecrequest {
struct ipsecrequest *next;
/* pointer to next structure */
/* If NULL, it means the end of chain. */
struct secasindex saidx;/* hint for searching for the proper SA */
/* if __ss_len == 0 then no address specified.*/
u_int level; /* IPsec level defined below. */
struct secasvar *sav; /* place holder of SA for use */
struct secpolicy *sp; /* back pointer to SP */
struct mtx lock; /* to interlock updates */
};
/*
* Need recursion for when crypto callbacks happen directly,
* as in the case of software crypto. Need to look at how
* hard it is to remove this...
*/
#define IPSECREQUEST_LOCK_INIT(_isr) \
mtx_init(&(_isr)->lock, "ipsec request", NULL, MTX_DEF | MTX_RECURSE)
#define IPSECREQUEST_LOCK(_isr) mtx_lock(&(_isr)->lock)
#define IPSECREQUEST_UNLOCK(_isr) mtx_unlock(&(_isr)->lock)
#define IPSECREQUEST_LOCK_DESTROY(_isr) mtx_destroy(&(_isr)->lock)
#define IPSECREQUEST_LOCK_ASSERT(_isr) mtx_assert(&(_isr)->lock, MA_OWNED)
/* security policy in PCB */
struct inpcbpolicy {
struct secpolicy *sp_in;
struct secpolicy *sp_out;
int priv; /* privileged socket ? */
};
/* SP acquiring list table. */
struct secspacq {
LIST_ENTRY(secspacq) chain;
struct secpolicyindex spidx;
time_t created; /* for lifetime */
int count; /* for lifetime */
/* XXX: here is mbuf place holder to be sent ? */
};
#endif /* _KERNEL */
/* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */
#define IPSEC_PORT_ANY 0
#define IPSEC_ULPROTO_ANY 255
#define IPSEC_PROTO_ANY 255
/* mode of security protocol */
/* NOTE: DON'T use IPSEC_MODE_ANY in the SPD. It is only used in the SAD. */
#define IPSEC_MODE_ANY 0 /* i.e. wildcard. */
#define IPSEC_MODE_TRANSPORT 1
#define IPSEC_MODE_TUNNEL 2
#define IPSEC_MODE_TCPMD5 3 /* TCP MD5 mode */
/*
* Direction of security policy.
* NOTE: INVALID is used only as a flag;
* the others also serve as loop counters.
*/
#define IPSEC_DIR_ANY 0
#define IPSEC_DIR_INBOUND 1
#define IPSEC_DIR_OUTBOUND 2
#define IPSEC_DIR_MAX 3
#define IPSEC_DIR_INVALID 4
/* Policy level */
/*
* IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB,
* DISCARD, IPSEC and NONE are allowed for setkey() in SPD.
* DISCARD and NONE are allowed for system default.
*/
#define IPSEC_POLICY_DISCARD 0 /* discarding packet */
#define IPSEC_POLICY_NONE 1 /* through IPsec engine */
#define IPSEC_POLICY_IPSEC 2 /* do IPsec */
#define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */
#define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */
/* Security protocol level */
#define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */
#define IPSEC_LEVEL_USE 1 /* use SA if present. */
#define IPSEC_LEVEL_REQUIRE 2 /* require SA. */
#define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */
#define IPSEC_MANUAL_REQID_MAX 0x3fff
/*
* If the security policy level is "unique", this id
* designates the particular SA to use; otherwise it is
* zero.
* 1 - 0x3fff are reserved for manual keying;
* 0 is reserved for the reason above. The remaining
* values are for kernel use.
* Note that this id does not identify an SA
* by itself alone.
*/
#define IPSEC_REPLAYWSIZE 32
/* statistics for ipsec processing */
struct ipsecstat {
u_quad_t in_success; /* succeeded inbound process */
u_quad_t in_polvio;
/* security policy violation for inbound process */
u_quad_t in_nosa; /* inbound SA is unavailable */
u_quad_t in_inval; /* inbound processing failed due to EINVAL */
u_quad_t in_nomem; /* inbound processing failed due to ENOBUFS */
u_quad_t in_badspi; /* failed getting a SPI */
u_quad_t in_ahreplay; /* AH replay check failed */
u_quad_t in_espreplay; /* ESP replay check failed */
u_quad_t in_ahauthsucc; /* AH authentication success */
u_quad_t in_ahauthfail; /* AH authentication failure */
u_quad_t in_espauthsucc; /* ESP authentication success */
u_quad_t in_espauthfail; /* ESP authentication failure */
u_quad_t in_esphist[256];
u_quad_t in_ahhist[256];
u_quad_t in_comphist[256];
u_quad_t out_success; /* succeeded outbound process */
u_quad_t out_polvio;
/* security policy violation for outbound process */
u_quad_t out_nosa; /* outbound SA is unavailable */
u_quad_t out_inval; /* outbound process failed due to EINVAL */
u_quad_t out_nomem; /* inbound processing failed due to ENOBUFS */
u_quad_t out_noroute; /* there is no route */
u_quad_t out_esphist[256];
u_quad_t out_ahhist[256];
u_quad_t out_comphist[256];
u_quad_t spdcachelookup;
u_quad_t spdcachemiss;
u_int32_t ips_in_polvio; /* input: sec policy violation */
u_int32_t ips_out_polvio; /* output: sec policy violation */
u_int32_t ips_out_nosa; /* output: SA unavailable */
u_int32_t ips_out_nomem; /* output: no memory available */
u_int32_t ips_out_noroute; /* output: no route available */
u_int32_t ips_out_inval; /* output: generic error */
u_int32_t ips_out_bundlesa; /* output: bundled SA processed */
u_int32_t ips_mbcoalesced; /* mbufs coalesced during clone */
u_int32_t ips_clcoalesced; /* clusters coalesced during clone */
u_int32_t ips_clcopied; /* clusters copied during clone */
u_int32_t ips_mbinserted; /* mbufs inserted during makespace */
/*
* Temporary statistics for performance analysis.
*/
/* See where ESP/AH/IPCOMP header land in mbuf on input */
u_int32_t ips_input_front;
u_int32_t ips_input_middle;
u_int32_t ips_input_end;
};
/*
* Definitions for IPsec & Key sysctl operations.
*/
/*
* Names for IPsec & Key sysctl objects
*/
#define IPSECCTL_STATS 1 /* stats */
#define IPSECCTL_DEF_POLICY 2
#define IPSECCTL_DEF_ESP_TRANSLEV 3 /* int; ESP transport mode */
#define IPSECCTL_DEF_ESP_NETLEV 4 /* int; ESP tunnel mode */
#define IPSECCTL_DEF_AH_TRANSLEV 5 /* int; AH transport mode */
#define IPSECCTL_DEF_AH_NETLEV 6 /* int; AH tunnel mode */
#if 0 /* obsolete, do not reuse */
#define IPSECCTL_INBOUND_CALL_IKE 7
#endif
#define IPSECCTL_AH_CLEARTOS 8
#define IPSECCTL_AH_OFFSETMASK 9
#define IPSECCTL_DFBIT 10
#define IPSECCTL_ECN 11
#define IPSECCTL_DEBUG 12
#define IPSECCTL_ESP_RANDPAD 13
#define IPSECCTL_MAXID 14
#define IPSECCTL_NAMES { \
{ 0, 0 }, \
{ 0, 0 }, \
{ "def_policy", CTLTYPE_INT }, \
{ "esp_trans_deflev", CTLTYPE_INT }, \
{ "esp_net_deflev", CTLTYPE_INT }, \
{ "ah_trans_deflev", CTLTYPE_INT }, \
{ "ah_net_deflev", CTLTYPE_INT }, \
{ 0, 0 }, \
{ "ah_cleartos", CTLTYPE_INT }, \
{ "ah_offsetmask", CTLTYPE_INT }, \
{ "dfbit", CTLTYPE_INT }, \
{ "ecn", CTLTYPE_INT }, \
{ "debug", CTLTYPE_INT }, \
{ "esp_randpad", CTLTYPE_INT }, \
}
#define IPSEC6CTL_NAMES { \
{ 0, 0 }, \
{ 0, 0 }, \
{ "def_policy", CTLTYPE_INT }, \
{ "esp_trans_deflev", CTLTYPE_INT }, \
{ "esp_net_deflev", CTLTYPE_INT }, \
{ "ah_trans_deflev", CTLTYPE_INT }, \
{ "ah_net_deflev", CTLTYPE_INT }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ 0, 0 }, \
{ "ecn", CTLTYPE_INT }, \
{ "debug", CTLTYPE_INT }, \
{ "esp_randpad", CTLTYPE_INT }, \
}
#ifdef _KERNEL
struct ipsec_output_state {
struct mbuf *m;
struct route *ro;
struct sockaddr *dst;
};
struct ipsec_history {
int ih_proto;
u_int32_t ih_spi;
};
extern int ipsec_debug;
#ifdef REGRESSION
extern int ipsec_replay;
extern int ipsec_integrity;
#endif
extern struct ipsecstat ipsec4stat;
extern struct secpolicy ip4_def_policy;
extern int ip4_esp_trans_deflev;
extern int ip4_esp_net_deflev;
extern int ip4_ah_trans_deflev;
extern int ip4_ah_net_deflev;
extern int ip4_ah_cleartos;
extern int ip4_ah_offsetmask;
extern int ip4_ipsec_dfbit;
extern int ip4_ipsec_ecn;
extern int ip4_esp_randpad;
extern int crypto_support;
#define ipseclog(x) do { if (V_ipsec_debug) log x; } while (0)
/* for openbsd compatibility */
#define DPRINTF(x) do { if (V_ipsec_debug) printf x; } while (0)
extern struct ipsecrequest *ipsec_newisr(void);
extern void ipsec_delisr(struct ipsecrequest *);
struct tdb_ident;
extern struct secpolicy *ipsec_getpolicy __P((struct tdb_ident*, u_int));
struct inpcb;
extern struct secpolicy *ipsec4_checkpolicy __P((struct mbuf *, u_int, u_int,
int *, struct inpcb *));
extern struct secpolicy *ipsec_getpolicybysock(struct mbuf *, u_int,
struct inpcb *, int *);
extern struct secpolicy * ipsec_getpolicybyaddr(struct mbuf *, u_int,
int, int *);
struct inpcb;
extern int ipsec_init_policy __P((struct socket *so, struct inpcbpolicy **));
extern int ipsec_copy_policy
__P((struct inpcbpolicy *, struct inpcbpolicy *));
extern u_int ipsec_get_reqlevel __P((struct ipsecrequest *));
extern int ipsec_in_reject __P((struct secpolicy *, struct mbuf *));
extern int ipsec4_set_policy __P((struct inpcb *inp, int optname,
caddr_t request, size_t len, struct ucred *cred));
extern int ipsec4_get_policy __P((struct inpcb *inpcb, caddr_t request,
size_t len, struct mbuf **mp));
extern int ipsec4_delete_pcbpolicy __P((struct inpcb *));
extern int ipsec4_in_reject __P((struct mbuf *, struct inpcb *));
struct secas;
struct tcpcb;
extern int ipsec_chkreplay __P((u_int32_t, struct secasvar *));
extern int ipsec_updatereplay __P((u_int32_t, struct secasvar *));
extern size_t ipsec4_hdrsiz __P((struct mbuf *, u_int, struct inpcb *));
extern size_t ipsec_hdrsiz_tcp __P((struct tcpcb *));
union sockaddr_union;
extern char * ipsec_address(union sockaddr_union* sa);
extern const char *ipsec_logsastr __P((struct secasvar *));
extern void ipsec_dumpmbuf __P((struct mbuf *));
struct m_tag;
extern void ah4_input(struct mbuf *m, int off);
extern void ah4_ctlinput(int cmd, struct sockaddr *sa, void *);
extern void esp4_input(struct mbuf *m, int off);
extern void esp4_ctlinput(int cmd, struct sockaddr *sa, void *);
extern void ipcomp4_input(struct mbuf *m, int off);
extern int ipsec4_common_input(struct mbuf *m, ...);
extern int ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav,
int skip, int protoff, struct m_tag *mt);
extern int ipsec4_process_packet __P((struct mbuf *, struct ipsecrequest *,
int, int));
extern int ipsec_process_done __P((struct mbuf *, struct ipsecrequest *));
extern struct mbuf *ipsec_copypkt __P((struct mbuf *));
extern void m_checkalignment(const char* where, struct mbuf *m0,
int off, int len);
extern struct mbuf *m_makespace(struct mbuf *m0, int skip, int hlen, int *off);
extern caddr_t m_pad(struct mbuf *m, int n);
extern int m_striphdr(struct mbuf *m, int skip, int hlen);
#ifdef DEV_ENC
#define ENC_BEFORE 0x0001
#define ENC_AFTER 0x0002
#define ENC_IN 0x0100
#define ENC_OUT 0x0200
extern int ipsec_filter(struct mbuf **, int, int);
extern void ipsec_bpf(struct mbuf *, struct secasvar *, int, int);
#endif
#endif /* _KERNEL */
#ifndef _KERNEL
extern caddr_t ipsec_set_policy __P((char *, int));
extern int ipsec_get_policylen __P((caddr_t));
extern char *ipsec_dump_policy __P((caddr_t, char *));
extern const char *ipsec_strerror __P((void));
-#endif /* !_KERNEL */
+
+#else
+#include <netipsec/vipsec.h>
+#endif /* ! KERNEL */
#endif /* _NETIPSEC_IPSEC_H_ */
Index: head/sys/netipsec/ipsec_input.c
===================================================================
--- head/sys/netipsec/ipsec_input.c (revision 183549)
+++ head/sys/netipsec/ipsec_input.c (revision 183550)
@@ -1,878 +1,883 @@
/* $FreeBSD$ */
/* $OpenBSD: ipsec_input.c,v 1.63 2003/02/20 18:35:43 deraadt Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* This code was written by John Ioannidis for BSD/OS in Athens, Greece,
* in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis.
*
* Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 2001, Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
/*
* IPsec input processing.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_enc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/pfil.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet/icmp6.h>
#endif
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/ah_var.h>
#include <netipsec/esp.h>
#include <netipsec/esp_var.h>
#include <netipsec/ipcomp_var.h>
#include <netipsec/key.h>
#include <netipsec/keydb.h>
#include <netipsec/xform.h>
#include <netinet6/ip6protosw.h>
#include <machine/in_cksum.h>
#include <machine/stdarg.h>
#ifdef DEV_ENC
#include <net/if_enc.h>
#endif
#define IPSEC_ISTAT(p,x,y,z) ((p) == IPPROTO_ESP ? (x)++ : \
(p) == IPPROTO_AH ? (y)++ : (z)++)
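/*
 * IPSEC_ISTAT(p, x, y, z) bumps x, y or z depending on whether the
 * security protocol p is ESP, AH or (by default) IPCOMP.
 */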
static void ipsec4_common_ctlinput(int, struct sockaddr *, void *, int);
/*
* ipsec_common_input gets called when an IPsec-protected packet
* is received by IPv4 or IPv6. Its job is to find the right SA
* and call the appropriate transform. The transform callback
* takes care of further processing (like ingress filtering).
*/
static int
ipsec_common_input(struct mbuf *m, int skip, int protoff, int af, int sproto)
{
+ INIT_VNET_IPSEC(curvnet);
union sockaddr_union dst_address;
struct secasvar *sav;
u_int32_t spi;
int error;
IPSEC_ISTAT(sproto, V_espstat.esps_input, V_ahstat.ahs_input,
V_ipcompstat.ipcomps_input);
IPSEC_ASSERT(m != NULL, ("null packet"));
IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
sproto == IPPROTO_IPCOMP,
("unexpected security protocol %u", sproto));
if ((sproto == IPPROTO_ESP && !V_esp_enable) ||
(sproto == IPPROTO_AH && !V_ah_enable) ||
(sproto == IPPROTO_IPCOMP && !V_ipcomp_enable)) {
m_freem(m);
IPSEC_ISTAT(sproto, V_espstat.esps_pdrops, V_ahstat.ahs_pdrops,
V_ipcompstat.ipcomps_pdrops);
return EOPNOTSUPP;
}
if (m->m_pkthdr.len - skip < 2 * sizeof (u_int32_t)) {
m_freem(m);
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
DPRINTF(("%s: packet too small\n", __func__));
return EINVAL;
}
/* Retrieve the SPI from the relevant IPsec header */
if (sproto == IPPROTO_ESP)
m_copydata(m, skip, sizeof(u_int32_t), (caddr_t) &spi);
else if (sproto == IPPROTO_AH)
m_copydata(m, skip + sizeof(u_int32_t), sizeof(u_int32_t),
(caddr_t) &spi);
else if (sproto == IPPROTO_IPCOMP) {
u_int16_t cpi;
m_copydata(m, skip + sizeof(u_int16_t), sizeof(u_int16_t),
(caddr_t) &cpi);
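/*
 * An IPCOMP CPI is only 16 bits wide; ntohl(htons(cpi))
 * zero-extends it into the 32-bit SPI space while keeping
 * the result in network byte order, as KEY_ALLOCSA() expects.
 */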
spi = ntohl(htons(cpi));
}
/*
* Find the SA and (indirectly) call the appropriate
* kernel crypto routine. The resulting mbuf chain is a valid
* IP packet ready to go through input processing.
*/
bzero(&dst_address, sizeof (dst_address));
dst_address.sa.sa_family = af;
switch (af) {
#ifdef INET
case AF_INET:
dst_address.sin.sin_len = sizeof(struct sockaddr_in);
m_copydata(m, offsetof(struct ip, ip_dst),
sizeof(struct in_addr),
(caddr_t) &dst_address.sin.sin_addr);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
dst_address.sin6.sin6_len = sizeof(struct sockaddr_in6);
m_copydata(m, offsetof(struct ip6_hdr, ip6_dst),
sizeof(struct in6_addr),
(caddr_t) &dst_address.sin6.sin6_addr);
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unsupported protocol family %u\n", __func__, af));
m_freem(m);
IPSEC_ISTAT(sproto, V_espstat.esps_nopf, V_ahstat.ahs_nopf,
V_ipcompstat.ipcomps_nopf);
return EPFNOSUPPORT;
}
/* NB: only pass dst since key_allocsa follows RFC2401 */
sav = KEY_ALLOCSA(&dst_address, sproto, spi);
if (sav == NULL) {
DPRINTF(("%s: no key association found for SA %s/%08lx/%u\n",
__func__, ipsec_address(&dst_address),
(u_long) ntohl(spi), sproto));
IPSEC_ISTAT(sproto, V_espstat.esps_notdb, V_ahstat.ahs_notdb,
V_ipcompstat.ipcomps_notdb);
m_freem(m);
return ENOENT;
}
if (sav->tdb_xform == NULL) {
DPRINTF(("%s: attempted to use uninitialized SA %s/%08lx/%u\n",
__func__, ipsec_address(&dst_address),
(u_long) ntohl(spi), sproto));
IPSEC_ISTAT(sproto, V_espstat.esps_noxform, V_ahstat.ahs_noxform,
V_ipcompstat.ipcomps_noxform);
KEY_FREESAV(&sav);
m_freem(m);
return ENXIO;
}
/*
* Call appropriate transform and return -- callback takes care of
* everything else.
*/
error = (*sav->tdb_xform->xf_input)(m, sav, skip, protoff);
KEY_FREESAV(&sav);
return error;
}
#ifdef INET
/*
* Common input handler for IPv4 AH, ESP, and IPCOMP.
*/
int
ipsec4_common_input(struct mbuf *m, ...)
{
va_list ap;
int off, nxt;
va_start(ap, m);
off = va_arg(ap, int);
nxt = va_arg(ap, int);
va_end(ap);
return ipsec_common_input(m, off, offsetof(struct ip, ip_p),
AF_INET, nxt);
}
void
ah4_input(struct mbuf *m, int off)
{
ipsec4_common_input(m, off, IPPROTO_AH);
}
void
ah4_ctlinput(int cmd, struct sockaddr *sa, void *v)
{
if (sa->sa_family == AF_INET &&
sa->sa_len == sizeof(struct sockaddr_in))
ipsec4_common_ctlinput(cmd, sa, v, IPPROTO_AH);
}
void
esp4_input(struct mbuf *m, int off)
{
ipsec4_common_input(m, off, IPPROTO_ESP);
}
void
esp4_ctlinput(int cmd, struct sockaddr *sa, void *v)
{
if (sa->sa_family == AF_INET &&
sa->sa_len == sizeof(struct sockaddr_in))
ipsec4_common_ctlinput(cmd, sa, v, IPPROTO_ESP);
}
void
ipcomp4_input(struct mbuf *m, int off)
{
ipsec4_common_input(m, off, IPPROTO_IPCOMP);
}
/*
* IPsec input callback for INET protocols.
* This routine is called as the transform callback.
* Takes care of filtering and other sanity checks on
* the processed packet.
*/
int
ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav,
int skip, int protoff, struct m_tag *mt)
{
+ INIT_VNET_IPSEC(curvnet);
int prot, af, sproto;
struct ip *ip;
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct secasindex *saidx;
int error;
#ifdef INET6
#ifdef notyet
char ip6buf[INET6_ADDRSTRLEN];
#endif
#endif
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
saidx = &sav->sah->saidx;
af = saidx->dst.sa.sa_family;
IPSEC_ASSERT(af == AF_INET, ("unexpected af %u", af));
sproto = saidx->proto;
IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
sproto == IPPROTO_IPCOMP,
("unexpected security protocol %u", sproto));
/* Sanity check */
if (m == NULL) {
DPRINTF(("%s: null mbuf", __func__));
IPSEC_ISTAT(sproto, V_espstat.esps_badkcr, V_ahstat.ahs_badkcr,
V_ipcompstat.ipcomps_badkcr);
KEY_FREESAV(&sav);
return EINVAL;
}
if (skip != 0) {
/* Fix IPv4 header */
if (m->m_len < skip && (m = m_pullup(m, skip)) == NULL) {
DPRINTF(("%s: processing failed for SA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
error = ENOBUFS;
goto bad;
}
ip = mtod(m, struct ip *);
ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_off = htons(ip->ip_off);
ip->ip_sum = 0;
ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
} else {
ip = mtod(m, struct ip *);
}
prot = ip->ip_p;
#ifdef notyet
/* IP-in-IP encapsulation */
if (prot == IPPROTO_IPIP) {
struct ip ipn;
if (m->m_pkthdr.len - skip < sizeof(struct ip)) {
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
error = EINVAL;
goto bad;
}
/* ipn will now contain the inner IPv4 header */
m_copydata(m, ip->ip_hl << 2, sizeof(struct ip),
(caddr_t) &ipn);
/* XXX PROXY address isn't recorded in SAH */
/*
* Check that the inner source address is the same as
* the proxy address, if available.
*/
if ((saidx->proxy.sa.sa_family == AF_INET &&
saidx->proxy.sin.sin_addr.s_addr !=
INADDR_ANY &&
ipn.ip_src.s_addr !=
saidx->proxy.sin.sin_addr.s_addr) ||
(saidx->proxy.sa.sa_family != AF_INET &&
saidx->proxy.sa.sa_family != 0)) {
DPRINTF(("%s: inner source address %s doesn't "
"correspond to expected proxy source %s, "
"SA %s/%08lx\n", __func__,
inet_ntoa4(ipn.ip_src),
ipsp_address(saidx->proxy),
ipsp_address(saidx->dst),
(u_long) ntohl(sav->spi)));
IPSEC_ISTAT(sproto, V_espstat.esps_pdrops,
V_ahstat.ahs_pdrops,
V_ipcompstat.ipcomps_pdrops);
error = EACCES;
goto bad;
}
}
#ifdef INET6
/* IPv6-in-IP encapsulation. */
if (prot == IPPROTO_IPV6) {
struct ip6_hdr ip6n;
if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) {
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
error = EINVAL;
goto bad;
}
/* ip6n will now contain the inner IPv6 header. */
m_copydata(m, ip->ip_hl << 2, sizeof(struct ip6_hdr),
(caddr_t) &ip6n);
/*
* Check that the inner source address is the same as
* the proxy address, if available.
*/
if ((saidx->proxy.sa.sa_family == AF_INET6 &&
!IN6_IS_ADDR_UNSPECIFIED(&saidx->proxy.sin6.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&ip6n.ip6_src,
&saidx->proxy.sin6.sin6_addr)) ||
(saidx->proxy.sa.sa_family != AF_INET6 &&
saidx->proxy.sa.sa_family != 0)) {
DPRINTF(("%s: inner source address %s doesn't "
"correspond to expected proxy source %s, "
"SA %s/%08lx\n", __func__,
ip6_sprintf(ip6buf, &ip6n.ip6_src),
ipsec_address(&saidx->proxy),
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
IPSEC_ISTAT(sproto, V_espstat.esps_pdrops,
V_ahstat.ahs_pdrops,
V_ipcompstat.ipcomps_pdrops);
error = EACCES;
goto bad;
}
}
#endif /* INET6 */
#endif /*XXX*/
/*
* Record what we've done to the packet (under what SA it was
* processed). If we've been passed an mtag, it means the packet
* was already processed by an ethernet/crypto combo card and
* thus has a tag attached with all the right information, but
* with a PACKET_TAG_IPSEC_IN_CRYPTO_DONE as opposed to
* PACKET_TAG_IPSEC_IN_DONE type; in that case, just change the type.
*/
if (mt == NULL && sproto != IPPROTO_IPCOMP) {
mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE,
sizeof(struct tdb_ident), M_NOWAIT);
if (mtag == NULL) {
DPRINTF(("%s: failed to get tag\n", __func__));
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops, V_ipcompstat.ipcomps_hdrops);
error = ENOMEM;
goto bad;
}
tdbi = (struct tdb_ident *)(mtag + 1);
bcopy(&saidx->dst, &tdbi->dst, saidx->dst.sa.sa_len);
tdbi->proto = sproto;
tdbi->spi = sav->spi;
/* Cache those two for enc(4) in xform_ipip. */
tdbi->alg_auth = sav->alg_auth;
tdbi->alg_enc = sav->alg_enc;
m_tag_prepend(m, mtag);
} else if (mt != NULL) {
mt->m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
/* XXX do we need to mark m_flags??? */
}
key_sa_recordxfer(sav, m); /* record data transfer */
#ifdef DEV_ENC
encif->if_ipackets++;
encif->if_ibytes += m->m_pkthdr.len;
/*
* Pass the mbuf to enc0 for bpf and pfil. We will filter the IPIP
* packet later after it has been decapsulated.
*/
ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_BEFORE);
if (prot != IPPROTO_IPIP)
if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0)
return (error);
#endif
/*
* Re-dispatch via software interrupt.
*/
if ((error = netisr_queue(NETISR_IP, m))) {
IPSEC_ISTAT(sproto, V_espstat.esps_qfull, V_ahstat.ahs_qfull,
V_ipcompstat.ipcomps_qfull);
DPRINTF(("%s: queue full; proto %u packet dropped\n",
__func__, sproto));
return error;
}
return 0;
bad:
m_freem(m);
return error;
}
void
ipsec4_common_ctlinput(int cmd, struct sockaddr *sa, void *v, int proto)
{
/* XXX nothing just yet */
}
#endif /* INET */
#ifdef INET6
/* IPv6 AH wrapper. */
int
ipsec6_common_input(struct mbuf **mp, int *offp, int proto)
{
+ INIT_VNET_IPSEC(curvnet);
int l = 0;
int protoff;
struct ip6_ext ip6e;
if (*offp < sizeof(struct ip6_hdr)) {
DPRINTF(("%s: bad offset %u\n", __func__, *offp));
return IPPROTO_DONE;
} else if (*offp == sizeof(struct ip6_hdr)) {
protoff = offsetof(struct ip6_hdr, ip6_nxt);
} else {
/* Chase down the header chain... */
protoff = sizeof(struct ip6_hdr);
do {
protoff += l;
m_copydata(*mp, protoff, sizeof(ip6e),
(caddr_t) &ip6e);
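/*
 * Extension header lengths: AH counts ip6e_len in 32-bit
 * words not including the first two, while all other
 * extension headers count 64-bit units not including the
 * first; hence the two formulas below.
 */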
if (ip6e.ip6e_nxt == IPPROTO_AH)
l = (ip6e.ip6e_len + 2) << 2;
else
l = (ip6e.ip6e_len + 1) << 3;
IPSEC_ASSERT(l > 0, ("l went zero or negative"));
} while (protoff + l < *offp);
/* Malformed packet check */
if (protoff + l != *offp) {
DPRINTF(("%s: bad packet header chain, protoff %u, "
"l %u, off %u\n", __func__, protoff, l, *offp));
IPSEC_ISTAT(proto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
m_freem(*mp);
*mp = NULL;
return IPPROTO_DONE;
}
protoff += offsetof(struct ip6_ext, ip6e_nxt);
}
(void) ipsec_common_input(*mp, *offp, protoff, AF_INET6, proto);
return IPPROTO_DONE;
}
/*
* IPsec input callback, called by the transform callback. Takes care of
* filtering and other sanity checks on the processed packet.
*/
int
ipsec6_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff,
struct m_tag *mt)
{
+ INIT_VNET_INET6(curvnet);
+ INIT_VNET_IPSEC(curvnet);
int prot, af, sproto;
struct ip6_hdr *ip6;
struct m_tag *mtag;
struct tdb_ident *tdbi;
struct secasindex *saidx;
int nxt;
u_int8_t nxt8;
int error, nest;
#ifdef notyet
char ip6buf[INET6_ADDRSTRLEN];
#endif
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
saidx = &sav->sah->saidx;
af = saidx->dst.sa.sa_family;
IPSEC_ASSERT(af == AF_INET6, ("unexpected af %u", af));
sproto = saidx->proto;
IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
sproto == IPPROTO_IPCOMP,
("unexpected security protocol %u", sproto));
/* Sanity check */
if (m == NULL) {
DPRINTF(("%s: null mbuf", __func__));
IPSEC_ISTAT(sproto, V_espstat.esps_badkcr, V_ahstat.ahs_badkcr,
V_ipcompstat.ipcomps_badkcr);
error = EINVAL;
goto bad;
}
/* Fix IPv6 header */
if (m->m_len < sizeof(struct ip6_hdr) &&
(m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
DPRINTF(("%s: processing failed for SA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops, V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
error = EACCES;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
/* Save protocol; copy via nxt8 so the width and byte order are right. */
m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &nxt8);
prot = nxt8;
#ifdef notyet
#ifdef INET
/* IP-in-IP encapsulation */
if (prot == IPPROTO_IPIP) {
struct ip ipn;
if (m->m_pkthdr.len - skip < sizeof(struct ip)) {
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
error = EINVAL;
goto bad;
}
/* ipn will now contain the inner IPv4 header */
m_copydata(m, skip, sizeof(struct ip), (caddr_t) &ipn);
/*
* Check that the inner source address is the same as
* the proxy address, if available.
*/
if ((saidx->proxy.sa.sa_family == AF_INET &&
saidx->proxy.sin.sin_addr.s_addr != INADDR_ANY &&
ipn.ip_src.s_addr != saidx->proxy.sin.sin_addr.s_addr) ||
(saidx->proxy.sa.sa_family != AF_INET &&
saidx->proxy.sa.sa_family != 0)) {
DPRINTF(("%s: inner source address %s doesn't "
"correspond to expected proxy source %s, "
"SA %s/%08lx\n", __func__,
inet_ntoa4(ipn.ip_src),
ipsec_address(&saidx->proxy),
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
IPSEC_ISTAT(sproto, V_espstat.esps_pdrops,
V_ahstat.ahs_pdrops, V_ipcompstat.ipcomps_pdrops);
error = EACCES;
goto bad;
}
}
#endif /* INET */
/* IPv6-in-IP encapsulation */
if (prot == IPPROTO_IPV6) {
struct ip6_hdr ip6n;
if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) {
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops,
V_ipcompstat.ipcomps_hdrops);
error = EINVAL;
goto bad;
}
/* ip6n will now contain the inner IPv6 header. */
m_copydata(m, skip, sizeof(struct ip6_hdr),
(caddr_t) &ip6n);
/*
* Check that the inner source address is the same as
* the proxy address, if available.
*/
if ((saidx->proxy.sa.sa_family == AF_INET6 &&
!IN6_IS_ADDR_UNSPECIFIED(&saidx->proxy.sin6.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&ip6n.ip6_src,
&saidx->proxy.sin6.sin6_addr)) ||
(saidx->proxy.sa.sa_family != AF_INET6 &&
saidx->proxy.sa.sa_family != 0)) {
DPRINTF(("%s: inner source address %s doesn't "
"correspond to expected proxy source %s, "
"SA %s/%08lx\n", __func__,
ip6_sprintf(ip6buf, &ip6n.ip6_src),
ipsec_address(&saidx->proxy),
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
IPSEC_ISTAT(sproto, V_espstat.esps_pdrops,
V_ahstat.ahs_pdrops, V_ipcompstat.ipcomps_pdrops);
error = EACCES;
goto bad;
}
}
#endif /*XXX*/
/*
* Record what we've done to the packet (under what SA it was
* processed). If we've been passed an mtag, it means the packet
* was already processed by an ethernet/crypto combo card and
* thus has a tag attached with all the right information, but
* with a PACKET_TAG_IPSEC_IN_CRYPTO_DONE as opposed to
* PACKET_TAG_IPSEC_IN_DONE type; in that case, just change the type.
*/
if (mt == NULL && sproto != IPPROTO_IPCOMP) {
mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE,
sizeof(struct tdb_ident), M_NOWAIT);
if (mtag == NULL) {
DPRINTF(("%s: failed to get tag\n", __func__));
IPSEC_ISTAT(sproto, V_espstat.esps_hdrops,
V_ahstat.ahs_hdrops, V_ipcompstat.ipcomps_hdrops);
error = ENOMEM;
goto bad;
}
tdbi = (struct tdb_ident *)(mtag + 1);
bcopy(&saidx->dst, &tdbi->dst, sizeof(union sockaddr_union));
tdbi->proto = sproto;
tdbi->spi = sav->spi;
/* Cache those two for enc(4) in xform_ipip. */
tdbi->alg_auth = sav->alg_auth;
tdbi->alg_enc = sav->alg_enc;
m_tag_prepend(m, mtag);
} else {
if (mt != NULL)
mt->m_tag_id = PACKET_TAG_IPSEC_IN_DONE;
/* XXX do we need to mark m_flags??? */
}
key_sa_recordxfer(sav, m);
#ifdef DEV_ENC
encif->if_ipackets++;
encif->if_ibytes += m->m_pkthdr.len;
/*
* Pass the mbuf to enc0 for bpf and pfil. We will filter the IPIP
* packet later after it has been decapsulated.
*/
ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_BEFORE);
/* XXX-BZ does not make sense. */
if (prot != IPPROTO_IPIP)
if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0)
return (error);
#endif
/* Retrieve new protocol */
m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &nxt8);
/*
* See the end of ip6_input for this logic.
* The IPPROTO_IPV[46] cases are processed just like the others.
*/
nest = 0;
nxt = nxt8;
while (nxt != IPPROTO_DONE) {
if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
V_ip6stat.ip6s_toomanyhdr++;
error = EINVAL;
goto bad;
}
/*
* Protection against a faulty packet - there should be
* more sanity checks in header chain processing.
*/
if (m->m_pkthdr.len < skip) {
V_ip6stat.ip6s_tooshort++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
error = EINVAL;
goto bad;
}
/*
* Enforce IPsec policy checking if we are seeing the last header.
* Note that we do not visit this for protocols with pcb-layer
* code, such as UDP/TCP/raw IP.
*/
if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 &&
ipsec6_in_reject(m, NULL)) {
error = EINVAL;
goto bad;
}
nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &skip, nxt);
}
return 0;
bad:
if (m)
m_freem(m);
return error;
}
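/*
 * A minimal sketch of the consumer side of the tag prepended above;
 * later hooks look the record up with m_tag_find().  Variable names
 * are illustrative only.
 */
#if 0
struct m_tag *mtag;
struct tdb_ident *tdbi;

mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
if (mtag != NULL) {
	tdbi = (struct tdb_ident *)(mtag + 1);
	/* tdbi->dst, tdbi->proto and tdbi->spi identify the SA used. */
}
#endif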
void
esp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
struct ip6ctlparam *ip6cp = NULL;
struct mbuf *m = NULL;
struct ip6_hdr *ip6;
int off;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
if ((unsigned)cmd >= PRC_NCMDS)
return;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
} else {
m = NULL;
ip6 = NULL;
off = 0; /* calm gcc */
}
if (ip6 != NULL) {
struct ip6ctlparam ip6cp1;
/*
* Notify the error to all possible sockets via pfctlinput2.
* Since the upper layer information (such as protocol type,
* source and destination ports) is embedded in the encrypted
* data and might have been cut, we can't directly call
* an upper layer ctlinput function. However, the pcbnotify
* function will consider source and destination addresses
* as well as the flow info value, and may be able to find
* some PCB that should be notified.
* Although pfctlinput2 will call esp6_ctlinput(), there is
* no possibility of an infinite loop of function calls,
* because we don't pass the inner IPv6 header.
*/
bzero(&ip6cp1, sizeof(ip6cp1));
ip6cp1.ip6c_src = ip6cp->ip6c_src;
pfctlinput2(cmd, sa, (void *)&ip6cp1);
/*
* Then go to special cases that need ESP header information.
* XXX: We assume that when ip6 is non-NULL,
* M and OFF are valid.
*/
if (cmd == PRC_MSGSIZE) {
struct secasvar *sav;
u_int32_t spi;
int valid;
/* check header length before using m_copydata */
if (m->m_pkthdr.len < off + sizeof (struct esp))
return;
m_copydata(m, off + offsetof(struct esp, esp_spi),
sizeof(u_int32_t), (caddr_t) &spi);
/*
* Check to see if we have a valid SA corresponding to
* the address in the ICMP message payload.
*/
sav = KEY_ALLOCSA((union sockaddr_union *)sa,
IPPROTO_ESP, spi);
valid = (sav != NULL);
if (sav)
KEY_FREESAV(&sav);
/* XXX Further validation? */
/*
* Depending on whether the SA is "valid" and
* routing table size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update(ip6cp, valid);
}
} else {
/* we normally notify any pcb here */
}
}
#endif /* INET6 */
Index: head/sys/netipsec/ipsec_mbuf.c
===================================================================
--- head/sys/netipsec/ipsec_mbuf.c (revision 183549)
+++ head/sys/netipsec/ipsec_mbuf.c (revision 183550)
@@ -1,325 +1,328 @@
/*-
* Copyright (c) 2002, 2003 Sam Leffler, Errno Consulting
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* IPsec-specific mbuf routines.
*/
#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/vimage.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netipsec/ipsec.h>
/*
* Make space for a new header of length hlen at skip bytes
* into the packet. When doing this we allocate new mbufs only
* when absolutely necessary. The mbuf where the new header
* is to go is returned together with an offset into the mbuf.
* If NULL is returned then the mbuf chain may have been modified;
* the caller is assumed to always free the chain.
*/
struct mbuf *
m_makespace(struct mbuf *m0, int skip, int hlen, int *off)
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *m;
unsigned remain;
IPSEC_ASSERT(m0 != NULL, ("null mbuf"));
IPSEC_ASSERT(hlen < MHLEN, ("hlen too big: %u", hlen));
for (m = m0; m && skip > m->m_len; m = m->m_next)
skip -= m->m_len;
if (m == NULL)
return (NULL);
/*
* At this point skip is the offset into the mbuf m
* where the new header should be placed. Figure out
* if there's space to insert the new header. If so,
* and copying the remainder makes sense then do so.
* Otherwise insert a new mbuf in the chain, splitting
* the contents of m as needed.
*/
remain = m->m_len - skip; /* data to move */
if (hlen > M_TRAILINGSPACE(m)) {
struct mbuf *n;
/* XXX code doesn't handle clusters XXX */
IPSEC_ASSERT(remain < MLEN, ("remainder too big: %u", remain));
/*
* Not enough space in m, split the contents
* of m, inserting new mbufs as required.
*
* NB: this ignores mbuf types.
*/
MGET(n, M_DONTWAIT, MT_DATA);
if (n == NULL)
return (NULL);
n->m_next = m->m_next; /* splice new mbuf */
m->m_next = n;
V_ipsec4stat.ips_mbinserted++;
if (hlen <= M_TRAILINGSPACE(m) + remain) {
/*
* New header fits in the old mbuf if we copy
* the remainder; just do the copy to the new
* mbuf and we're good to go.
*/
memcpy(mtod(n, caddr_t),
mtod(m, caddr_t) + skip, remain);
n->m_len = remain;
m->m_len = skip + hlen;
*off = skip;
} else {
/*
* No space in the old mbuf for the new header.
* Make space in the new mbuf and check the
* remaining data fits too. If not then we
* must allocate an additional mbuf (yech).
*/
n->m_len = 0;
if (remain + hlen > M_TRAILINGSPACE(n)) {
struct mbuf *n2;
MGET(n2, M_DONTWAIT, MT_DATA);
/* NB: new mbuf is on chain, let caller free */
if (n2 == NULL)
return (NULL);
n2->m_len = 0;
memcpy(mtod(n2, caddr_t),
mtod(m, caddr_t) + skip, remain);
n2->m_len = remain;
/* splice in second mbuf */
n2->m_next = n->m_next;
n->m_next = n2;
V_ipsec4stat.ips_mbinserted++;
} else {
memcpy(mtod(n, caddr_t) + hlen,
mtod(m, caddr_t) + skip, remain);
n->m_len += remain;
}
m->m_len -= remain;
n->m_len += hlen;
m = n; /* header is at front ... */
*off = 0; /* ... of new mbuf */
}
} else {
/*
* Copy the remainder to the back of the mbuf
* so there's space to write the new header.
*/
bcopy(mtod(m, caddr_t) + skip,
mtod(m, caddr_t) + skip + hlen, remain);
m->m_len += hlen;
*off = skip;
}
m0->m_pkthdr.len += hlen; /* adjust packet length */
return m;
}
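/*
 * A minimal usage sketch for m_makespace(), patterned on the AH/ESP
 * transforms: open an hlen-byte hole at offset skip and copy a header
 * into it.  'hdr' is a hypothetical header buffer.
 */
#if 0
struct mbuf *mi;
int roff;

mi = m_makespace(m, skip, hlen, &roff);
if (mi == NULL) {
	m_freem(m);		/* chain may have been modified */
	return (ENOBUFS);
}
bcopy(hdr, mtod(mi, caddr_t) + roff, hlen);	/* write the header */
#endif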
/*
* m_pad(m, n) pads <m> with <n> bytes at the end. The packet header
* length is updated, and a pointer to the first byte of the padding
* (which is guaranteed to be all in one mbuf) is returned.
*/
caddr_t
m_pad(struct mbuf *m, int n)
{
+ INIT_VNET_IPSEC(curvnet);
register struct mbuf *m0, *m1;
register int len, pad;
caddr_t retval;
if (n <= 0) { /* No stupid arguments. */
DPRINTF(("%s: pad length invalid (%d)\n", __func__, n));
m_freem(m);
return NULL;
}
len = m->m_pkthdr.len;
pad = n;
m0 = m;
while (m0->m_len < len) {
len -= m0->m_len;
m0 = m0->m_next;
}
if (m0->m_len != len) {
DPRINTF(("%s: length mismatch (should be %d instead of %d)\n",
__func__, m->m_pkthdr.len,
m->m_pkthdr.len + m0->m_len - len));
m_freem(m);
return NULL;
}
/* Check for zero-length trailing mbufs, and find the last one. */
for (m1 = m0; m1->m_next; m1 = m1->m_next) {
if (m1->m_next->m_len != 0) {
DPRINTF(("%s: length mismatch (should be %d instead "
"of %d)\n", __func__,
m->m_pkthdr.len,
m->m_pkthdr.len + m1->m_next->m_len));
m_freem(m);
return NULL;
}
m0 = m1->m_next;
}
if (pad > M_TRAILINGSPACE(m0)) {
/* Add an mbuf to the chain. */
MGET(m1, M_DONTWAIT, MT_DATA);
if (m1 == 0) {
m_freem(m0);
DPRINTF(("%s: unable to get extra mbuf\n", __func__));
return NULL;
}
m0->m_next = m1;
m0 = m1;
m0->m_len = 0;
}
retval = m0->m_data + m0->m_len;
m0->m_len += pad;
m->m_pkthdr.len += pad;
return retval;
}
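/*
 * A usage sketch for m_pad(), patterned on ESP trailer construction
 * (RFC 2406): append padlen pad bytes plus the pad-length and
 * next-header octets.  'padlen' and 'nxt' are assumed to have been
 * computed by the caller.
 */
#if 0
u_char *pad;
int i;

pad = (u_char *)m_pad(m, padlen + 2);
if (pad == NULL)
	return (ENOBUFS);	/* NB: m_pad() already freed the chain */
for (i = 0; i < padlen; i++)
	pad[i] = i + 1;		/* self-describing monotonic pad */
pad[padlen] = padlen;		/* pad length */
pad[padlen + 1] = nxt;		/* next header */
#endif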
/*
* Remove hlen data at offset skip in the packet. This is used by
* the protocols to strip protocol headers and associated data (e.g. IV,
* authenticator) on input.
*/
int
m_striphdr(struct mbuf *m, int skip, int hlen)
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *m1;
int roff;
/* Find beginning of header */
m1 = m_getptr(m, skip, &roff);
if (m1 == NULL)
return (EINVAL);
/* Remove the header and associated data from the mbuf. */
if (roff == 0) {
/* The header was at the beginning of the mbuf */
V_ipsec4stat.ips_input_front++;
m_adj(m1, hlen);
if ((m1->m_flags & M_PKTHDR) == 0)
m->m_pkthdr.len -= hlen;
} else if (roff + hlen >= m1->m_len) {
struct mbuf *mo;
/*
* Part or all of the header is at the end of this mbuf,
* so first let's remove the remainder of the header from
* the beginning of the remainder of the mbuf chain, if any.
*/
V_ipsec4stat.ips_input_end++;
if (roff + hlen > m1->m_len) {
/* Adjust the next mbuf by the remainder */
m_adj(m1->m_next, roff + hlen - m1->m_len);
/* The second mbuf is guaranteed not to have a pkthdr... */
m->m_pkthdr.len -= (roff + hlen - m1->m_len);
}
/* Now, let's unlink the mbuf chain for a second...*/
mo = m1->m_next;
m1->m_next = NULL;
/* ...and trim the end of the first part of the chain...sick */
m_adj(m1, -(m1->m_len - roff));
if ((m1->m_flags & M_PKTHDR) == 0)
m->m_pkthdr.len -= (m1->m_len - roff);
/* Finally, let's relink */
m1->m_next = mo;
} else {
/*
* The header lies in the "middle" of the mbuf; copy
* the remainder of the mbuf down over the header.
*/
V_ipsec4stat.ips_input_middle++;
bcopy(mtod(m1, u_char *) + roff + hlen,
mtod(m1, u_char *) + roff,
m1->m_len - (roff + hlen));
m1->m_len -= hlen;
m->m_pkthdr.len -= hlen;
}
return (0);
}
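/*
 * A sketch of the input-side call, as the ESP transform would make it
 * after decryption: drop the ESP header and IV sitting at offset skip.
 * 'ivlen' stands in for the transform's IV size.
 */
#if 0
error = m_striphdr(m, skip, sizeof(struct newesp) + ivlen);
if (error) {
	m_freem(m);
	return (error);
}
#endif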
/*
* Diagnostic routine to check mbuf alignment as required by the
* crypto device drivers (that use DMA).
*/
void
m_checkalignment(const char* where, struct mbuf *m0, int off, int len)
{
int roff;
struct mbuf *m = m_getptr(m0, off, &roff);
caddr_t addr;
if (m == NULL)
return;
printf("%s (off %u len %u): ", where, off, len);
addr = mtod(m, caddr_t) + roff;
do {
int mlen;
if (((uintptr_t) addr) & 3) {
printf("addr misaligned %p,", addr);
break;
}
mlen = m->m_len;
if (mlen > len)
mlen = len;
len -= mlen;
if (len && (mlen & 3)) {
printf("len mismatch %u,", mlen);
break;
}
m = m->m_next;
addr = m ? mtod(m, caddr_t) : NULL;
} while (m && len > 0);
for (m = m0; m; m = m->m_next)
printf(" [%p:%u]", mtod(m, caddr_t), m->m_len);
printf("\n");
}
Index: head/sys/netipsec/ipsec_output.c
===================================================================
--- head/sys/netipsec/ipsec_output.c (revision 183549)
+++ head/sys/netipsec/ipsec_output.c (revision 183550)
@@ -1,836 +1,843 @@
/*-
* Copyright (c) 2002, 2003 Sam Leffler, Errno Consulting
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* IPsec output processing.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_enc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/pfil.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_var.h>
#include <netinet/ip_ecn.h>
#ifdef INET6
#include <netinet6/ip6_ecn.h>
#endif
#include <netinet/ip6.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet/icmp6.h>
#endif
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/ah_var.h>
#include <netipsec/esp_var.h>
#include <netipsec/ipcomp_var.h>
#include <netipsec/xform.h>
#include <netipsec/key.h>
#include <netipsec/keydb.h>
#include <netipsec/key_debug.h>
#include <machine/in_cksum.h>
#ifdef DEV_ENC
#include <net/if_enc.h>
#endif
int
ipsec_process_done(struct mbuf *m, struct ipsecrequest *isr)
{
+ INIT_VNET_IPSEC(curvnet);
struct tdb_ident *tdbi;
struct m_tag *mtag;
struct secasvar *sav;
struct secasindex *saidx;
int error;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(isr != NULL, ("null ISR"));
sav = isr->sav;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
saidx = &sav->sah->saidx;
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
/* Fix the header length, for AH processing. */
mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
/* Fix the header length, for AH processing. */
if (m->m_pkthdr.len < sizeof (struct ip6_hdr)) {
error = ENXIO;
goto bad;
}
if (m->m_pkthdr.len - sizeof (struct ip6_hdr) > IPV6_MAXPACKET) {
/* No jumbogram support. */
error = ENXIO; /*?*/
goto bad;
}
mtod(m, struct ip6_hdr *)->ip6_plen =
htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown protocol family %u\n", __func__,
saidx->dst.sa.sa_family));
error = ENXIO;
goto bad;
}
/*
* Add a record of what we've done or what needs to be done to the
* packet.
*/
mtag = m_tag_get(PACKET_TAG_IPSEC_OUT_DONE,
sizeof(struct tdb_ident), M_NOWAIT);
if (mtag == NULL) {
DPRINTF(("%s: could not get packet tag\n", __func__));
error = ENOMEM;
goto bad;
}
tdbi = (struct tdb_ident *)(mtag + 1);
tdbi->dst = saidx->dst;
tdbi->proto = saidx->proto;
tdbi->spi = sav->spi;
m_tag_prepend(m, mtag);
/*
* If there's another (bundled) SA to apply, do so.
* Note that this puts a burden on the kernel stack size.
* If this is a problem we'll need to introduce a queue
* to set the packet on so we can unwind the stack before
* doing further processing.
*/
if (isr->next) {
V_ipsec4stat.ips_out_bundlesa++;
return ipsec4_process_packet(m, isr->next, 0, 0);
}
key_sa_recordxfer(sav, m); /* record data transfer */
/*
* We're done with IPsec processing, transmit the packet using the
* appropriate network protocol (IP or IPv6). SPD lookup will be
* performed again there.
*/
switch (saidx->dst.sa.sa_family) {
#ifdef INET
struct ip *ip;
case AF_INET:
ip = mtod(m, struct ip *);
ip->ip_len = ntohs(ip->ip_len);
ip->ip_off = ntohs(ip->ip_off);
return ip_output(m, NULL, NULL, IP_RAWOUTPUT, NULL, NULL);
#endif /* INET */
#ifdef INET6
case AF_INET6:
/*
* No byte-order massaging is needed; IPv6 header fields are
* always kept in network byte order.
*/
return ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
#endif /* INET6 */
}
panic("ipsec_process_done");
bad:
m_freem(m);
KEY_FREESAV(&sav);
return (error);
}
static struct ipsecrequest *
ipsec_nextisr(
struct mbuf *m,
struct ipsecrequest *isr,
int af,
struct secasindex *saidx,
int *error
)
{
#define IPSEC_OSTAT(x,y,z) (isr->saidx.proto == IPPROTO_ESP ? (x)++ : \
isr->saidx.proto == IPPROTO_AH ? (y)++ : (z)++)
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav;
IPSECREQUEST_LOCK_ASSERT(isr);
IPSEC_ASSERT(af == AF_INET || af == AF_INET6,
("invalid address family %u", af));
again:
/*
* Craft SA index to search for proper SA. Note that
* we only fill in unspecified SA peers for transport
* mode; for tunnel mode they must already be filled in.
*/
*saidx = isr->saidx;
if (isr->saidx.mode == IPSEC_MODE_TRANSPORT) {
/* Fill in unspecified SA peers only for transport mode */
if (af == AF_INET) {
struct sockaddr_in *sin;
struct ip *ip = mtod(m, struct ip *);
if (saidx->src.sa.sa_len == 0) {
sin = &saidx->src.sin;
sin->sin_len = sizeof(*sin);
sin->sin_family = AF_INET;
sin->sin_port = IPSEC_PORT_ANY;
sin->sin_addr = ip->ip_src;
}
if (saidx->dst.sa.sa_len == 0) {
sin = &saidx->dst.sin;
sin->sin_len = sizeof(*sin);
sin->sin_family = AF_INET;
sin->sin_port = IPSEC_PORT_ANY;
sin->sin_addr = ip->ip_dst;
}
} else {
struct sockaddr_in6 *sin6;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (saidx->src.sin6.sin6_len == 0) {
sin6 = (struct sockaddr_in6 *)&saidx->src;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = IPSEC_PORT_ANY;
sin6->sin6_addr = ip6->ip6_src;
if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
/* fix scope id for comparing SPD */
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id =
ntohs(ip6->ip6_src.s6_addr16[1]);
}
}
if (saidx->dst.sin6.sin6_len == 0) {
sin6 = (struct sockaddr_in6 *)&saidx->dst;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = IPSEC_PORT_ANY;
sin6->sin6_addr = ip6->ip6_dst;
if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) {
/* fix scope id for comparing SPD */
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id =
ntohs(ip6->ip6_dst.s6_addr16[1]);
}
}
}
}
/*
* Look up the SA and validate it.
*/
*error = key_checkrequest(isr, saidx);
if (*error != 0) {
/*
* IPsec processing is required, but no SA found.
* We assume that key_acquire() has been called
* to get/establish the SA. Discard the packet
* here, because it is the upper layer's
* responsibility to retransmit it.
*/
V_ipsec4stat.ips_out_nosa++;
goto bad;
}
sav = isr->sav;
if (sav == NULL) {
IPSEC_ASSERT(ipsec_get_reqlevel(isr) == IPSEC_LEVEL_USE,
("no SA found, but required; level %u",
ipsec_get_reqlevel(isr)));
IPSECREQUEST_UNLOCK(isr);
isr = isr->next;
/*
* If isr is NULL, we found a 'use' policy w/o SA.
* Return w/o error and w/o isr so we can drop out
* and continue w/o IPsec processing.
*/
if (isr == NULL)
return isr;
IPSECREQUEST_LOCK(isr);
goto again;
}
/*
* Check system global policy controls.
*/
if ((isr->saidx.proto == IPPROTO_ESP && !V_esp_enable) ||
(isr->saidx.proto == IPPROTO_AH && !V_ah_enable) ||
(isr->saidx.proto == IPPROTO_IPCOMP && !V_ipcomp_enable)) {
DPRINTF(("%s: IPsec outbound packet dropped due"
" to policy (check your sysctls)\n", __func__));
IPSEC_OSTAT(V_espstat.esps_pdrops, V_ahstat.ahs_pdrops,
V_ipcompstat.ipcomps_pdrops);
*error = EHOSTUNREACH;
goto bad;
}
/*
* Sanity check the SA contents for the caller
* before they invoke the xform output method.
*/
if (sav->tdb_xform == NULL) {
DPRINTF(("%s: no transform for SA\n", __func__));
IPSEC_OSTAT(V_espstat.esps_noxform, V_ahstat.ahs_noxform,
V_ipcompstat.ipcomps_noxform);
*error = EHOSTUNREACH;
goto bad;
}
return isr;
bad:
IPSEC_ASSERT(*error != 0, ("error return w/ no error code"));
IPSECREQUEST_UNLOCK(isr);
return NULL;
#undef IPSEC_OSTAT
}
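/*
 * A sketch of the caller contract for ipsec_nextisr(), mirroring
 * ipsec4_process_packet() below: NULL with *error set means drop;
 * NULL without an error means a 'use'-level policy matched without an
 * SA and the packet may continue unprotected; a non-NULL return comes
 * back with the request still locked.
 */
#if 0
IPSECREQUEST_LOCK(isr);
isr = ipsec_nextisr(m, isr, AF_INET, &saidx, &error);
if (isr == NULL) {
	if (error != 0)
		goto bad;	/* isr was already unlocked on failure */
	return EJUSTRETURN;	/* continue without IPsec */
}
/* ... use isr->sav; caller must IPSECREQUEST_UNLOCK(isr) ... */
#endif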
#ifdef INET
/*
* IPsec output logic for IPv4.
*/
int
ipsec4_process_packet(
struct mbuf *m,
struct ipsecrequest *isr,
int flags,
int tunalready)
{
+ INIT_VNET_IPSEC(curvnet);
struct secasindex saidx;
struct secasvar *sav;
struct ip *ip;
int error, i, off;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(isr != NULL, ("null isr"));
IPSECREQUEST_LOCK(isr); /* ensure SA contents don't change */
isr = ipsec_nextisr(m, isr, AF_INET, &saidx, &error);
if (isr == NULL) {
if (error != 0)
goto bad;
return EJUSTRETURN;
}
sav = isr->sav;
#ifdef DEV_ENC
encif->if_opackets++;
encif->if_obytes += m->m_pkthdr.len;
/* pass the mbuf to enc0 for bpf processing */
ipsec_bpf(m, sav, AF_INET, ENC_OUT|ENC_BEFORE);
/* pass the mbuf to enc0 for packet filtering */
if ((error = ipsec_filter(&m, PFIL_OUT, ENC_OUT|ENC_BEFORE)) != 0)
goto bad;
#endif
if (!tunalready) {
union sockaddr_union *dst = &sav->sah->saidx.dst;
int setdf;
/*
* Collect IP_DF state from the outer header.
*/
if (dst->sa.sa_family == AF_INET) {
if (m->m_len < sizeof (struct ip) &&
(m = m_pullup(m, sizeof (struct ip))) == NULL) {
error = ENOBUFS;
goto bad;
}
ip = mtod(m, struct ip *);
/* Honor system-wide control of how to handle IP_DF */
switch (V_ip4_ipsec_dfbit) {
case 0: /* clear in outer header */
case 1: /* set in outer header */
setdf = V_ip4_ipsec_dfbit;
break;
default: /* propagate to outer header */
setdf = ntohs(ip->ip_off & IP_DF);
break;
}
} else {
ip = NULL; /* keep compiler happy */
setdf = 0;
}
/* Do the appropriate encapsulation, if necessary */
if (isr->saidx.mode == IPSEC_MODE_TUNNEL || /* Tunnel requ'd */
dst->sa.sa_family != AF_INET || /* PF mismatch */
#if 0
(sav->flags & SADB_X_SAFLAGS_TUNNEL) || /* Tunnel requ'd */
sav->tdb_xform->xf_type == XF_IP4 || /* ditto */
#endif
(dst->sa.sa_family == AF_INET && /* Proxy */
dst->sin.sin_addr.s_addr != INADDR_ANY &&
dst->sin.sin_addr.s_addr != ip->ip_dst.s_addr)) {
struct mbuf *mp;
/* Fix IPv4 header checksum and length */
if (m->m_len < sizeof (struct ip) &&
(m = m_pullup(m, sizeof (struct ip))) == NULL) {
error = ENOBUFS;
goto bad;
}
ip = mtod(m, struct ip *);
ip->ip_len = htons(m->m_pkthdr.len);
ip->ip_sum = 0;
#ifdef _IP_VHL
if (ip->ip_vhl == IP_VHL_BORING)
ip->ip_sum = in_cksum_hdr(ip);
else
ip->ip_sum = in_cksum(m,
_IP_VHL_HL(ip->ip_vhl) << 2);
#else
ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
#endif
/* Encapsulate the packet */
error = ipip_output(m, isr, &mp, 0, 0);
if (mp == NULL && !error) {
/* Should never happen. */
DPRINTF(("%s: ipip_output returns no mbuf and "
"no error!", __func__));
error = EFAULT;
}
if (error) {
if (mp) {
/* XXX: Should never happen! */
m_freem(mp);
}
m = NULL; /* ipip_output() already freed it */
goto bad;
}
m = mp, mp = NULL;
/*
* ipip_output clears IP_DF in the new header. If
* we need to propagate IP_DF from the outer header,
* then we have to do it here.
*
* XXX shouldn't assume what ipip_output does.
*/
if (dst->sa.sa_family == AF_INET && setdf) {
if (m->m_len < sizeof (struct ip) &&
(m = m_pullup(m, sizeof (struct ip))) == NULL) {
error = ENOBUFS;
goto bad;
}
ip = mtod(m, struct ip *);
ip->ip_off = ntohs(ip->ip_off);
ip->ip_off |= IP_DF;
ip->ip_off = htons(ip->ip_off);
}
}
}
#ifdef DEV_ENC
/* pass the mbuf to enc0 for bpf processing */
ipsec_bpf(m, sav, AF_INET, ENC_OUT|ENC_AFTER);
/* pass the mbuf to enc0 for packet filtering */
if ((error = ipsec_filter(&m, PFIL_OUT, ENC_OUT|ENC_AFTER)) != 0)
goto bad;
#endif
/*
* Dispatch to the appropriate IPsec transform logic. The
* packet will be returned for transmission after crypto
* processing, etc. are completed. For encapsulation we
* bypass this call because of the explicit call done above
* (necessary to deal with IP_DF handling for IPv4).
*
* NB: m & sav are ``passed to caller'' who is responsible
* for reclaiming their resources.
*/
if (sav->tdb_xform->xf_type != XF_IP4) {
ip = mtod(m, struct ip *);
i = ip->ip_hl << 2;
off = offsetof(struct ip, ip_p);
error = (*sav->tdb_xform->xf_output)(m, isr, NULL, i, off);
} else {
error = ipsec_process_done(m, isr);
}
IPSECREQUEST_UNLOCK(isr);
return error;
bad:
if (isr)
IPSECREQUEST_UNLOCK(isr);
if (m)
m_freem(m);
return error;
}
#endif
#ifdef INET6
/*
* Chop IP6 header from the payload.
*/
static struct mbuf *
ipsec6_splithdr(struct mbuf *m)
{
struct mbuf *mh;
struct ip6_hdr *ip6;
int hlen;
IPSEC_ASSERT(m->m_len >= sizeof (struct ip6_hdr),
("first mbuf too short, len %u", m->m_len));
ip6 = mtod(m, struct ip6_hdr *);
hlen = sizeof(struct ip6_hdr);
if (m->m_len > hlen) {
MGETHDR(mh, M_DONTWAIT, MT_DATA);
if (!mh) {
m_freem(m);
return NULL;
}
M_MOVE_PKTHDR(mh, m);
MH_ALIGN(mh, hlen);
m->m_len -= hlen;
m->m_data += hlen;
mh->m_next = m;
m = mh;
m->m_len = hlen;
bcopy((caddr_t)ip6, mtod(m, caddr_t), hlen);
} else if (m->m_len < hlen) {
m = m_pullup(m, hlen);
if (!m)
return NULL;
}
return m;
}
/*
* IPsec output logic for IPv6, transport mode.
*/
int
ipsec6_output_trans(
struct ipsec_output_state *state,
u_char *nexthdrp,
struct mbuf *mprev,
struct secpolicy *sp,
int flags,
int *tun)
{
+ INIT_VNET_IPSEC(curvnet);
struct ipsecrequest *isr;
struct secasindex saidx;
int error = 0;
struct mbuf *m;
IPSEC_ASSERT(state != NULL, ("null state"));
IPSEC_ASSERT(state->m != NULL, ("null m"));
IPSEC_ASSERT(nexthdrp != NULL, ("null nexthdrp"));
IPSEC_ASSERT(mprev != NULL, ("null mprev"));
IPSEC_ASSERT(sp != NULL, ("null sp"));
IPSEC_ASSERT(tun != NULL, ("null tun"));
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("%s: applied SP\n", __func__);
kdebug_secpolicy(sp));
isr = sp->req;
if (isr->saidx.mode == IPSEC_MODE_TUNNEL) {
/* the rest will be handled by ipsec6_output_tunnel() */
*tun = 1; /* need tunnel-mode processing */
return 0;
}
*tun = 0;
m = state->m;
IPSECREQUEST_LOCK(isr); /* ensure SA contents don't change */
isr = ipsec_nextisr(m, isr, AF_INET6, &saidx, &error);
if (isr == NULL) {
if (error != 0) {
#ifdef notdef
/* XXX should notification be done for all errors ? */
/*
* Notify the fact that the packet is discarded
* to ourselves. I believe this is better than
* just silently discarding. (jinmei@kame.net)
* XXX: should we restrict the error to TCP packets?
* XXX: should we directly notify sockets via
* pfctlinputs?
*/
icmp6_error(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADMIN, 0);
m = NULL; /* NB: icmp6_error frees mbuf */
#endif
goto bad;
}
return EJUSTRETURN;
}
error = (*isr->sav->tdb_xform->xf_output)(m, isr, NULL,
sizeof (struct ip6_hdr),
offsetof(struct ip6_hdr,
ip6_nxt));
IPSECREQUEST_UNLOCK(isr);
return error;
bad:
if (isr)
IPSECREQUEST_UNLOCK(isr);
if (m)
m_freem(m);
state->m = NULL;
return error;
}
static int
ipsec6_encapsulate(struct mbuf *m, struct secasvar *sav)
{
+ INIT_VNET_IPSEC(curvnet);
struct ip6_hdr *oip6;
struct ip6_hdr *ip6;
size_t plen;
/* can't tunnel between different AFs */
if (sav->sah->saidx.src.sa.sa_family != AF_INET6 ||
sav->sah->saidx.dst.sa.sa_family != AF_INET6) {
m_freem(m);
return EINVAL;
}
IPSEC_ASSERT(m->m_len == sizeof (struct ip6_hdr),
("mbuf wrong size; len %u", m->m_len));
/*
* Grow the mbuf to accommodate the new IPv6 header.
*/
plen = m->m_pkthdr.len;
if (M_LEADINGSPACE(m->m_next) < sizeof(struct ip6_hdr)) {
struct mbuf *n;
MGET(n, M_DONTWAIT, MT_DATA);
if (!n) {
m_freem(m);
return ENOBUFS;
}
n->m_len = sizeof(struct ip6_hdr);
n->m_next = m->m_next;
m->m_next = n;
m->m_pkthdr.len += sizeof(struct ip6_hdr);
oip6 = mtod(n, struct ip6_hdr *);
} else {
m->m_next->m_len += sizeof(struct ip6_hdr);
m->m_next->m_data -= sizeof(struct ip6_hdr);
m->m_pkthdr.len += sizeof(struct ip6_hdr);
oip6 = mtod(m->m_next, struct ip6_hdr *);
}
ip6 = mtod(m, struct ip6_hdr *);
bcopy((caddr_t)ip6, (caddr_t)oip6, sizeof(struct ip6_hdr));
/* Fake link-local scope-class addresses */
if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src))
oip6->ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst))
oip6->ip6_dst.s6_addr16[1] = 0;
/* construct new IPv6 header. see RFC 2401 5.1.2.2 */
/* ECN consideration. */
ip6_ecn_ingress(V_ip6_ipsec_ecn, &ip6->ip6_flow, &oip6->ip6_flow);
if (plen < IPV6_MAXPACKET - sizeof(struct ip6_hdr))
ip6->ip6_plen = htons(plen);
else {
/* ip6->ip6_plen will be updated in ip6_output() */
}
ip6->ip6_nxt = IPPROTO_IPV6;
ip6->ip6_src = sav->sah->saidx.src.sin6.sin6_addr;
ip6->ip6_dst = sav->sah->saidx.dst.sin6.sin6_addr;
ip6->ip6_hlim = IPV6_DEFHLIM;
/* XXX Should ip6_src be updated later ? */
return 0;
}
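/*
 * ipsec6_encapsulate() asserts that the first mbuf holds exactly the
 * IPv6 header, so it must be paired with ipsec6_splithdr().  A sketch
 * of the pairing, as ipsec6_output_tunnel() below uses it:
 */
#if 0
m = ipsec6_splithdr(m);		/* isolate the inner IPv6 header */
if (m == NULL)
	return (ENOMEM);
error = ipsec6_encapsulate(m, isr->sav); /* prepend the outer header */
if (error)
	return (error);		/* NB: mbuf already freed on failure */
#endif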
/*
* IPsec output logic for IPv6, tunnel mode.
*/
int
ipsec6_output_tunnel(struct ipsec_output_state *state, struct secpolicy *sp, int flags)
{
+ INIT_VNET_INET6(curvnet);
+ INIT_VNET_IPSEC(curvnet);
struct ip6_hdr *ip6;
struct ipsecrequest *isr;
struct secasindex saidx;
int error;
struct sockaddr_in6* dst6;
struct mbuf *m;
IPSEC_ASSERT(state != NULL, ("null state"));
IPSEC_ASSERT(state->m != NULL, ("null m"));
IPSEC_ASSERT(sp != NULL, ("null sp"));
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("%s: applied SP\n", __func__);
kdebug_secpolicy(sp));
m = state->m;
/*
* transport mode ipsec (before the 1st tunnel mode) is already
* processed by ipsec6_output_trans().
*/
for (isr = sp->req; isr; isr = isr->next) {
if (isr->saidx.mode == IPSEC_MODE_TUNNEL)
break;
}
IPSECREQUEST_LOCK(isr); /* ensure SA contents don't change */
isr = ipsec_nextisr(m, isr, AF_INET6, &saidx, &error);
if (isr == NULL) {
if (error != 0)
goto bad;
return EJUSTRETURN;
}
#ifdef DEV_ENC
encif->if_opackets++;
encif->if_obytes += m->m_pkthdr.len;
/* pass the mbuf to enc0 for bpf processing */
ipsec_bpf(m, isr->sav, AF_INET6, ENC_OUT|ENC_BEFORE);
/* pass the mbuf to enc0 for packet filtering */
if ((error = ipsec_filter(&m, PFIL_OUT, ENC_OUT|ENC_BEFORE)) != 0)
goto bad;
#endif
/*
* The SA's status may change while we are referring to it,
* hence the splsoftnet() call.
*/
if (isr->saidx.mode == IPSEC_MODE_TUNNEL) {
/*
* build IPsec tunnel.
*/
/* XXX should be processed with other family */
if (isr->sav->sah->saidx.src.sa.sa_family != AF_INET6) {
ipseclog((LOG_ERR, "%s: family mismatched between "
"inner and outer, spi=%u\n", __func__,
ntohl(isr->sav->spi)));
V_ipsec6stat.ips_out_inval++;
error = EAFNOSUPPORT;
goto bad;
}
m = ipsec6_splithdr(m);
if (!m) {
V_ipsec6stat.ips_out_nomem++;
error = ENOMEM;
goto bad;
}
error = ipsec6_encapsulate(m, isr->sav);
if (error) {
m = NULL;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
state->ro = &isr->sav->sah->sa_route;
state->dst = (struct sockaddr *)&state->ro->ro_dst;
dst6 = (struct sockaddr_in6 *)state->dst;
if (state->ro->ro_rt
&& ((state->ro->ro_rt->rt_flags & RTF_UP) == 0
|| !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) {
RTFREE(state->ro->ro_rt);
state->ro->ro_rt = NULL;
}
if (state->ro->ro_rt == 0) {
bzero(dst6, sizeof(*dst6));
dst6->sin6_family = AF_INET6;
dst6->sin6_len = sizeof(*dst6);
dst6->sin6_addr = ip6->ip6_dst;
rtalloc(state->ro);
}
if (state->ro->ro_rt == 0) {
V_ip6stat.ip6s_noroute++;
V_ipsec6stat.ips_out_noroute++;
error = EHOSTUNREACH;
goto bad;
}
/* adjust state->dst if tunnel endpoint is offlink */
if (state->ro->ro_rt->rt_flags & RTF_GATEWAY) {
state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway;
dst6 = (struct sockaddr_in6 *)state->dst;
}
}
m = ipsec6_splithdr(m);
if (!m) {
V_ipsec6stat.ips_out_nomem++;
error = ENOMEM;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
#ifdef DEV_ENC
/* pass the mbuf to enc0 for bpf processing */
ipsec_bpf(m, isr->sav, AF_INET6, ENC_OUT|ENC_AFTER);
/* pass the mbuf to enc0 for packet filtering */
if ((error = ipsec_filter(&m, PFIL_OUT, ENC_OUT|ENC_AFTER)) != 0)
goto bad;
#endif
error = (*isr->sav->tdb_xform->xf_output)(m, isr, NULL,
sizeof (struct ip6_hdr),
offsetof(struct ip6_hdr, ip6_nxt));
IPSECREQUEST_UNLOCK(isr);
return error;
bad:
if (isr)
IPSECREQUEST_UNLOCK(isr);
if (m)
m_freem(m);
state->m = NULL;
return error;
}
#endif /*INET6*/
Index: head/sys/netipsec/key.c
===================================================================
--- head/sys/netipsec/key.c (revision 183549)
+++ head/sys/netipsec/key.c (revision 183550)
@@ -1,7373 +1,7446 @@
/* $FreeBSD$ */
/* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This code refers to RFC 2367.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/syslog.h>
+#include <sys/vimage.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#ifdef INET
#include <netinet/in_pcb.h>
#endif
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif /* INET6 */
#include <net/pfkeyv2.h>
#include <netipsec/keydb.h>
#include <netipsec/key.h>
#include <netipsec/keysock.h>
#include <netipsec/key_debug.h>
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/xform.h>
#include <machine/stdarg.h>
/* randomness */
#include <sys/random.h>
#include <sys/vimage.h>
#define FULLMASK 0xff
#define _BITS(bytes) ((bytes) << 3)
/*
* Note on SA reference counting:
* - SAs that are not in DEAD state will have (total external references + 1)
* in the reference count field. They cannot be freed and are
* referenced from the SA header.
* - SAs that are in DEAD state will have (total external references)
* in the reference count field. They are ready to be freed. The
* reference from the SA header is removed in key_delsav() when the
* reference count field hits 0 (i.e., no external references remain).
*/
u_int32_t key_debug_level = 0;
static u_int key_spi_trycnt = 1000;
static u_int32_t key_spi_minval = 0x100;
static u_int32_t key_spi_maxval = 0x0fffffff; /* XXX */
static u_int32_t policy_id = 0;
static u_int key_int_random = 60; /*interval to initialize randseed,1(m)*/
static u_int key_larval_lifetime = 30; /* interval to expire acquiring, 30(s)*/
static int key_blockacq_count = 10; /* counter for blocking SADB_ACQUIRE.*/
static int key_blockacq_lifetime = 20; /* lifetime for blocking SADB_ACQUIRE.*/
static int key_preferred_oldsa = 1; /* preferred old sa rather than new sa.*/
static u_int32_t acq_seq = 0;
static LIST_HEAD(_sptree, secpolicy) sptree[IPSEC_DIR_MAX]; /* SPD */
static struct mtx sptree_lock;
#define SPTREE_LOCK_INIT() \
mtx_init(&sptree_lock, "sptree", \
"fast ipsec security policy database", MTX_DEF)
#define SPTREE_LOCK_DESTROY() mtx_destroy(&sptree_lock)
#define SPTREE_LOCK() mtx_lock(&sptree_lock)
#define SPTREE_UNLOCK() mtx_unlock(&sptree_lock)
#define SPTREE_LOCK_ASSERT() mtx_assert(&sptree_lock, MA_OWNED)
static LIST_HEAD(_sahtree, secashead) sahtree; /* SAD */
static struct mtx sahtree_lock;
#define SAHTREE_LOCK_INIT() \
mtx_init(&sahtree_lock, "sahtree", \
"fast ipsec security association database", MTX_DEF)
#define SAHTREE_LOCK_DESTROY() mtx_destroy(&sahtree_lock)
#define SAHTREE_LOCK() mtx_lock(&sahtree_lock)
#define SAHTREE_UNLOCK() mtx_unlock(&sahtree_lock)
#define SAHTREE_LOCK_ASSERT() mtx_assert(&sahtree_lock, MA_OWNED)
/* registered list */
static LIST_HEAD(_regtree, secreg) regtree[SADB_SATYPE_MAX + 1];
static struct mtx regtree_lock;
#define REGTREE_LOCK_INIT() \
mtx_init(&regtree_lock, "regtree", "fast ipsec regtree", MTX_DEF)
#define REGTREE_LOCK_DESTROY() mtx_destroy(&regtree_lock)
#define REGTREE_LOCK() mtx_lock(&regtree_lock)
#define REGTREE_UNLOCK() mtx_unlock(&regtree_lock)
#define REGTREE_LOCK_ASSERT() mtx_assert(&regtree_lock, MA_OWNED)
static LIST_HEAD(_acqtree, secacq) acqtree; /* acquiring list */
static struct mtx acq_lock;
#define ACQ_LOCK_INIT() \
mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF)
#define ACQ_LOCK_DESTROY() mtx_destroy(&acq_lock)
#define ACQ_LOCK() mtx_lock(&acq_lock)
#define ACQ_UNLOCK() mtx_unlock(&acq_lock)
#define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED)
static LIST_HEAD(_spacqtree, secspacq) spacqtree; /* SP acquiring list */
static struct mtx spacq_lock;
#define SPACQ_LOCK_INIT() \
mtx_init(&spacq_lock, "spacqtree", \
"fast ipsec security policy acquire list", MTX_DEF)
#define SPACQ_LOCK_DESTROY() mtx_destroy(&spacq_lock)
#define SPACQ_LOCK() mtx_lock(&spacq_lock)
#define SPACQ_UNLOCK() mtx_unlock(&spacq_lock)
#define SPACQ_LOCK_ASSERT() mtx_assert(&spacq_lock, MA_OWNED)
/* search order for SAs */
static const u_int saorder_state_valid_prefer_old[] = {
SADB_SASTATE_DYING, SADB_SASTATE_MATURE,
};
static const u_int saorder_state_valid_prefer_new[] = {
SADB_SASTATE_MATURE, SADB_SASTATE_DYING,
};
static u_int saorder_state_alive[] = {
/* except DEAD */
SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL
};
static u_int saorder_state_any[] = {
SADB_SASTATE_MATURE, SADB_SASTATE_DYING,
SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD
};
static const int minsize[] = {
sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */
sizeof(struct sadb_sa), /* SADB_EXT_SA */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_SRC */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_DST */
sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_PROXY */
sizeof(struct sadb_key), /* SADB_EXT_KEY_AUTH */
sizeof(struct sadb_key), /* SADB_EXT_KEY_ENCRYPT */
sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_SRC */
sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_DST */
sizeof(struct sadb_sens), /* SADB_EXT_SENSITIVITY */
sizeof(struct sadb_prop), /* SADB_EXT_PROPOSAL */
sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_AUTH */
sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_ENCRYPT */
sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */
0, /* SADB_X_EXT_KMPRIVATE */
sizeof(struct sadb_x_policy), /* SADB_X_EXT_POLICY */
sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */
};
static const int maxsize[] = {
sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */
sizeof(struct sadb_sa), /* SADB_EXT_SA */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */
sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */
0, /* SADB_EXT_ADDRESS_SRC */
0, /* SADB_EXT_ADDRESS_DST */
0, /* SADB_EXT_ADDRESS_PROXY */
0, /* SADB_EXT_KEY_AUTH */
0, /* SADB_EXT_KEY_ENCRYPT */
0, /* SADB_EXT_IDENTITY_SRC */
0, /* SADB_EXT_IDENTITY_DST */
0, /* SADB_EXT_SENSITIVITY */
0, /* SADB_EXT_PROPOSAL */
0, /* SADB_EXT_SUPPORTED_AUTH */
0, /* SADB_EXT_SUPPORTED_ENCRYPT */
sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */
0, /* SADB_X_EXT_KMPRIVATE */
0, /* SADB_X_EXT_POLICY */
sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */
};
static int ipsec_esp_keymin = 256;
static int ipsec_esp_auth = 0;
static int ipsec_ah_keymin = 128;
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_key);
#endif
-SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW, \
- &key_debug_level, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_DEBUG_LEVEL, debug,
+ CTLFLAG_RW, key_debug_level, 0, "");
/* max number of attempts when choosing an SPI value */
-SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW, \
- &key_spi_trycnt, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec,_net_key, KEYCTL_SPI_TRY, spi_trycnt,
+ CTLFLAG_RW, key_spi_trycnt, 0, "");
/* minimum spi value to allocate automatically. */
-SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW, \
- &key_spi_minval, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MIN_VALUE,
+ spi_minval, CTLFLAG_RW, key_spi_minval, 0, "");
/* maximum spi value to allocate automatically. */
-SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW, \
- &key_spi_maxval, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_SPI_MAX_VALUE,
+ spi_maxval, CTLFLAG_RW, key_spi_maxval, 0, "");
/* interval to initialize randseed */
-SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW, \
- &key_int_random, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_RANDOM_INT,
+ int_random, CTLFLAG_RW, key_int_random, 0, "");
/* lifetime for larval SA */
-SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW, \
- &key_larval_lifetime, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_LARVAL_LIFETIME,
+ larval_lifetime, CTLFLAG_RW, key_larval_lifetime, 0, "");
/* counter for blocking to send SADB_ACQUIRE to IKEd */
-SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW, \
- &key_blockacq_count, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_COUNT,
+ blockacq_count, CTLFLAG_RW, key_blockacq_count, 0, "");
/* lifetime for blocking to send SADB_ACQUIRE to IKEd */
-SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, \
- &key_blockacq_lifetime, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_BLOCKACQ_LIFETIME,
+ blockacq_lifetime, CTLFLAG_RW, key_blockacq_lifetime, 0, "");
/* ESP auth */
-SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, \
- &ipsec_esp_auth, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_AUTH, esp_auth,
+ CTLFLAG_RW, ipsec_esp_auth, 0, "");
/* minimum ESP key length */
-SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, \
- &ipsec_esp_keymin, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_ESP_KEYMIN,
+ esp_keymin, CTLFLAG_RW, ipsec_esp_keymin, 0, "");
/* minimum AH key length */
-SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, \
- &ipsec_ah_keymin, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_AH_KEYMIN, ah_keymin,
+ CTLFLAG_RW, ipsec_ah_keymin, 0, "");
/* prefer old SA rather than new SA */
-SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa, CTLFLAG_RW,\
- &key_preferred_oldsa, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_key, KEYCTL_PREFERED_OLDSA,
+ preferred_oldsa, CTLFLAG_RW, key_preferred_oldsa, 0, "");
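/*
 * The SYSCTL_V_INT() conversions above name the variable instead of
 * taking its address, so the handler can resolve per-vnet storage at
 * access time.  A sketch of the indirection they rely on, under the
 * vimage macros of this era (symbol names illustrative):
 */
#if 0
#ifdef VIMAGE
#define V_key_debug_level VNET_IPSEC(key_debug_level) /* per-vnet copy */
#else
#define V_key_debug_level key_debug_level /* collapses to the global */
#endif
#endif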
#define __LIST_CHAINED(elm) \
(!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL))
#define LIST_INSERT_TAIL(head, elm, type, field) \
do {\
struct type *curelm = LIST_FIRST(head); \
if (curelm == NULL) {\
LIST_INSERT_HEAD(head, elm, field); \
} else { \
while (LIST_NEXT(curelm, field)) \
curelm = LIST_NEXT(curelm, field);\
LIST_INSERT_AFTER(curelm, elm, field);\
}\
} while (0)
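/*
 * queue.h LISTs carry no tail pointer, so this macro walks the list
 * (O(n)) to append while preserving insertion order.  A typical call,
 * appending a new SA to its state list (as key_newsav() does):
 */
#if 0
LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav,
    secasvar, chain);
#endif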
#define KEY_CHKSASTATE(head, sav, name) \
do { \
if ((head) != (sav)) { \
ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \
(name), (head), (sav))); \
continue; \
} \
} while (0)
#define KEY_CHKSPDIR(head, sp, name) \
do { \
if ((head) != (sp)) { \
ipseclog((LOG_DEBUG, "%s: direction mismatched (TREE=%d SP=%d), " \
"anyway continue.\n", \
(name), (head), (sp))); \
} \
} while (0)
MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association");
MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head");
MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy");
MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request");
MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous");
MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire");
MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire");
/*
* Set parameters into a secpolicyindex buffer.
* The caller must allocate the secpolicyindex buffer passed to this macro.
*/
#define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \
do { \
bzero((idx), sizeof(struct secpolicyindex)); \
(idx)->dir = (_dir); \
(idx)->prefs = (ps); \
(idx)->prefd = (pd); \
(idx)->ul_proto = (ulp); \
bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \
bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \
} while (0)
/*
* Set parameters into a secasindex buffer.
* The caller must allocate the secasindex buffer before calling this macro.
*/
#define KEY_SETSECASIDX(p, m, r, s, d, idx) \
do { \
bzero((idx), sizeof(struct secasindex)); \
(idx)->proto = (p); \
(idx)->mode = (m); \
(idx)->reqid = (r); \
bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \
bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \
} while (0)
/* key statistics */
struct _keystat {
u_long getspi_count; /* average number of attempts to get a new SPI */
} keystat;
struct sadb_msghdr {
struct sadb_msg *msg;
struct sadb_ext *ext[SADB_EXT_MAX + 1];
int extoff[SADB_EXT_MAX + 1];
int extlen[SADB_EXT_MAX + 1];
};
static struct secasvar *key_allocsa_policy __P((const struct secasindex *));
static void key_freesp_so __P((struct secpolicy **));
static struct secasvar *key_do_allocsa_policy __P((struct secashead *, u_int));
static void key_delsp __P((struct secpolicy *));
static struct secpolicy *key_getsp __P((struct secpolicyindex *));
static void _key_delsp(struct secpolicy *sp);
static struct secpolicy *key_getspbyid __P((u_int32_t));
static u_int32_t key_newreqid __P((void));
static struct mbuf *key_gather_mbuf __P((struct mbuf *,
const struct sadb_msghdr *, int, int, ...));
static int key_spdadd __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static u_int32_t key_getnewspid __P((void));
static int key_spddelete __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_spddelete2 __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_spdget __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_spdflush __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_spddump __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static struct mbuf *key_setdumpsp __P((struct secpolicy *,
u_int8_t, u_int32_t, u_int32_t));
static u_int key_getspreqmsglen __P((struct secpolicy *));
static int key_spdexpire __P((struct secpolicy *));
static struct secashead *key_newsah __P((struct secasindex *));
static void key_delsah __P((struct secashead *));
static struct secasvar *key_newsav __P((struct mbuf *,
const struct sadb_msghdr *, struct secashead *, int *,
const char*, int));
#define KEY_NEWSAV(m, sadb, sah, e) \
key_newsav(m, sadb, sah, e, __FILE__, __LINE__)
static void key_delsav __P((struct secasvar *));
static struct secashead *key_getsah __P((struct secasindex *));
static struct secasvar *key_checkspidup __P((struct secasindex *, u_int32_t));
static struct secasvar *key_getsavbyspi __P((struct secashead *, u_int32_t));
static int key_setsaval __P((struct secasvar *, struct mbuf *,
const struct sadb_msghdr *));
static int key_mature __P((struct secasvar *));
static struct mbuf *key_setdumpsa __P((struct secasvar *, u_int8_t,
u_int8_t, u_int32_t, u_int32_t));
static struct mbuf *key_setsadbmsg __P((u_int8_t, u_int16_t, u_int8_t,
u_int32_t, pid_t, u_int16_t));
static struct mbuf *key_setsadbsa __P((struct secasvar *));
static struct mbuf *key_setsadbaddr __P((u_int16_t,
const struct sockaddr *, u_int8_t, u_int16_t));
static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t));
static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t,
u_int32_t));
static struct seckey *key_dup_keymsg(const struct sadb_key *, u_int,
struct malloc_type *);
static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src,
struct malloc_type *type);
#ifdef INET6
static int key_ismyaddr6 __P((struct sockaddr_in6 *));
#endif
/* flags for key_cmpsaidx() */
#define CMP_HEAD 1 /* protocol, addresses. */
#define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */
#define CMP_REQID 3 /* additionally HEAD, reqid. */
#define CMP_EXACTLY 4 /* all elements. */
static int key_cmpsaidx
__P((const struct secasindex *, const struct secasindex *, int));
static int key_cmpspidx_exactly
__P((struct secpolicyindex *, struct secpolicyindex *));
static int key_cmpspidx_withmask
__P((struct secpolicyindex *, struct secpolicyindex *));
static int key_sockaddrcmp __P((const struct sockaddr *, const struct sockaddr *, int));
static int key_bbcmp __P((const void *, const void *, u_int));
static u_int16_t key_satype2proto __P((u_int8_t));
static u_int8_t key_proto2satype __P((u_int16_t));
static int key_getspi __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static u_int32_t key_do_getnewspi __P((struct sadb_spirange *,
struct secasindex *));
static int key_update __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
#ifdef IPSEC_DOSEQCHECK
static struct secasvar *key_getsavbyseq __P((struct secashead *, u_int32_t));
#endif
static int key_add __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_setident __P((struct secashead *, struct mbuf *,
const struct sadb_msghdr *));
static struct mbuf *key_getmsgbuf_x1 __P((struct mbuf *,
const struct sadb_msghdr *));
static int key_delete __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_get __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static void key_getcomb_setlifetime __P((struct sadb_comb *));
static struct mbuf *key_getcomb_esp __P((void));
static struct mbuf *key_getcomb_ah __P((void));
static struct mbuf *key_getcomb_ipcomp __P((void));
static struct mbuf *key_getprop __P((const struct secasindex *));
static int key_acquire __P((const struct secasindex *, struct secpolicy *));
static struct secacq *key_newacq __P((const struct secasindex *));
static struct secacq *key_getacq __P((const struct secasindex *));
static struct secacq *key_getacqbyseq __P((u_int32_t));
static struct secspacq *key_newspacq __P((struct secpolicyindex *));
static struct secspacq *key_getspacq __P((struct secpolicyindex *));
static int key_acquire2 __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_register __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_expire __P((struct secasvar *));
static int key_flush __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_dump __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_promisc __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *));
static int key_senderror __P((struct socket *, struct mbuf *, int));
static int key_validate_ext __P((const struct sadb_ext *, int));
static int key_align __P((struct mbuf *, struct sadb_msghdr *));
static struct mbuf *key_setlifetime(struct seclifetime *src,
u_int16_t exttype);
static struct mbuf *key_setkey(struct seckey *src, u_int16_t exttype);
#if 0
static const char *key_getfqdn __P((void));
static const char *key_getuserfqdn __P((void));
#endif
static void key_sa_chgstate __P((struct secasvar *, u_int8_t));
static struct mbuf *key_alloc_mbuf __P((int));
static __inline void
sa_initref(struct secasvar *sav)
{
refcount_init(&sav->refcnt, 1);
}
static __inline void
sa_addref(struct secasvar *sav)
{
refcount_acquire(&sav->refcnt);
IPSEC_ASSERT(sav->refcnt != 0, ("SA refcnt overflow"));
}
static __inline int
sa_delref(struct secasvar *sav)
{
IPSEC_ASSERT(sav->refcnt > 0, ("SA refcnt underflow"));
return (refcount_release(&sav->refcnt));
}
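/*
 * A sketch of the canonical reference pattern these helpers support,
 * patterned on the release path in this file: take a reference while
 * using the SA, and reclaim it only when the last reference drops.
 */
#if 0
sa_addref(sav);			/* take an external reference */
/* ... use sav ... */
if (sa_delref(sav))		/* true when the count hits zero */
	key_delsav(sav);	/* last reference: reclaim the SA */
#endif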
#define SP_ADDREF(p) do { \
(p)->refcnt++; \
IPSEC_ASSERT((p)->refcnt != 0, ("SP refcnt overflow")); \
} while (0)
#define SP_DELREF(p) do { \
IPSEC_ASSERT((p)->refcnt > 0, ("SP refcnt underflow")); \
(p)->refcnt--; \
} while (0)
/*
* Update the refcnt while holding the SPTREE lock.
*/
void
key_addref(struct secpolicy *sp)
{
SPTREE_LOCK();
SP_ADDREF(sp);
SPTREE_UNLOCK();
}
/*
* Return 0 when there are known to be no SPs for the specified
* direction. Otherwise return 1. This is used by IPsec code
* to optimize performance.
*/
int
key_havesp(u_int dir)
{
+ INIT_VNET_IPSEC(curvnet);
+
return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ?
LIST_FIRST(&V_sptree[dir]) != NULL : 1);
}
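/*
 * Minimal usage sketch for key_havesp() (hypothetical caller; KEY_ALLOCSP
 * wraps key_allocsp() with file/line bookkeeping):
 *
 *	if (!key_havesp(IPSEC_DIR_OUTBOUND))
 *		return NULL;		(empty SPD, nothing can match)
 *	sp = KEY_ALLOCSP(&spidx, IPSEC_DIR_OUTBOUND);
 */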
/* %%% IPsec policy management */
/*
* Allocate an SP for an OUTBOUND or INBOUND packet.
* Must call key_freesp() later.
* OUT: NULL: not found
* others: found and return the pointer.
*/
struct secpolicy *
key_allocsp(struct secpolicyindex *spidx, u_int dir, const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
IPSEC_ASSERT(spidx != NULL, ("null spidx"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s from %s:%u\n", __func__, where, tag));
/* get a SP entry */
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("*** objects\n");
kdebug_secpolicyindex(spidx));
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[dir], chain) {
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("*** in SPD\n");
kdebug_secpolicyindex(&sp->spidx));
if (sp->state == IPSEC_SPSTATE_DEAD)
continue;
if (key_cmpspidx_withmask(&sp->spidx, spidx))
goto found;
}
sp = NULL;
found:
if (sp) {
/* sanity check */
KEY_CHKSPDIR(sp->spidx.dir, dir, __func__);
/* found a SPD entry */
sp->lastused = time_second;
SP_ADDREF(sp);
}
SPTREE_UNLOCK();
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__,
sp, sp ? sp->id : 0, sp ? sp->refcnt : 0));
return sp;
}
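/*
 * Note: the lookup above is a linear scan of the per-direction chain;
 * the first live entry whose masked index matches wins, so precedence
 * among overlapping policies is simply their position in the list.
 */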
/*
* Allocate an SP for an OUTBOUND or INBOUND packet.
* Must call key_freesp() later.
* OUT: NULL: not found
* others: found and return the pointer.
*/
struct secpolicy *
key_allocsp2(u_int32_t spi,
union sockaddr_union *dst,
u_int8_t proto,
u_int dir,
const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
IPSEC_ASSERT(dst != NULL, ("null dst"));
IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND,
("invalid direction %u", dir));
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s from %s:%u\n", __func__, where, tag));
/* get a SP entry */
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("*** objects\n");
printf("spi %u proto %u dir %u\n", spi, proto, dir);
kdebug_sockaddr(&dst->sa));
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[dir], chain) {
KEYDEBUG(KEYDEBUG_IPSEC_DATA,
printf("*** in SPD\n");
kdebug_secpolicyindex(&sp->spidx));
if (sp->state == IPSEC_SPSTATE_DEAD)
continue;
/* compare simple values, then dst address */
if (sp->spidx.ul_proto != proto)
continue;
/* NB: spi's must exist and match */
if (!sp->req || !sp->req->sav || sp->req->sav->spi != spi)
continue;
if (key_sockaddrcmp(&sp->spidx.dst.sa, &dst->sa, 1) == 0)
goto found;
}
sp = NULL;
found:
if (sp) {
/* sanity check */
KEY_CHKSPDIR(sp->spidx.dir, dir, __func__);
/* found a SPD entry */
sp->lastused = time_second;
SP_ADDREF(sp);
}
SPTREE_UNLOCK();
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__,
sp, sp ? sp->id : 0, sp ? sp->refcnt : 0));
return sp;
}
/*
* return a policy that matches this particular inbound packet.
* XXX slow
*/
struct secpolicy *
key_gettunnel(const struct sockaddr *osrc,
const struct sockaddr *odst,
const struct sockaddr *isrc,
const struct sockaddr *idst,
const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
const int dir = IPSEC_DIR_INBOUND;
struct ipsecrequest *r1, *r2, *p;
struct secpolicyindex spidx;
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s from %s:%u\n", __func__, where, tag));
if (isrc->sa_family != idst->sa_family) {
ipseclog((LOG_ERR, "%s: protocol family mismatched %d != %d.\n",
__func__, isrc->sa_family, idst->sa_family));
sp = NULL;
goto done;
}
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[dir], chain) {
if (sp->state == IPSEC_SPSTATE_DEAD)
continue;
r1 = r2 = NULL;
for (p = sp->req; p; p = p->next) {
if (p->saidx.mode != IPSEC_MODE_TUNNEL)
continue;
r1 = r2;
r2 = p;
if (!r1) {
/* here we look at address matches only */
spidx = sp->spidx;
if (isrc->sa_len > sizeof(spidx.src) ||
idst->sa_len > sizeof(spidx.dst))
continue;
bcopy(isrc, &spidx.src, isrc->sa_len);
bcopy(idst, &spidx.dst, idst->sa_len);
if (!key_cmpspidx_withmask(&sp->spidx, &spidx))
continue;
} else {
if (key_sockaddrcmp(&r1->saidx.src.sa, isrc, 0) ||
key_sockaddrcmp(&r1->saidx.dst.sa, idst, 0))
continue;
}
if (key_sockaddrcmp(&r2->saidx.src.sa, osrc, 0) ||
key_sockaddrcmp(&r2->saidx.dst.sa, odst, 0))
continue;
goto found;
}
}
sp = NULL;
found:
if (sp) {
sp->lastused = time_second;
SP_ADDREF(sp);
}
SPTREE_UNLOCK();
done:
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__,
sp, sp ? sp->id : 0, sp ? sp->refcnt : 0));
return sp;
}
/*
* Allocate an SA entry for an *OUTBOUND* packet.
* Check each request entry in the SP and acquire an SA if needed.
* OUT: 0: there are valid requests.
* ENOENT: the policy may be valid, but the REQUIRE-level SA is still being acquired.
*/
int
key_checkrequest(struct ipsecrequest *isr, const struct secasindex *saidx)
{
+ INIT_VNET_IPSEC(curvnet);
u_int level;
int error;
IPSEC_ASSERT(isr != NULL, ("null isr"));
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT ||
saidx->mode == IPSEC_MODE_TUNNEL,
("unexpected policy %u", saidx->mode));
/*
* XXX guard against protocol callbacks from the crypto
* thread as they reference ipsecrequest.sav which we
* temporarily null out below. Need to rethink how we
* handle bundled SA's in the callback thread.
*/
IPSECREQUEST_LOCK_ASSERT(isr);
/* get current level */
level = ipsec_get_reqlevel(isr);
#if 0
/*
* We do allocate new SA only if the state of SA in the holder is
* SADB_SASTATE_DEAD. The SA for outbound must be the oldest.
*/
if (isr->sav != NULL) {
if (isr->sav->sah == NULL)
panic("%s: sah is null.\n", __func__);
if (isr->sav == (struct secasvar *)LIST_FIRST(
&isr->sav->sah->savtree[SADB_SASTATE_DEAD])) {
KEY_FREESAV(&isr->sav);
isr->sav = NULL;
}
}
#else
/*
* we free any SA stashed in the IPsec request because a different
* SA may be involved each time this request is checked, either
* because new SAs are being configured, or this request is
* associated with an unconnected datagram socket, or this request
* is associated with a system default policy.
*
* This operation may have a negative impact on performance.  We may
* want to check the cached SA carefully, rather than picking a new SA
* every time.
*/
if (isr->sav != NULL) {
KEY_FREESAV(&isr->sav);
isr->sav = NULL;
}
#endif
/*
* new SA allocation if no SA found.
* key_allocsa_policy should allocate the oldest SA available.
* See key_do_allocsa_policy(), and draft-jenkins-ipsec-rekeying-03.txt.
*/
if (isr->sav == NULL)
isr->sav = key_allocsa_policy(saidx);
/* When there is an SA. */
if (isr->sav != NULL) {
if (isr->sav->state != SADB_SASTATE_MATURE &&
isr->sav->state != SADB_SASTATE_DYING)
return EINVAL;
return 0;
}
/* there is no SA */
error = key_acquire(saidx, isr->sp);
if (error != 0) {
/* XXX What should I do ? */
ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n",
__func__, error));
return error;
}
if (level != IPSEC_LEVEL_REQUIRE) {
/* XXX sigh, the interface to this routine is botched */
IPSEC_ASSERT(isr->sav == NULL, ("unexpected SA"));
return 0;
} else {
return ENOENT;
}
}
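/*
 * Caller contract in brief: 0 means the request entries are satisfied
 * (or the level does not strictly require an SA yet); ENOENT means a
 * REQUIRE-level SA is still being acquired, e.g. a hypothetical caller:
 *
 *	error = key_checkrequest(isr, &saidx);
 *	if (error != 0)
 *		goto drop;		(KMD is still negotiating the SA)
 */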
/*
* Allocate an SA for a policy entry from the SAD.
* NOTE: only live SAD states are searched.
* OUT: NULL: not found.
* others: found and return the pointer.
*/
static struct secasvar *
key_allocsa_policy(const struct secasindex *saidx)
{
#define N(a) _ARRAYLEN(a)
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah;
struct secasvar *sav;
u_int stateidx, arraysize;
const u_int *state_valid;
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (sah->state == SADB_SASTATE_DEAD)
continue;
if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) {
if (V_key_preferred_oldsa) {
state_valid = saorder_state_valid_prefer_old;
arraysize = N(saorder_state_valid_prefer_old);
} else {
state_valid = saorder_state_valid_prefer_new;
arraysize = N(saorder_state_valid_prefer_new);
}
SAHTREE_UNLOCK();
goto found;
}
}
SAHTREE_UNLOCK();
return NULL;
found:
/* search valid state */
for (stateidx = 0; stateidx < arraysize; stateidx++) {
sav = key_do_allocsa_policy(sah, state_valid[stateidx]);
if (sav != NULL)
return sav;
}
return NULL;
#undef N
}
/*
* Search the SAD by direction, protocol, mode and state.
* called by key_allocsa_policy().
* OUT:
* NULL : not found
* others : found, pointer to a SA.
*/
static struct secasvar *
key_do_allocsa_policy(struct secashead *sah, u_int state)
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav, *nextsav, *candidate, *d;
/* initialize */
candidate = NULL;
SAHTREE_LOCK();
for (sav = LIST_FIRST(&sah->savtree[state]);
sav != NULL;
sav = nextsav) {
nextsav = LIST_NEXT(sav, chain);
/* sanity check */
KEY_CHKSASTATE(sav->state, state, __func__);
/* initialize */
if (candidate == NULL) {
candidate = sav;
continue;
}
/* Which SA is better? */
IPSEC_ASSERT(candidate->lft_c != NULL,
("null candidate lifetime"));
IPSEC_ASSERT(sav->lft_c != NULL, ("null sav lifetime"));
/* What is the best method to compare them? */
if (V_key_preferred_oldsa) {
if (candidate->lft_c->addtime >
sav->lft_c->addtime) {
candidate = sav;
}
continue;
/*NOTREACHED*/
}
/* prefer the new SA over the old one */
if (candidate->lft_c->addtime <
sav->lft_c->addtime) {
d = candidate;
candidate = sav;
} else
d = sav;
/*
* Prepare to delete the SA when there is a more
* suitable candidate and the lifetime of the SA is not
* permanent.
*/
if (d->lft_h->addtime != 0) {
struct mbuf *m, *result;
u_int8_t satype;
key_sa_chgstate(d, SADB_SASTATE_DEAD);
IPSEC_ASSERT(d->refcnt > 0, ("bogus ref count"));
satype = key_proto2satype(d->sah->saidx.proto);
if (satype == 0)
goto msgfail;
m = key_setsadbmsg(SADB_DELETE, 0,
satype, 0, 0, d->refcnt - 1);
if (!m)
goto msgfail;
result = m;
/* set sadb_address for saidx's. */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&d->sah->saidx.src.sa,
d->sah->saidx.src.sa.sa_len << 3,
IPSEC_ULPROTO_ANY);
if (!m)
goto msgfail;
m_cat(result, m);
/* set sadb_address for saidx's. */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&d->sah->saidx.dst.sa,
d->sah->saidx.dst.sa.sa_len << 3,
IPSEC_ULPROTO_ANY);
if (!m)
goto msgfail;
m_cat(result, m);
/* create SA extension */
m = key_setsadbsa(d);
if (!m)
goto msgfail;
m_cat(result, m);
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result,
sizeof(struct sadb_msg));
if (result == NULL)
goto msgfail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
if (key_sendup_mbuf(NULL, result,
KEY_SENDUP_REGISTERED))
goto msgfail;
msgfail:
KEY_FREESAV(&d);
}
}
if (candidate) {
sa_addref(candidate);
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s cause refcnt++:%d SA:%p\n",
__func__, candidate->refcnt, candidate));
}
SAHTREE_UNLOCK();
return candidate;
}
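/*
 * Selection example: given two SAs added at t=10 and t=20, the
 * key_preferred_oldsa path keeps the t=10 SA (smallest addtime);
 * otherwise the t=20 SA becomes the candidate and the loser is marked
 * DEAD and announced via SADB_DELETE, unless its hard lifetime is
 * permanent (lft_h->addtime == 0).
 */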
/*
* Allocate a usable SA entry for an *INBOUND* packet.
* Must call key_freesav() later.
* OUT: positive: pointer to a usable sav (i.e. MATURE or DYING state).
* NULL: not found, or an error occurred.
*
* In the comparison, no source address is used--for RFC2401 conformance.
* To quote, from section 4.1:
* A security association is uniquely identified by a triple consisting
* of a Security Parameter Index (SPI), an IP Destination Address, and a
* security protocol (AH or ESP) identifier.
* Note, however, that we do need to keep the source address in the
* IPsec SA; both the IKE and PF_KEY specifications assume that we do.
* This leaves us in a tricky situation.
*/
struct secasvar *
key_allocsa(
union sockaddr_union *dst,
u_int proto,
u_int32_t spi,
const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah;
struct secasvar *sav;
u_int stateidx, arraysize, state;
const u_int *saorder_state_valid;
IPSEC_ASSERT(dst != NULL, ("null dst address"));
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s from %s:%u\n", __func__, where, tag));
/*
* Search the SAD.
* XXX: the inner IP header ought to be checked somewhere, also
* when an IPsec tunnel packet is received.  But ESP tunnel mode
* is encrypted, so we cannot inspect the inner IP header.
*/
SAHTREE_LOCK();
if (V_key_preferred_oldsa) {
saorder_state_valid = saorder_state_valid_prefer_old;
arraysize = _ARRAYLEN(saorder_state_valid_prefer_old);
} else {
saorder_state_valid = saorder_state_valid_prefer_new;
arraysize = _ARRAYLEN(saorder_state_valid_prefer_new);
}
LIST_FOREACH(sah, &V_sahtree, chain) {
/* search valid state */
for (stateidx = 0; stateidx < arraysize; stateidx++) {
state = saorder_state_valid[stateidx];
LIST_FOREACH(sav, &sah->savtree[state], chain) {
/* sanity check */
KEY_CHKSASTATE(sav->state, state, __func__);
/* do not return entries w/ unusable state */
if (sav->state != SADB_SASTATE_MATURE &&
sav->state != SADB_SASTATE_DYING)
continue;
if (proto != sav->sah->saidx.proto)
continue;
if (spi != sav->spi)
continue;
#if 0 /* don't check src */
/* check src address */
if (key_sockaddrcmp(&src->sa, &sav->sah->saidx.src.sa, 0) != 0)
continue;
#endif
/* check dst address */
if (key_sockaddrcmp(&dst->sa, &sav->sah->saidx.dst.sa, 0) != 0)
continue;
sa_addref(sav);
goto done;
}
}
}
sav = NULL;
done:
SAHTREE_UNLOCK();
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s return SA:%p; refcnt %u\n", __func__,
sav, sav ? sav->refcnt : 0));
return sav;
}
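/*
 * Usage sketch (hypothetical input path): an ESP input routine that has
 * parsed the SPI from the packet resolves and later releases its SA:
 *
 *	sav = KEY_ALLOCSA(&dst, IPPROTO_ESP, spi);
 *	if (sav == NULL)
 *		goto drop;		(no SA for this SPI)
 *	... process packet ...
 *	KEY_FREESAV(&sav);
 */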
/*
* Must be called after calling key_allocsp().
* Used both for packets without a socket and by key_freeso().
*/
void
_key_freesp(struct secpolicy **spp, const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp = *spp;
IPSEC_ASSERT(sp != NULL, ("null sp"));
SPTREE_LOCK();
SP_DELREF(sp);
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s SP:%p (ID=%u) from %s:%u; refcnt now %u\n",
__func__, sp, sp->id, where, tag, sp->refcnt));
if (sp->refcnt == 0) {
*spp = NULL;
key_delsp(sp);
}
SPTREE_UNLOCK();
}
/*
* Must be called after calling key_allocsp().
* For the packet with socket.
*/
void
key_freeso(struct socket *so)
{
+ INIT_VNET_IPSEC(curvnet);
IPSEC_ASSERT(so != NULL, ("null so"));
switch (so->so_proto->pr_domain->dom_family) {
#ifdef INET
case PF_INET:
{
struct inpcb *pcb = sotoinpcb(so);
/* Does it have a PCB ? */
if (pcb == NULL)
return;
key_freesp_so(&pcb->inp_sp->sp_in);
key_freesp_so(&pcb->inp_sp->sp_out);
}
break;
#endif
#ifdef INET6
case PF_INET6:
{
#ifdef HAVE_NRL_INPCB
struct inpcb *pcb = sotoinpcb(so);
/* Does it have a PCB ? */
if (pcb == NULL)
return;
key_freesp_so(&pcb->inp_sp->sp_in);
key_freesp_so(&pcb->inp_sp->sp_out);
#else
struct in6pcb *pcb = sotoin6pcb(so);
/* Does it have a PCB ? */
if (pcb == NULL)
return;
key_freesp_so(&pcb->in6p_sp->sp_in);
key_freesp_so(&pcb->in6p_sp->sp_out);
#endif
}
break;
#endif /* INET6 */
default:
ipseclog((LOG_DEBUG, "%s: unknown address family=%d.\n",
__func__, so->so_proto->pr_domain->dom_family));
return;
}
}
static void
key_freesp_so(struct secpolicy **sp)
{
IPSEC_ASSERT(sp != NULL && *sp != NULL, ("null sp"));
if ((*sp)->policy == IPSEC_POLICY_ENTRUST ||
(*sp)->policy == IPSEC_POLICY_BYPASS)
return;
IPSEC_ASSERT((*sp)->policy == IPSEC_POLICY_IPSEC,
("invalid policy %u", (*sp)->policy));
KEY_FREESP(sp);
}
/*
* Must be called after calling key_allocsa().
* This function is called by key_freesp() to free some SA allocated
* for a policy.
*/
void
key_freesav(struct secasvar **psav, const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav = *psav;
IPSEC_ASSERT(sav != NULL, ("null sav"));
if (sa_delref(sav)) {
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n",
__func__, sav, ntohl(sav->spi), where, tag, sav->refcnt));
*psav = NULL;
key_delsav(sav);
} else {
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n",
__func__, sav, ntohl(sav->spi), where, tag, sav->refcnt));
}
}
/* %%% SPD management */
/*
* free security policy entry.
*/
static void
key_delsp(struct secpolicy *sp)
{
struct ipsecrequest *isr, *nextisr;
IPSEC_ASSERT(sp != NULL, ("null sp"));
SPTREE_LOCK_ASSERT();
sp->state = IPSEC_SPSTATE_DEAD;
IPSEC_ASSERT(sp->refcnt == 0,
("SP with references deleted (refcnt %u)", sp->refcnt));
/* remove from SP index */
if (__LIST_CHAINED(sp))
LIST_REMOVE(sp, chain);
for (isr = sp->req; isr != NULL; isr = nextisr) {
if (isr->sav != NULL) {
KEY_FREESAV(&isr->sav);
isr->sav = NULL;
}
nextisr = isr->next;
ipsec_delisr(isr);
}
_key_delsp(sp);
}
/*
* search SPD
* OUT: NULL : not found
* others : found, pointer to a SP.
*/
static struct secpolicy *
key_getsp(struct secpolicyindex *spidx)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
IPSEC_ASSERT(spidx != NULL, ("null spidx"));
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[spidx->dir], chain) {
if (sp->state == IPSEC_SPSTATE_DEAD)
continue;
if (key_cmpspidx_exactly(spidx, &sp->spidx)) {
SP_ADDREF(sp);
break;
}
}
SPTREE_UNLOCK();
return sp;
}
/*
* get SP by index.
* OUT: NULL : not found
* others : found, pointer to a SP.
*/
static struct secpolicy *
key_getspbyid(u_int32_t id)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_INBOUND], chain) {
if (sp->state == IPSEC_SPSTATE_DEAD)
continue;
if (sp->id == id) {
SP_ADDREF(sp);
goto done;
}
}
LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_OUTBOUND], chain) {
if (sp->state == IPSEC_SPSTATE_DEAD)
continue;
if (sp->id == id) {
SP_ADDREF(sp);
goto done;
}
}
done:
SPTREE_UNLOCK();
return sp;
}
struct secpolicy *
key_newsp(const char* where, int tag)
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *newsp = NULL;
newsp = (struct secpolicy *)
malloc(sizeof(struct secpolicy), M_IPSEC_SP, M_NOWAIT|M_ZERO);
if (newsp) {
SECPOLICY_LOCK_INIT(newsp);
newsp->refcnt = 1;
newsp->req = NULL;
}
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s from %s:%u return SP:%p\n", __func__,
where, tag, newsp));
return newsp;
}
static void
_key_delsp(struct secpolicy *sp)
{
SECPOLICY_LOCK_DESTROY(sp);
free(sp, M_IPSEC_SP);
}
/*
* create secpolicy structure from sadb_x_policy structure.
* NOTE: `state', `secpolicyindex' in secpolicy structure are not set,
* so they must be set properly later.
*/
struct secpolicy *
key_msg2sp(xpl0, len, error)
struct sadb_x_policy *xpl0;
size_t len;
int *error;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *newsp;
IPSEC_ASSERT(xpl0 != NULL, ("null xpl0"));
IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len));
if (len != PFKEY_EXTLEN(xpl0)) {
ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__));
*error = EINVAL;
return NULL;
}
if ((newsp = KEY_NEWSP()) == NULL) {
*error = ENOBUFS;
return NULL;
}
newsp->spidx.dir = xpl0->sadb_x_policy_dir;
newsp->policy = xpl0->sadb_x_policy_type;
/* check policy */
switch (xpl0->sadb_x_policy_type) {
case IPSEC_POLICY_DISCARD:
case IPSEC_POLICY_NONE:
case IPSEC_POLICY_ENTRUST:
case IPSEC_POLICY_BYPASS:
newsp->req = NULL;
break;
case IPSEC_POLICY_IPSEC:
{
int tlen;
struct sadb_x_ipsecrequest *xisr;
struct ipsecrequest **p_isr = &newsp->req;
/* validity check */
if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) {
ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n",
__func__));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0);
xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1);
while (tlen > 0) {
/* length check */
if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr)) {
ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest "
"length.\n", __func__));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
/* allocate request buffer */
/* NB: data structure is zero'd */
*p_isr = ipsec_newisr();
if ((*p_isr) == NULL) {
ipseclog((LOG_DEBUG,
"%s: No more memory.\n", __func__));
KEY_FREESP(&newsp);
*error = ENOBUFS;
return NULL;
}
/* set values */
switch (xisr->sadb_x_ipsecrequest_proto) {
case IPPROTO_ESP:
case IPPROTO_AH:
case IPPROTO_IPCOMP:
break;
default:
ipseclog((LOG_DEBUG,
"%s: invalid proto type=%u\n", __func__,
xisr->sadb_x_ipsecrequest_proto));
KEY_FREESP(&newsp);
*error = EPROTONOSUPPORT;
return NULL;
}
(*p_isr)->saidx.proto = xisr->sadb_x_ipsecrequest_proto;
switch (xisr->sadb_x_ipsecrequest_mode) {
case IPSEC_MODE_TRANSPORT:
case IPSEC_MODE_TUNNEL:
break;
case IPSEC_MODE_ANY:
default:
ipseclog((LOG_DEBUG,
"%s: invalid mode=%u\n", __func__,
xisr->sadb_x_ipsecrequest_mode));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
(*p_isr)->saidx.mode = xisr->sadb_x_ipsecrequest_mode;
switch (xisr->sadb_x_ipsecrequest_level) {
case IPSEC_LEVEL_DEFAULT:
case IPSEC_LEVEL_USE:
case IPSEC_LEVEL_REQUIRE:
break;
case IPSEC_LEVEL_UNIQUE:
/* validity check */
/*
* If the reqid is out of range, the kernel
* updates it rather than refusing the request.
*/
if (xisr->sadb_x_ipsecrequest_reqid
> IPSEC_MANUAL_REQID_MAX) {
ipseclog((LOG_DEBUG,
"%s: reqid=%d range "
"violation, updated by kernel.\n",
__func__,
xisr->sadb_x_ipsecrequest_reqid));
xisr->sadb_x_ipsecrequest_reqid = 0;
}
/* allocate a new reqid if reqid is zero. */
if (xisr->sadb_x_ipsecrequest_reqid == 0) {
u_int32_t reqid;
if ((reqid = key_newreqid()) == 0) {
KEY_FREESP(&newsp);
*error = ENOBUFS;
return NULL;
}
(*p_isr)->saidx.reqid = reqid;
xisr->sadb_x_ipsecrequest_reqid = reqid;
} else {
/* set it for manual keying. */
(*p_isr)->saidx.reqid =
xisr->sadb_x_ipsecrequest_reqid;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid level=%u\n",
__func__,
xisr->sadb_x_ipsecrequest_level));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
(*p_isr)->level = xisr->sadb_x_ipsecrequest_level;
/* set IP addresses if present */
if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) {
struct sockaddr *paddr;
paddr = (struct sockaddr *)(xisr + 1);
/* validity check */
if (paddr->sa_len
> sizeof((*p_isr)->saidx.src)) {
ipseclog((LOG_DEBUG, "%s: invalid "
"request address length.\n",
__func__));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
bcopy(paddr, &(*p_isr)->saidx.src,
paddr->sa_len);
paddr = (struct sockaddr *)((caddr_t)paddr
+ paddr->sa_len);
/* validity check */
if (paddr->sa_len
> sizeof((*p_isr)->saidx.dst)) {
ipseclog((LOG_DEBUG, "%s: invalid "
"request address length.\n",
__func__));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
bcopy(paddr, &(*p_isr)->saidx.dst,
paddr->sa_len);
}
(*p_isr)->sp = newsp;
/* set up for the next iteration. */
p_isr = &(*p_isr)->next;
tlen -= xisr->sadb_x_ipsecrequest_len;
/* validity check */
if (tlen < 0) {
ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n",
__func__));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr
+ xisr->sadb_x_ipsecrequest_len);
}
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__));
KEY_FREESP(&newsp);
*error = EINVAL;
return NULL;
}
*error = 0;
return newsp;
}
static u_int32_t
key_newreqid()
{
static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1;
auto_reqid = (auto_reqid == ~0
? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1);
/* XXX should be unique check */
return auto_reqid;
}
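/*
 * Wrap-around example, assuming IPSEC_MANUAL_REQID_MAX is 0x3fff as in
 * ipsec.h: the first automatic reqid handed out is 0x4001, and once the
 * counter hits 0xffffffff it restarts at 0x4000, never re-entering the
 * manual range 1..0x3fff.
 */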
/*
* Copy a secpolicy struct into the indicated sadb_x_policy structure.
*/
struct mbuf *
key_sp2msg(sp)
struct secpolicy *sp;
{
struct sadb_x_policy *xpl;
int tlen;
caddr_t p;
struct mbuf *m;
IPSEC_ASSERT(sp != NULL, ("null policy"));
tlen = key_getspreqmsglen(sp);
m = key_alloc_mbuf(tlen);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
return NULL;
}
m->m_len = tlen;
m->m_next = NULL;
xpl = mtod(m, struct sadb_x_policy *);
bzero(xpl, tlen);
xpl->sadb_x_policy_len = PFKEY_UNIT64(tlen);
xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
xpl->sadb_x_policy_type = sp->policy;
xpl->sadb_x_policy_dir = sp->spidx.dir;
xpl->sadb_x_policy_id = sp->id;
p = (caddr_t)xpl + sizeof(*xpl);
/* is this the policy for IPsec? */
if (sp->policy == IPSEC_POLICY_IPSEC) {
struct sadb_x_ipsecrequest *xisr;
struct ipsecrequest *isr;
for (isr = sp->req; isr != NULL; isr = isr->next) {
xisr = (struct sadb_x_ipsecrequest *)p;
xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto;
xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode;
xisr->sadb_x_ipsecrequest_level = isr->level;
xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid;
p += sizeof(*xisr);
bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len);
p += isr->saidx.src.sa.sa_len;
bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len);
p += isr->saidx.dst.sa.sa_len;
xisr->sadb_x_ipsecrequest_len =
PFKEY_ALIGN8(sizeof(*xisr)
+ isr->saidx.src.sa.sa_len
+ isr->saidx.dst.sa.sa_len);
}
}
return m;
}
/* m will not be freed nor modified */
static struct mbuf *
#ifdef __STDC__
key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp,
int ndeep, int nitem, ...)
#else
key_gather_mbuf(m, mhp, ndeep, nitem, va_alist)
struct mbuf *m;
const struct sadb_msghdr *mhp;
int ndeep;
int nitem;
va_dcl
#endif
{
va_list ap;
int idx;
int i;
struct mbuf *result = NULL, *n;
int len;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
va_start(ap, nitem);
for (i = 0; i < nitem; i++) {
idx = va_arg(ap, int);
if (idx < 0 || idx > SADB_EXT_MAX)
goto fail;
/* don't attempt to pull empty extension */
if (idx == SADB_EXT_RESERVED && mhp->msg == NULL)
continue;
if (idx != SADB_EXT_RESERVED &&
(mhp->ext[idx] == NULL || mhp->extlen[idx] == 0))
continue;
if (idx == SADB_EXT_RESERVED) {
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len));
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (!n)
goto fail;
n->m_len = len;
n->m_next = NULL;
m_copydata(m, 0, sizeof(struct sadb_msg),
mtod(n, caddr_t));
} else if (i < ndeep) {
len = mhp->extlen[idx];
n = key_alloc_mbuf(len);
if (!n || n->m_next) { /*XXX*/
if (n)
m_freem(n);
goto fail;
}
m_copydata(m, mhp->extoff[idx], mhp->extlen[idx],
mtod(n, caddr_t));
} else {
n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx],
M_DONTWAIT);
}
if (n == NULL)
goto fail;
if (result)
m_cat(result, n);
else
result = n;
}
va_end(ap);
if ((result->m_flags & M_PKTHDR) != 0) {
result->m_pkthdr.len = 0;
for (n = result; n; n = n->m_next)
result->m_pkthdr.len += n->m_len;
}
return result;
fail:
m_freem(result);
return NULL;
}
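/*
 * The ndeep argument decides how the gathered pieces are materialized:
 * items with position < ndeep are deep-copied into fresh, writable
 * mbufs; later items merely reference the source via m_copym().  E.g.
 * key_spdadd() below passes ndeep == 2 so the policy extension is a
 * private copy, which it then edits (the new policy id is stored into
 * it), while the address extensions stay shared read-only.
 */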
/*
* SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing
* add an entry to the SP database when
* <base, address(SD), (lifetime(H),) policy>
* is received from the user(?),
* and send
* <base, address(SD), (lifetime(H),) policy>
* back to the socket which sent it.
*
* SPDADD sets a unique policy entry.
* SPDSETIDX is like SPDADD but without the policy request part.
* SPDUPDATE replaces a unique policy entry.
*
* m will always be freed.
*/
static int
key_spdadd(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_address *src0, *dst0;
struct sadb_x_policy *xpl0, *xpl;
struct sadb_lifetime *lft = NULL;
struct secpolicyindex spidx;
struct secpolicy *newsp;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
mhp->ext[SADB_X_EXT_POLICY] == NULL) {
ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n"));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL) {
if (mhp->extlen[SADB_EXT_LIFETIME_HARD]
< sizeof(struct sadb_lifetime)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
/* make secindex */
/* XXX boundary check against sa_len */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&spidx);
/* check the direction. */
switch (xpl0->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
case IPSEC_DIR_OUTBOUND:
break;
default:
ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__));
mhp->msg->sadb_msg_errno = EINVAL;
return 0;
}
/* check policy */
/* key_spdadd() accepts DISCARD, NONE and IPSEC. */
if (xpl0->sadb_x_policy_type == IPSEC_POLICY_ENTRUST
|| xpl0->sadb_x_policy_type == IPSEC_POLICY_BYPASS) {
ipseclog((LOG_DEBUG, "%s: Invalid policy type.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* policy requests are mandatory when action is ipsec. */
if (mhp->msg->sadb_msg_type != SADB_X_SPDSETIDX
&& xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC
&& mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) {
ipseclog((LOG_DEBUG, "%s: some policy requests part required\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/*
* check whether an SP already exists or not.
* SPDUPDATE doesn't depend on whether one does.
* If the type is either SPDADD or SPDSETIDX and an SP is found,
* then it is an error.
*/
newsp = key_getsp(&spidx);
if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) {
if (newsp) {
newsp->state = IPSEC_SPSTATE_DEAD;
KEY_FREESP(&newsp);
}
} else {
if (newsp != NULL) {
KEY_FREESP(&newsp);
ipseclog((LOG_DEBUG, "%s: a SP entry exists already.\n",
__func__));
return key_senderror(so, m, EEXIST);
}
}
/* allocate a new SP entry */
if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) {
return key_senderror(so, m, error);
}
if ((newsp->id = key_getnewspid()) == 0) {
_key_delsp(newsp);
return key_senderror(so, m, ENOBUFS);
}
/* XXX boundary check against sa_len */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&newsp->spidx);
/* sanity check on addr pair */
if (((struct sockaddr *)(src0 + 1))->sa_family !=
((struct sockaddr *)(dst0+ 1))->sa_family) {
_key_delsp(newsp);
return key_senderror(so, m, EINVAL);
}
if (((struct sockaddr *)(src0 + 1))->sa_len !=
((struct sockaddr *)(dst0+ 1))->sa_len) {
_key_delsp(newsp);
return key_senderror(so, m, EINVAL);
}
#if 1
if (newsp->req && newsp->req->saidx.src.sa.sa_family) {
struct sockaddr *sa;
sa = (struct sockaddr *)(src0 + 1);
if (sa->sa_family != newsp->req->saidx.src.sa.sa_family) {
_key_delsp(newsp);
return key_senderror(so, m, EINVAL);
}
}
if (newsp->req && newsp->req->saidx.dst.sa.sa_family) {
struct sockaddr *sa;
sa = (struct sockaddr *)(dst0 + 1);
if (sa->sa_family != newsp->req->saidx.dst.sa.sa_family) {
_key_delsp(newsp);
return key_senderror(so, m, EINVAL);
}
}
#endif
newsp->created = time_second;
newsp->lastused = newsp->created;
newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0;
newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0;
newsp->refcnt = 1; /* do not reclaim until I say I do */
newsp->state = IPSEC_SPSTATE_ALIVE;
LIST_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, secpolicy, chain);
/* delete the entry in spacqtree */
if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) {
struct secspacq *spacq = key_getspacq(&spidx);
if (spacq != NULL) {
/* reset the counter so the timehandler will delete it. */
spacq->created = time_second;
spacq->count = 0;
SPACQ_UNLOCK();
}
}
{
struct mbuf *n, *mpolicy;
struct sadb_msg *newmsg;
int off;
/* create new sadb_msg to reply. */
if (lft) {
n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
} else {
n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
}
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(*newmsg)) {
n = m_pullup(n, sizeof(*newmsg));
if (!n)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
off = 0;
mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)),
sizeof(*xpl), &off);
if (mpolicy == NULL) {
/* n is already freed */
return key_senderror(so, m, ENOBUFS);
}
xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off);
if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) {
m_freem(n);
return key_senderror(so, m, EINVAL);
}
xpl->sadb_x_policy_id = newsp->id;
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* get new policy id.
* OUT:
* 0: failure.
* others: success.
*/
static u_int32_t
key_getnewspid()
{
+ INIT_VNET_IPSEC(curvnet);
u_int32_t newid = 0;
int count = V_key_spi_trycnt; /* XXX */
struct secpolicy *sp;
/* keep trying until an unused policy id is found */
while (count--) {
newid = (V_policy_id = (V_policy_id == ~0 ? 1 : V_policy_id + 1));
if ((sp = key_getspbyid(newid)) == NULL)
break;
KEY_FREESP(&sp);
}
if (count == 0 || newid == 0) {
ipseclog((LOG_DEBUG, "%s: to allocate policy id is failed.\n",
__func__));
return 0;
}
return newid;
}
/*
* SADB_SPDDELETE processing
* receive
* <base, address(SD), policy(*)>
* from the user(?), and set SADB_SASTATE_DEAD,
* and send,
* <base, address(SD), policy(*)>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spddelete(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_address *src0, *dst0;
struct sadb_x_policy *xpl0;
struct secpolicyindex spidx;
struct secpolicy *sp;
IPSEC_ASSERT(so != NULL, ("null so"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
mhp->ext[SADB_X_EXT_POLICY] == NULL) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY];
/* make secindex */
/* XXX boundary check against sa_len */
KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir,
src0 + 1,
dst0 + 1,
src0->sadb_address_prefixlen,
dst0->sadb_address_prefixlen,
src0->sadb_address_proto,
&spidx);
/* check the direction. */
switch (xpl0->sadb_x_policy_dir) {
case IPSEC_DIR_INBOUND:
case IPSEC_DIR_OUTBOUND:
break;
default:
ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* Is there SP in SPD ? */
if ((sp = key_getsp(&spidx)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__));
return key_senderror(so, m, EINVAL);
}
/* save policy id to buffer to be returned. */
xpl0->sadb_x_policy_id = sp->id;
sp->state = IPSEC_SPSTATE_DEAD;
KEY_FREESP(&sp);
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_SPDDELETE2 processing
* receive
* <base, policy(*)>
* from the user(?), and set SADB_SASTATE_DEAD,
* and send,
* <base, policy(*)>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spddelete2(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
u_int32_t id;
struct secpolicy *sp;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (mhp->ext[SADB_X_EXT_POLICY] == NULL ||
mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__));
return key_senderror(so, m, EINVAL);
}
id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
/* Is there SP in SPD ? */
if ((sp = key_getspbyid(id)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id));
return key_senderror(so, m, EINVAL);
}
sp->state = IPSEC_SPSTATE_DEAD;
KEY_FREESP(&sp);
{
struct mbuf *n, *nn;
struct sadb_msg *newmsg;
int off, len;
/* create new sadb_msg to reply. */
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n && len > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)",
off, len));
n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY],
mhp->extlen[SADB_X_EXT_POLICY], M_DONTWAIT);
if (!n->m_next) {
m_freem(n);
return key_senderror(so, m, ENOBUFS);
}
n->m_pkthdr.len = 0;
for (nn = n; nn; nn = nn->m_next)
n->m_pkthdr.len += nn->m_len;
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_X_SPDGET processing
* receive
* <base, policy(*)>
* from the user(?),
* and send,
* <base, address(SD), policy>
* to the ikmpd.
* policy(*) includes the direction of the policy.
*
* m will always be freed.
*/
static int
key_spdget(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
u_int32_t id;
struct secpolicy *sp;
struct mbuf *n;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (mhp->ext[SADB_X_EXT_POLICY] == NULL ||
mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id;
/* Is there SP in SPD ? */
if ((sp = key_getspbyid(id)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id));
return key_senderror(so, m, ENOENT);
}
n = key_setdumpsp(sp, SADB_X_SPDGET, 0, mhp->msg->sadb_msg_pid);
if (n != NULL) {
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
} else
return key_senderror(so, m, ENOBUFS);
}
/*
* SADB_X_SPDACQUIRE processing.
* Acquire policy and SA(s) for an *OUTBOUND* packet.
* send
* <base, policy(*)>
* to KMD, and expect to receive
* <base> with SADB_X_SPDACQUIRE if an error occurred,
* or
* <base, policy>
* with SADB_X_SPDUPDATE from KMD by PF_KEY.
* policy(*) is without policy requests.
*
* 0 : succeed
* others: error number
*/
int
key_spdacquire(sp)
struct secpolicy *sp;
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *result = NULL, *m;
struct secspacq *newspacq;
IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
IPSEC_ASSERT(sp->req == NULL, ("policy exists"));
IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC,
("policy not IPSEC %u", sp->policy));
/* Get an entry to check whether we have already sent a message. */
newspacq = key_getspacq(&sp->spidx);
if (newspacq != NULL) {
if (V_key_blockacq_count < newspacq->count) {
/* reset the counter and send the message. */
newspacq->count = 0;
} else {
/* increment counter and do nothing. */
newspacq->count++;
return 0;
}
SPACQ_UNLOCK();
} else {
/* make a new entry to block repeated SADB_ACQUIRE sends. */
newspacq = key_newspacq(&sp->spidx);
if (newspacq == NULL)
return ENOBUFS;
}
/* create new sadb_msg to reply. */
m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0);
if (!m)
return ENOBUFS;
result = m;
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
}
/*
* SADB_SPDFLUSH processing
* receive
* <base>
* from the user, and free all entries in secpctree.
* and send,
* <base>
* to the user.
* NOTE: all this does is mark entries SADB_SASTATE_DEAD.
*
* m will always be freed.
*/
static int
key_spdflush(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_msg *newmsg;
struct secpolicy *sp;
u_int dir;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg)))
return key_senderror(so, m, EINVAL);
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[dir], chain)
sp->state = IPSEC_SPSTATE_DEAD;
SPTREE_UNLOCK();
}
if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
if (m->m_next)
m_freem(m->m_next);
m->m_next = NULL;
m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
newmsg = mtod(m, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
/*
* SADB_SPDDUMP processing
* receive
* <base>
* from the user, and dump all SP leaves
* and send,
* <base> .....
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_spddump(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secpolicy *sp;
int cnt;
u_int dir;
struct mbuf *n;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* count the SPD entries. */
cnt = 0;
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
LIST_FOREACH(sp, &V_sptree[dir], chain) {
cnt++;
}
}
if (cnt == 0)
return key_senderror(so, m, ENOENT);
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
LIST_FOREACH(sp, &V_sptree[dir], chain) {
--cnt;
n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt,
mhp->msg->sadb_msg_pid);
if (n)
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
m_freem(m);
return 0;
}
static struct mbuf *
key_setdumpsp(sp, type, seq, pid)
struct secpolicy *sp;
u_int8_t type;
u_int32_t seq, pid;
{
struct mbuf *result = NULL, *m;
struct seclifetime lt;
m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt);
if (!m)
goto fail;
result = m;
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sp->spidx.src.sa, sp->spidx.prefs,
sp->spidx.ul_proto);
if (!m)
goto fail;
m_cat(result, m);
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sp->spidx.dst.sa, sp->spidx.prefd,
sp->spidx.ul_proto);
if (!m)
goto fail;
m_cat(result, m);
m = key_sp2msg(sp);
if (!m)
goto fail;
m_cat(result, m);
if (sp->lifetime) {
lt.addtime = sp->created;
lt.usetime = sp->lastused;
m = key_setlifetime(&lt, SADB_EXT_LIFETIME_CURRENT);
if (!m)
goto fail;
m_cat(result, m);
lt.addtime = sp->lifetime;
lt.usetime = sp->validtime;
m = key_setlifetime(&lt, SADB_EXT_LIFETIME_HARD);
if (!m)
goto fail;
m_cat(result, m);
}
if ((result->m_flags & M_PKTHDR) == 0)
goto fail;
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL)
goto fail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return result;
fail:
m_freem(result);
return NULL;
}
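/*
 * Note: the CURRENT and HARD lifetime extensions above are emitted only
 * for policies with a finite lifetime (sp->lifetime != 0); permanent
 * policies are dumped as <base, address(SD), policy> alone.
 */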
/*
* get PFKEY message length for security policy and request.
*/
static u_int
key_getspreqmsglen(sp)
struct secpolicy *sp;
{
u_int tlen;
tlen = sizeof(struct sadb_x_policy);
/* is this the policy for IPsec? */
if (sp->policy != IPSEC_POLICY_IPSEC)
return tlen;
/* get length of ipsec requests */
{
struct ipsecrequest *isr;
int len;
for (isr = sp->req; isr != NULL; isr = isr->next) {
len = sizeof(struct sadb_x_ipsecrequest)
+ isr->saidx.src.sa.sa_len
+ isr->saidx.dst.sa.sa_len;
tlen += PFKEY_ALIGN8(len);
}
}
return tlen;
}
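/*
 * Worked example (struct sizes assumed for illustration only): for one
 * IPv4 transport-mode ESP request, with a 16-byte sadb_x_policy, a
 * 16-byte sadb_x_ipsecrequest and two 16-byte sockaddr_in's:
 *
 *	tlen = 16 + PFKEY_ALIGN8(16 + 16 + 16) = 16 + 48 = 64 bytes.
 */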
/*
* SADB_SPDEXPIRE processing
* send
* <base, address(SD), lifetime(CH), policy>
* to KMD by PF_KEY.
*
* OUT: 0 : succeed
* others : error number
*/
static int
key_spdexpire(sp)
struct secpolicy *sp;
{
struct mbuf *result = NULL, *m;
int len;
int error = -1;
struct sadb_lifetime *lt;
/* XXX: Why do we lock ? */
IPSEC_ASSERT(sp != NULL, ("null secpolicy"));
/* set msg header */
m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* create lifetime extension (current and hard) */
len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
m = key_alloc_mbuf(len);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
error = ENOBUFS;
goto fail;
}
bzero(mtod(m, caddr_t), len);
lt = mtod(m, struct sadb_lifetime *);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
lt->sadb_lifetime_allocations = 0;
lt->sadb_lifetime_bytes = 0;
lt->sadb_lifetime_addtime = sp->created;
lt->sadb_lifetime_usetime = sp->lastused;
lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD;
lt->sadb_lifetime_allocations = 0;
lt->sadb_lifetime_bytes = 0;
lt->sadb_lifetime_addtime = sp->lifetime;
lt->sadb_lifetime_usetime = sp->validtime;
m_cat(result, m);
/* set sadb_address for source */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sp->spidx.src.sa,
sp->spidx.prefs, sp->spidx.ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set sadb_address for destination */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sp->spidx.dst.sa,
sp->spidx.prefd, sp->spidx.ul_proto);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set secpolicy */
m = key_sp2msg(sp);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
/* %%% SAD management */
/*
* Allocate memory for a new SA head and copy in the values from mhp.
* OUT: NULL : failure due to the lack of memory.
* others : pointer to new SA head.
*/
static struct secashead *
key_newsah(saidx)
struct secasindex *saidx;
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *newsah;
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
newsah = malloc(sizeof(struct secashead), M_IPSEC_SAH, M_NOWAIT|M_ZERO);
if (newsah != NULL) {
int i;
for (i = 0; i < sizeof(newsah->savtree)/sizeof(newsah->savtree[0]); i++)
LIST_INIT(&newsah->savtree[i]);
newsah->saidx = *saidx;
/* add to saidxtree */
newsah->state = SADB_SASTATE_MATURE;
SAHTREE_LOCK();
LIST_INSERT_HEAD(&V_sahtree, newsah, chain);
SAHTREE_UNLOCK();
}
return(newsah);
}
/*
* Delete an SA index and all SAs registered under it.
*/
static void
key_delsah(sah)
struct secashead *sah;
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav, *nextsav;
u_int stateidx;
int zombie = 0;
IPSEC_ASSERT(sah != NULL, ("NULL sah"));
SAHTREE_LOCK_ASSERT();
/* search all SAs registered in the secindex. */
for (stateidx = 0;
stateidx < _ARRAYLEN(V_saorder_state_any);
stateidx++) {
u_int state = V_saorder_state_any[stateidx];
LIST_FOREACH_SAFE(sav, &sah->savtree[state], chain, nextsav) {
if (sav->refcnt == 0) {
/* sanity check */
KEY_CHKSASTATE(state, sav->state, __func__);
KEY_FREESAV(&sav);
} else {
/* give up on deleting this SA */
zombie++;
}
}
}
if (!zombie) { /* delete only if no SAs are still referenced */
/* remove from tree of SA index */
if (__LIST_CHAINED(sah))
LIST_REMOVE(sah, chain);
if (sah->sa_route.ro_rt) {
RTFREE(sah->sa_route.ro_rt);
sah->sa_route.ro_rt = (struct rtentry *)NULL;
}
free(sah, M_IPSEC_SAH);
}
}
/*
* Allocate a new SA in LARVAL state; called from key_add() and
* key_getspi(), copying the values of mhp into the new buffer.
* When the SADB message type is GETSPI:
* the sequence number is taken from acq_seq++,
* the SPI is set to zero, and
* key_setsaval() is not called.
* OUT: NULL : fail
* others : pointer to new secasvar.
*
* does not modify mbuf. does not free mbuf on error.
*/
static struct secasvar *
key_newsav(m, mhp, sah, errp, where, tag)
struct mbuf *m;
const struct sadb_msghdr *mhp;
struct secashead *sah;
int *errp;
const char* where;
int tag;
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *newsav;
const struct sadb_sa *xsa;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
IPSEC_ASSERT(sah != NULL, ("null secashead"));
newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT|M_ZERO);
if (newsav == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
*errp = ENOBUFS;
goto done;
}
switch (mhp->msg->sadb_msg_type) {
case SADB_GETSPI:
newsav->spi = 0;
#ifdef IPSEC_DOSEQCHECK
/* sync sequence number */
if (mhp->msg->sadb_msg_seq == 0)
newsav->seq =
(V_acq_seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq));
else
#endif
newsav->seq = mhp->msg->sadb_msg_seq;
break;
case SADB_ADD:
/* sanity check */
if (mhp->ext[SADB_EXT_SA] == NULL) {
free(newsav, M_IPSEC_SA);
newsav = NULL;
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
*errp = EINVAL;
goto done;
}
xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA];
newsav->spi = xsa->sadb_sa_spi;
newsav->seq = mhp->msg->sadb_msg_seq;
break;
default:
free(newsav, M_IPSEC_SA);
newsav = NULL;
*errp = EINVAL;
goto done;
}
/* copy sav values */
if (mhp->msg->sadb_msg_type != SADB_GETSPI) {
*errp = key_setsaval(newsav, m, mhp);
if (*errp) {
free(newsav, M_IPSEC_SA);
newsav = NULL;
goto done;
}
}
SECASVAR_LOCK_INIT(newsav);
/* reset created */
newsav->created = time_second;
newsav->pid = mhp->msg->sadb_msg_pid;
/* add to satree */
newsav->sah = sah;
sa_initref(newsav);
newsav->state = SADB_SASTATE_LARVAL;
/* XXX locking??? */
LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav,
secasvar, chain);
done:
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s from %s:%u return SP:%p\n", __func__,
where, tag, newsav));
return newsav;
}
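/*
 * Every SA created here starts in SADB_SASTATE_LARVAL and is promoted
 * later (see key_mature()).  For SADB_GETSPI the SPI is left at zero,
 * to be filled in once a real SPI is chosen, which is also why
 * key_setsaval() is skipped for that message type.
 */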
/*
* free() SA variable entry.
*/
static void
key_cleansav(struct secasvar *sav)
{
/*
* Cleanup xform state. Note that zeroize'ing causes the
* keys to be cleared; otherwise we must do it ourselves.
*/
if (sav->tdb_xform != NULL) {
sav->tdb_xform->xf_zeroize(sav);
sav->tdb_xform = NULL;
} else {
KASSERT(sav->iv == NULL, ("iv but no xform"));
if (sav->key_auth != NULL)
bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
if (sav->key_enc != NULL)
bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
}
if (sav->key_auth != NULL) {
if (sav->key_auth->key_data != NULL)
free(sav->key_auth->key_data, M_IPSEC_MISC);
free(sav->key_auth, M_IPSEC_MISC);
sav->key_auth = NULL;
}
if (sav->key_enc != NULL) {
if (sav->key_enc->key_data != NULL)
free(sav->key_enc->key_data, M_IPSEC_MISC);
free(sav->key_enc, M_IPSEC_MISC);
sav->key_enc = NULL;
}
if (sav->sched) {
bzero(sav->sched, sav->schedlen);
free(sav->sched, M_IPSEC_MISC);
sav->sched = NULL;
}
if (sav->replay != NULL) {
free(sav->replay, M_IPSEC_MISC);
sav->replay = NULL;
}
if (sav->lft_c != NULL) {
free(sav->lft_c, M_IPSEC_MISC);
sav->lft_c = NULL;
}
if (sav->lft_h != NULL) {
free(sav->lft_h, M_IPSEC_MISC);
sav->lft_h = NULL;
}
if (sav->lft_s != NULL) {
free(sav->lft_s, M_IPSEC_MISC);
sav->lft_s = NULL;
}
}
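/*
 * Note the ordering above: key material is zeroed first (either by the
 * xform's xf_zeroize or by the explicit bzero() calls) and only then
 * are the buffers released, so stale keys never linger in freed memory.
 */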
/*
* free() SA variable entry.
*/
static void
key_delsav(sav)
struct secasvar *sav;
{
IPSEC_ASSERT(sav != NULL, ("null sav"));
IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0", sav->refcnt));
/* remove from SA header */
if (__LIST_CHAINED(sav))
LIST_REMOVE(sav, chain);
key_cleansav(sav);
SECASVAR_LOCK_DESTROY(sav);
free(sav, M_IPSEC_SA);
}
/*
* search SAD.
* OUT:
* NULL : not found
* others : found, pointer to a SA.
*/
static struct secashead *
key_getsah(saidx)
struct secasindex *saidx;
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah;
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (sah->state == SADB_SASTATE_DEAD)
continue;
if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID))
break;
}
SAHTREE_UNLOCK();
return sah;
}
/*
* Check that the SPI is not a duplicate.
* NOTE: this function is slow because it searches the entire SAD.
* OUT:
* NULL : not found
* others : found, pointer to a SA.
*/
static struct secasvar *
key_checkspidup(saidx, spi)
struct secasindex *saidx;
u_int32_t spi;
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah;
struct secasvar *sav;
/* check address family */
if (saidx->src.sa.sa_family != saidx->dst.sa.sa_family) {
ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
__func__));
return NULL;
}
sav = NULL;
/* check all SAD */
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst))
continue;
sav = key_getsavbyspi(sah, spi);
if (sav != NULL)
break;
}
SAHTREE_UNLOCK();
return sav;
}
/*
* Search the SAD, limited to alive SAs, by protocol and SPI.
* OUT:
* NULL : not found
* others : found, pointer to a SA.
*/
static struct secasvar *
key_getsavbyspi(sah, spi)
struct secashead *sah;
u_int32_t spi;
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav;
u_int stateidx, state;
sav = NULL;
SAHTREE_LOCK_ASSERT();
/* search all states */
for (stateidx = 0;
stateidx < _ARRAYLEN(V_saorder_state_alive);
stateidx++) {
state = V_saorder_state_alive[stateidx];
LIST_FOREACH(sav, &sah->savtree[state], chain) {
/* sanity check */
if (sav->state != state) {
ipseclog((LOG_DEBUG, "%s: "
"invalid sav->state (queue: %d SA: %d)\n",
__func__, state, sav->state));
continue;
}
if (sav->spi == spi)
return sav;
}
}
return NULL;
}
/*
* copy SA values from PF_KEY message except *SPI, SEQ, PID, STATE and TYPE*.
* You must update these if needed.
* OUT: 0: success.
* !0: failure.
*
* does not modify mbuf. does not free mbuf on error.
*/
static int
key_setsaval(sav, m, mhp)
struct secasvar *sav;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
int error = 0;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* initialization */
sav->replay = NULL;
sav->key_auth = NULL;
sav->key_enc = NULL;
sav->sched = NULL;
sav->schedlen = 0;
sav->iv = NULL;
sav->lft_c = NULL;
sav->lft_h = NULL;
sav->lft_s = NULL;
sav->tdb_xform = NULL; /* transform */
sav->tdb_encalgxform = NULL; /* encoding algorithm */
sav->tdb_authalgxform = NULL; /* authentication algorithm */
sav->tdb_compalgxform = NULL; /* compression algorithm */
/* SA */
if (mhp->ext[SADB_EXT_SA] != NULL) {
const struct sadb_sa *sa0;
sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA];
if (mhp->extlen[SADB_EXT_SA] < sizeof(*sa0)) {
error = EINVAL;
goto fail;
}
sav->alg_auth = sa0->sadb_sa_auth;
sav->alg_enc = sa0->sadb_sa_encrypt;
sav->flags = sa0->sadb_sa_flags;
/* replay window */
if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) {
sav->replay = (struct secreplay *)
malloc(sizeof(struct secreplay)+sa0->sadb_sa_replay, M_IPSEC_MISC, M_NOWAIT|M_ZERO);
if (sav->replay == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
if (sa0->sadb_sa_replay != 0)
sav->replay->bitmap = (caddr_t)(sav->replay+1);
sav->replay->wsize = sa0->sadb_sa_replay;
}
}
/* Authentication keys */
if (mhp->ext[SADB_EXT_KEY_AUTH] != NULL) {
const struct sadb_key *key0;
int len;
key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH];
len = mhp->extlen[SADB_EXT_KEY_AUTH];
error = 0;
if (len < sizeof(*key0)) {
error = EINVAL;
goto fail;
}
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_AH:
case SADB_SATYPE_ESP:
case SADB_X_SATYPE_TCPSIGNATURE:
if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
sav->alg_auth != SADB_X_AALG_NULL)
error = EINVAL;
break;
case SADB_X_SATYPE_IPCOMP:
default:
error = EINVAL;
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n",
__func__));
goto fail;
}
sav->key_auth = (struct seckey *)key_dup_keymsg(key0, len,
M_IPSEC_MISC);
if (sav->key_auth == NULL ) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
}
/* Encryption key */
if (mhp->ext[SADB_EXT_KEY_ENCRYPT] != NULL) {
const struct sadb_key *key0;
int len;
key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT];
len = mhp->extlen[SADB_EXT_KEY_ENCRYPT];
error = 0;
if (len < sizeof(*key0)) {
error = EINVAL;
goto fail;
}
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_ESP:
if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) &&
sav->alg_enc != SADB_EALG_NULL) {
error = EINVAL;
break;
}
sav->key_enc = (struct seckey *)key_dup_keymsg(key0,
len,
M_IPSEC_MISC);
if (sav->key_enc == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
error = ENOBUFS;
goto fail;
}
break;
case SADB_X_SATYPE_IPCOMP:
if (len != PFKEY_ALIGN8(sizeof(struct sadb_key)))
error = EINVAL;
sav->key_enc = NULL; /*just in case*/
break;
case SADB_SATYPE_AH:
case SADB_X_SATYPE_TCPSIGNATURE:
default:
error = EINVAL;
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n",
__func__));
goto fail;
}
}
/* set iv */
sav->ivlen = 0;
switch (mhp->msg->sadb_msg_satype) {
case SADB_SATYPE_AH:
error = xform_init(sav, XF_AH);
break;
case SADB_SATYPE_ESP:
error = xform_init(sav, XF_ESP);
break;
case SADB_X_SATYPE_IPCOMP:
error = xform_init(sav, XF_IPCOMP);
break;
case SADB_X_SATYPE_TCPSIGNATURE:
error = xform_init(sav, XF_TCPSIGNATURE);
break;
}
if (error) {
ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n",
__func__, mhp->msg->sadb_msg_satype));
goto fail;
}
/* reset created */
sav->created = time_second;
/* make lifetime for CURRENT */
sav->lft_c = malloc(sizeof(struct seclifetime), M_IPSEC_MISC, M_NOWAIT);
if (sav->lft_c == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
error = ENOBUFS;
goto fail;
}
sav->lft_c->allocations = 0;
sav->lft_c->bytes = 0;
sav->lft_c->addtime = time_second;
sav->lft_c->usetime = 0;
/* lifetimes for HARD and SOFT */
{
const struct sadb_lifetime *lft0;
lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD];
if (lft0 != NULL) {
if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(*lft0)) {
error = EINVAL;
goto fail;
}
sav->lft_h = key_dup_lifemsg(lft0, M_IPSEC_MISC);
if (sav->lft_h == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
error = ENOBUFS;
goto fail;
}
/* to be initialized? */
}
lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_SOFT];
if (lft0 != NULL) {
if (mhp->extlen[SADB_EXT_LIFETIME_SOFT] < sizeof(*lft0)) {
error = EINVAL;
goto fail;
}
sav->lft_s = key_dup_lifemsg(lft0, M_IPSEC_MISC);
if (sav->lft_s == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
error = ENOBUFS;
goto fail;
}
/* to be initialized? */
}
}
return 0;
fail:
/* initialization */
key_cleansav(sav);
return error;
}
/*
* validate a secasvar entry, and set its state to SADB_SASTATE_MATURE.
* OUT: 0: valid
* other: errno
*/
static int
key_mature(struct secasvar *sav)
{
+ INIT_VNET_IPSEC(curvnet);
int error;
/* check SPI value */
switch (sav->sah->saidx.proto) {
case IPPROTO_ESP:
case IPPROTO_AH:
/*
* RFC 4302, 2.4. Security Parameters Index (SPI), SPI values
* 1-255 reserved by IANA for future use,
* 0 for implementation specific, local use.
*/
if (ntohl(sav->spi) <= 255) {
ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n",
__func__, (u_int32_t)ntohl(sav->spi)));
return EINVAL;
}
break;
}
/* check satype */
switch (sav->sah->saidx.proto) {
case IPPROTO_ESP:
/* check flags */
if ((sav->flags & (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) ==
(SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) {
ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
"given to old-esp.\n", __func__));
return EINVAL;
}
error = xform_init(sav, XF_ESP);
break;
case IPPROTO_AH:
/* check flags */
if (sav->flags & SADB_X_EXT_DERIV) {
ipseclog((LOG_DEBUG, "%s: invalid flag (derived) "
"given to AH SA.\n", __func__));
return EINVAL;
}
if (sav->alg_enc != SADB_EALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
return(EINVAL);
}
error = xform_init(sav, XF_AH);
break;
case IPPROTO_IPCOMP:
if (sav->alg_auth != SADB_AALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
return(EINVAL);
}
if ((sav->flags & SADB_X_EXT_RAWCPI) == 0
&& ntohl(sav->spi) >= 0x10000) {
ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n",
__func__));
return(EINVAL);
}
error = xform_init(sav, XF_IPCOMP);
break;
case IPPROTO_TCP:
if (sav->alg_enc != SADB_EALG_NONE) {
ipseclog((LOG_DEBUG, "%s: protocol and algorithm "
"mismated.\n", __func__));
return(EINVAL);
}
error = xform_init(sav, XF_TCPSIGNATURE);
break;
default:
ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__));
error = EPROTONOSUPPORT;
break;
}
if (error == 0) {
SAHTREE_LOCK();
key_sa_chgstate(sav, SADB_SASTATE_MATURE);
SAHTREE_UNLOCK();
}
return (error);
}
/*
* subroutine for SADB_GET and SADB_DUMP.
*/
static struct mbuf *
key_setdumpsa(sav, type, satype, seq, pid)
struct secasvar *sav;
u_int8_t type, satype;
u_int32_t seq, pid;
{
struct mbuf *result = NULL, *tres = NULL, *m;
int i;
int dumporder[] = {
SADB_EXT_SA, SADB_X_EXT_SA2,
SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC,
SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY, SADB_EXT_KEY_AUTH,
SADB_EXT_KEY_ENCRYPT, SADB_EXT_IDENTITY_SRC,
SADB_EXT_IDENTITY_DST, SADB_EXT_SENSITIVITY,
};
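/*
* dumporder[] is walked backwards below and each extension is
* prepended with m_cat(), so the finished chain follows the order
* listed above, right after the sadb_msg header.
*/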
m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt);
if (m == NULL)
goto fail;
result = m;
for (i = sizeof(dumporder)/sizeof(dumporder[0]) - 1; i >= 0; i--) {
m = NULL;
switch (dumporder[i]) {
case SADB_EXT_SA:
m = key_setsadbsa(sav);
if (!m)
goto fail;
break;
case SADB_X_EXT_SA2:
m = key_setsadbxsa2(sav->sah->saidx.mode,
sav->replay ? sav->replay->count : 0,
sav->sah->saidx.reqid);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_SRC:
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sav->sah->saidx.src.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_DST:
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sav->sah->saidx.dst.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m)
goto fail;
break;
case SADB_EXT_KEY_AUTH:
if (!sav->key_auth)
continue;
m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH);
if (!m)
goto fail;
break;
case SADB_EXT_KEY_ENCRYPT:
if (!sav->key_enc)
continue;
m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_CURRENT:
if (!sav->lft_c)
continue;
m = key_setlifetime(sav->lft_c,
SADB_EXT_LIFETIME_CURRENT);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_HARD:
if (!sav->lft_h)
continue;
m = key_setlifetime(sav->lft_h,
SADB_EXT_LIFETIME_HARD);
if (!m)
goto fail;
break;
case SADB_EXT_LIFETIME_SOFT:
if (!sav->lft_s)
continue;
m = key_setlifetime(sav->lft_s,
SADB_EXT_LIFETIME_SOFT);
if (!m)
goto fail;
break;
case SADB_EXT_ADDRESS_PROXY:
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
/* XXX: should this be brought in from the SPD? */
case SADB_EXT_SENSITIVITY:
default:
continue;
}
if (!m)
goto fail;
if (tres)
m_cat(m, tres);
tres = m;
}
m_cat(result, tres);
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL)
goto fail;
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return result;
fail:
m_freem(result);
m_freem(tres);
return NULL;
}
/*
* set data into sadb_msg.
*/
static struct mbuf *
key_setsadbmsg(type, tlen, satype, seq, pid, reserved)
u_int8_t type, satype;
u_int16_t tlen;
u_int32_t seq;
pid_t pid;
u_int16_t reserved;
{
struct mbuf *m;
struct sadb_msg *p;
int len;
len = PFKEY_ALIGN8(sizeof(struct sadb_msg));
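/*
* PF_KEY carries lengths in 64-bit units: PFKEY_ALIGN8() rounds a
* byte count up to an 8-byte boundary and PFKEY_UNIT64() converts
* bytes into those 8-byte units.
*/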
if (len > MCLBYTES)
return NULL;
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && len > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (!m)
return NULL;
m->m_pkthdr.len = m->m_len = len;
m->m_next = NULL;
p = mtod(m, struct sadb_msg *);
bzero(p, len);
p->sadb_msg_version = PF_KEY_V2;
p->sadb_msg_type = type;
p->sadb_msg_errno = 0;
p->sadb_msg_satype = satype;
p->sadb_msg_len = PFKEY_UNIT64(tlen);
p->sadb_msg_reserved = reserved;
p->sadb_msg_seq = seq;
p->sadb_msg_pid = (u_int32_t)pid;
return m;
}
/*
* copy secasvar data into sadb_sa.
*/
static struct mbuf *
key_setsadbsa(sav)
struct secasvar *sav;
{
struct mbuf *m;
struct sadb_sa *p;
int len;
len = PFKEY_ALIGN8(sizeof(struct sadb_sa));
m = key_alloc_mbuf(len);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
return NULL;
}
p = mtod(m, struct sadb_sa *);
bzero(p, len);
p->sadb_sa_len = PFKEY_UNIT64(len);
p->sadb_sa_exttype = SADB_EXT_SA;
p->sadb_sa_spi = sav->spi;
p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0);
p->sadb_sa_state = sav->state;
p->sadb_sa_auth = sav->alg_auth;
p->sadb_sa_encrypt = sav->alg_enc;
p->sadb_sa_flags = sav->flags;
return m;
}
/*
* set data into sadb_address.
*/
static struct mbuf *
key_setsadbaddr(exttype, saddr, prefixlen, ul_proto)
u_int16_t exttype;
const struct sockaddr *saddr;
u_int8_t prefixlen;
u_int16_t ul_proto;
{
struct mbuf *m;
struct sadb_address *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_address)) +
PFKEY_ALIGN8(saddr->sa_len);
m = key_alloc_mbuf(len);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
return NULL;
}
p = mtod(m, struct sadb_address *);
bzero(p, len);
p->sadb_address_len = PFKEY_UNIT64(len);
p->sadb_address_exttype = exttype;
p->sadb_address_proto = ul_proto;
if (prefixlen == FULLMASK) {
switch (saddr->sa_family) {
case AF_INET:
prefixlen = sizeof(struct in_addr) << 3;
break;
case AF_INET6:
prefixlen = sizeof(struct in6_addr) << 3;
break;
default:
; /*XXX*/
}
}
p->sadb_address_prefixlen = prefixlen;
p->sadb_address_reserved = 0;
bcopy(saddr,
mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)),
saddr->sa_len);
return m;
}
/*
* set data into sadb_x_sa2.
*/
static struct mbuf *
key_setsadbxsa2(mode, seq, reqid)
u_int8_t mode;
u_int32_t seq, reqid;
{
struct mbuf *m;
struct sadb_x_sa2 *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2));
m = key_alloc_mbuf(len);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
return NULL;
}
p = mtod(m, struct sadb_x_sa2 *);
bzero(p, len);
p->sadb_x_sa2_len = PFKEY_UNIT64(len);
p->sadb_x_sa2_exttype = SADB_X_EXT_SA2;
p->sadb_x_sa2_mode = mode;
p->sadb_x_sa2_reserved1 = 0;
p->sadb_x_sa2_reserved2 = 0;
p->sadb_x_sa2_sequence = seq;
p->sadb_x_sa2_reqid = reqid;
return m;
}
/*
* set data into sadb_x_policy
*/
static struct mbuf *
key_setsadbxpolicy(type, dir, id)
u_int16_t type;
u_int8_t dir;
u_int32_t id;
{
struct mbuf *m;
struct sadb_x_policy *p;
size_t len;
len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy));
m = key_alloc_mbuf(len);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
return NULL;
}
p = mtod(m, struct sadb_x_policy *);
bzero(p, len);
p->sadb_x_policy_len = PFKEY_UNIT64(len);
p->sadb_x_policy_exttype = SADB_X_EXT_POLICY;
p->sadb_x_policy_type = type;
p->sadb_x_policy_dir = dir;
p->sadb_x_policy_id = id;
return m;
}
/* %%% utilities */
/* Take a key message (sadb_key) from the socket and turn it into one
* of the kernel's key structures (seckey).
*
* IN: pointer to the src
* OUT: NULL if no more memory
*/
struct seckey *
key_dup_keymsg(const struct sadb_key *src, u_int len,
struct malloc_type *type)
{
+ INIT_VNET_IPSEC(curvnet);
struct seckey *dst;
dst = (struct seckey *)malloc(sizeof(struct seckey), type, M_NOWAIT);
if (dst != NULL) {
dst->bits = src->sadb_key_bits;
dst->key_data = (char *)malloc(len, type, M_NOWAIT);
if (dst->key_data != NULL) {
bcopy((const char *)src + sizeof(struct sadb_key),
dst->key_data, len);
} else {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
free(dst, type);
dst = NULL;
}
} else {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",
__func__));
}
return dst;
}
/* Take a lifetime message (sadb_lifetime) passed in on a socket and
* turn it into one of the kernel's lifetime structures (seclifetime).
*
* IN: pointer to the destination, source and malloc type
* OUT: NULL, no more memory
*/
static struct seclifetime *
key_dup_lifemsg(const struct sadb_lifetime *src,
struct malloc_type *type)
{
+ INIT_VNET_IPSEC(curvnet);
struct seclifetime *dst = NULL;
dst = (struct seclifetime *)malloc(sizeof(struct seclifetime),
type, M_NOWAIT);
if (dst == NULL) {
/* XXX counter */
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
} else {
dst->allocations = src->sadb_lifetime_allocations;
dst->bytes = src->sadb_lifetime_bytes;
dst->addtime = src->sadb_lifetime_addtime;
dst->usetime = src->sadb_lifetime_usetime;
}
return dst;
}
/* compare my own address
* OUT: 1: true, i.e. my address.
* 0: false
*/
int
key_ismyaddr(sa)
struct sockaddr *sa;
{
#ifdef INET
+ INIT_VNET_INET(curvnet);
struct sockaddr_in *sin;
struct in_ifaddr *ia;
#endif
IPSEC_ASSERT(sa != NULL, ("null sockaddr"));
switch (sa->sa_family) {
#ifdef INET
case AF_INET:
sin = (struct sockaddr_in *)sa;
for (ia = V_in_ifaddrhead.tqh_first; ia;
ia = ia->ia_link.tqe_next)
{
if (sin->sin_family == ia->ia_addr.sin_family &&
sin->sin_len == ia->ia_addr.sin_len &&
sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
{
return 1;
}
}
break;
#endif
#ifdef INET6
case AF_INET6:
return key_ismyaddr6((struct sockaddr_in6 *)sa);
#endif
}
return 0;
}
#ifdef INET6
/*
* compare my own address for IPv6.
* 1: ours
* 0: other
* NOTE: derived from ip6_input() in KAME. This still needs further work.
*/
#include <netinet6/in6_var.h>
static int
key_ismyaddr6(sin6)
struct sockaddr_in6 *sin6;
{
+ INIT_VNET_INET6(curvnet);
struct in6_ifaddr *ia;
struct in6_multi *in6m;
for (ia = V_in6_ifaddr; ia; ia = ia->ia_next) {
if (key_sockaddrcmp((struct sockaddr *)sin6,
(struct sockaddr *)&ia->ia_addr, 0) == 0)
return 1;
/*
* XXX Multicast
* XXX why do we care about multicast here while we don't care
* about IPv4 multicast??
* XXX scope
*/
in6m = NULL;
IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m);
if (in6m)
return 1;
}
/* loopback, just for safety */
if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))
return 1;
return 0;
}
#endif /*INET6*/
/*
* compare two secasindex structures.
* flag selects how strict the comparison is:
* CMP_EXACTLY : mode, reqid and both addresses must match exactly.
* CMP_MODE_REQID : also match mode (unless ANY) and a non-zero reqid.
* CMP_REQID : also match a non-zero reqid.
* CMP_HEAD : match protocol and addresses only.
* the non-exact comparisons ignore ports.
* IN:
* saidx0: source, it can be in SAD.
* saidx1: object.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpsaidx(
const struct secasindex *saidx0,
const struct secasindex *saidx1,
int flag)
{
/* sanity */
if (saidx0 == NULL && saidx1 == NULL)
return 1;
if (saidx0 == NULL || saidx1 == NULL)
return 0;
if (saidx0->proto != saidx1->proto)
return 0;
if (flag == CMP_EXACTLY) {
if (saidx0->mode != saidx1->mode)
return 0;
if (saidx0->reqid != saidx1->reqid)
return 0;
if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.sa.sa_len) != 0 ||
bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.sa.sa_len) != 0)
return 0;
} else {
/* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */
if (flag == CMP_MODE_REQID
||flag == CMP_REQID) {
/*
* If reqid of SPD is non-zero, unique SA is required.
* The result must be of same reqid in this case.
*/
if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid)
return 0;
}
if (flag == CMP_MODE_REQID) {
if (saidx0->mode != IPSEC_MODE_ANY
&& saidx0->mode != saidx1->mode)
return 0;
}
if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, 0) != 0) {
return 0;
}
if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, 0) != 0) {
return 0;
}
}
return 1;
}
/*
* compare two secindex structure exactly.
* IN:
* spidx0: source, it is often in SPD.
* spidx1: object, it is often from PFKEY message.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpspidx_exactly(
struct secpolicyindex *spidx0,
struct secpolicyindex *spidx1)
{
/* sanity */
if (spidx0 == NULL && spidx1 == NULL)
return 1;
if (spidx0 == NULL || spidx1 == NULL)
return 0;
if (spidx0->prefs != spidx1->prefs
|| spidx0->prefd != spidx1->prefd
|| spidx0->ul_proto != spidx1->ul_proto)
return 0;
return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 &&
key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0;
}
/*
* compare two secindex structure with mask.
* IN:
* spidx0: source, it is often in SPD.
* spidx1: object, it is often from IP header.
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_cmpspidx_withmask(
struct secpolicyindex *spidx0,
struct secpolicyindex *spidx1)
{
/* sanity */
if (spidx0 == NULL && spidx1 == NULL)
return 1;
if (spidx0 == NULL || spidx1 == NULL)
return 0;
if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family ||
spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family ||
spidx0->src.sa.sa_len != spidx1->src.sa.sa_len ||
spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len)
return 0;
/* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */
if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY
&& spidx0->ul_proto != spidx1->ul_proto)
return 0;
switch (spidx0->src.sa.sa_family) {
case AF_INET:
if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY
&& spidx0->src.sin.sin_port != spidx1->src.sin.sin_port)
return 0;
if (!key_bbcmp(&spidx0->src.sin.sin_addr,
&spidx1->src.sin.sin_addr, spidx0->prefs))
return 0;
break;
case AF_INET6:
if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY
&& spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port)
return 0;
/*
* scope_id check. if sin6_scope_id is 0, we regard it
* as a wildcard scope, which matches any scope zone ID.
*/
if (spidx0->src.sin6.sin6_scope_id &&
spidx1->src.sin6.sin6_scope_id &&
spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id)
return 0;
if (!key_bbcmp(&spidx0->src.sin6.sin6_addr,
&spidx1->src.sin6.sin6_addr, spidx0->prefs))
return 0;
break;
default:
/* XXX */
if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0)
return 0;
break;
}
switch (spidx0->dst.sa.sa_family) {
case AF_INET:
if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY
&& spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port)
return 0;
if (!key_bbcmp(&spidx0->dst.sin.sin_addr,
&spidx1->dst.sin.sin_addr, spidx0->prefd))
return 0;
break;
case AF_INET6:
if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY
&& spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port)
return 0;
/*
* scope_id check. if sin6_scope_id is 0, we regard it
* as a wildcard scope, which matches any scope zone ID.
*/
if (spidx0->dst.sin6.sin6_scope_id &&
spidx1->dst.sin6.sin6_scope_id &&
spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id)
return 0;
if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr,
&spidx1->dst.sin6.sin6_addr, spidx0->prefd))
return 0;
break;
default:
/* XXX */
if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0)
return 0;
break;
}
/* XXX Should we check other fields, e.g. flowinfo? */
return 1;
}
/* returns 0 on match */
static int
key_sockaddrcmp(
const struct sockaddr *sa1,
const struct sockaddr *sa2,
int port)
{
#ifdef satosin
#undef satosin
#endif
#define satosin(s) ((const struct sockaddr_in *)s)
#ifdef satosin6
#undef satosin6
#endif
#define satosin6(s) ((const struct sockaddr_in6 *)s)
if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len)
return 1;
switch (sa1->sa_family) {
case AF_INET:
if (sa1->sa_len != sizeof(struct sockaddr_in))
return 1;
if (satosin(sa1)->sin_addr.s_addr !=
satosin(sa2)->sin_addr.s_addr) {
return 1;
}
if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port)
return 1;
break;
case AF_INET6:
if (sa1->sa_len != sizeof(struct sockaddr_in6))
return 1; /*EINVAL*/
if (satosin6(sa1)->sin6_scope_id !=
satosin6(sa2)->sin6_scope_id) {
return 1;
}
if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr,
&satosin6(sa2)->sin6_addr)) {
return 1;
}
if (port &&
satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) {
return 1;
}
break;
default:
if (bcmp(sa1, sa2, sa1->sa_len) != 0)
return 1;
break;
}
return 0;
#undef satosin
#undef satosin6
}
/*
* compare two buffers with mask.
* IN:
* addr1: source
* addr2: object
* bits: Number of bits to compare
* OUT:
* 1 : equal
* 0 : not equal
*/
static int
key_bbcmp(const void *a1, const void *a2, u_int bits)
{
const unsigned char *p1 = a1;
const unsigned char *p2 = a2;
/* XXX: This could be considerably faster if we compare a word
* at a time, but that is complicated on little-endian machines */
/* Handle null pointers */
if (p1 == NULL || p2 == NULL)
return (p1 == p2);
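/*
* Example: bits = 20 compares two whole bytes in the loop, then
* the top four bits of the third byte with mask
* ~((1 << 4) - 1) == 0xf0.
*/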
while (bits >= 8) {
if (*p1++ != *p2++)
return 0;
bits -= 8;
}
if (bits > 0) {
u_int8_t mask = ~((1<<(8-bits))-1);
if ((*p1 & mask) != (*p2 & mask))
return 0;
}
return 1; /* Match! */
}
static void
key_flush_spd(time_t now)
{
+ INIT_VNET_IPSEC(curvnet);
static u_int16_t sptree_scangen = 0;
u_int16_t gen = sptree_scangen++;
struct secpolicy *sp;
u_int dir;
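/*
* Each scan gets a fresh generation number; entries already
* stamped with it are skipped, so the scan can drop the lock,
* free an entry and restart without revisiting the rest.
*/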
/* SPD */
for (dir = 0; dir < IPSEC_DIR_MAX; dir++) {
restart:
SPTREE_LOCK();
LIST_FOREACH(sp, &V_sptree[dir], chain) {
if (sp->scangen == gen) /* previously handled */
continue;
sp->scangen = gen;
if (sp->state == IPSEC_SPSTATE_DEAD) {
/* NB: clean entries created by key_spdflush */
SPTREE_UNLOCK();
KEY_FREESP(&sp);
goto restart;
}
if (sp->lifetime == 0 && sp->validtime == 0)
continue;
if ((sp->lifetime && now - sp->created > sp->lifetime)
|| (sp->validtime && now - sp->lastused > sp->validtime)) {
sp->state = IPSEC_SPSTATE_DEAD;
SPTREE_UNLOCK();
key_spdexpire(sp);
KEY_FREESP(&sp);
goto restart;
}
}
SPTREE_UNLOCK();
}
}
static void
key_flush_sad(time_t now)
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah, *nextsah;
struct secasvar *sav, *nextsav;
/* SAD */
SAHTREE_LOCK();
LIST_FOREACH_SAFE(sah, &V_sahtree, chain, nextsah) {
/* if sah has been dead, then delete it and process next sah. */
if (sah->state == SADB_SASTATE_DEAD) {
key_delsah(sah);
continue;
}
/* if LARVAL entry doesn't become MATURE, delete it. */
LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_LARVAL], chain, nextsav) {
if (now - sav->created > V_key_larval_lifetime)
KEY_FREESAV(&sav);
}
/*
* check whether each MATURE entry should start
* sending expire messages.
*/
LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_MATURE], chain, nextsav) {
/* we don't need to check. */
if (sav->lft_s == NULL)
continue;
/* sanity check */
if (sav->lft_c == NULL) {
ipseclog((LOG_DEBUG,"%s: there is no CURRENT "
"time, why?\n", __func__));
continue;
}
/* check SOFT lifetime */
if (sav->lft_s->addtime != 0 &&
now - sav->created > sav->lft_s->addtime) {
/*
* check whether the SA has ever been used;
* if it has not, just delete it.
*/
if (sav->lft_c->usetime == 0) {
key_sa_chgstate(sav, SADB_SASTATE_DEAD);
KEY_FREESAV(&sav);
} else {
key_sa_chgstate(sav, SADB_SASTATE_DYING);
/*
* XXX If we ever keep sending expire
* messages while in the DYING state,
* remove the code below.
*/
key_expire(sav);
}
}
/* check SOFT lifetime by bytes */
/*
* XXX There is no way to delete this SA once a new
* SA is installed. Beware of entries installed with
* an overly large time-based lifetime.
*/
else if (sav->lft_s->bytes != 0 &&
sav->lft_s->bytes < sav->lft_c->bytes) {
key_sa_chgstate(sav, SADB_SASTATE_DYING);
/*
* XXX If we ever keep sending expire
* messages while in the DYING state,
* remove the code below.
*/
key_expire(sav);
}
}
/* check DYING entry to change status to DEAD. */
LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DYING], chain, nextsav) {
/* we don't need to check. */
if (sav->lft_h == NULL)
continue;
/* sanity check */
if (sav->lft_c == NULL) {
ipseclog((LOG_DEBUG, "%s: there is no CURRENT "
"time, why?\n", __func__));
continue;
}
if (sav->lft_h->addtime != 0 &&
now - sav->created > sav->lft_h->addtime) {
key_sa_chgstate(sav, SADB_SASTATE_DEAD);
KEY_FREESAV(&sav);
}
#if 0 /* XXX Should we keep sending expire messages until the HARD lifetime? */
else if (sav->lft_s != NULL
&& sav->lft_s->addtime != 0
&& now - sav->created > sav->lft_s->addtime) {
/*
* XXX: should check that a valid SA
* has actually been installed.
*/
/*
* If there is no SA, send an
* expire message.
*/
key_expire(sav);
}
#endif
/* check HARD lifetime by bytes */
else if (sav->lft_h->bytes != 0 &&
sav->lft_h->bytes < sav->lft_c->bytes) {
key_sa_chgstate(sav, SADB_SASTATE_DEAD);
KEY_FREESAV(&sav);
}
}
/* delete entry in DEAD */
LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DEAD], chain, nextsav) {
/* sanity check */
if (sav->state != SADB_SASTATE_DEAD) {
ipseclog((LOG_DEBUG, "%s: invalid sav->state "
"(queue: %d SA: %d): kill it anyway\n",
__func__,
SADB_SASTATE_DEAD, sav->state));
}
/*
* do not call key_freesav() here.
* sav should already be freed, and sav->refcnt
* shows other references to sav
* (such as from SPD).
*/
}
}
SAHTREE_UNLOCK();
}
static void
key_flush_acq(time_t now)
{
+ INIT_VNET_IPSEC(curvnet);
struct secacq *acq, *nextacq;
/* ACQ tree */
ACQ_LOCK();
for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) {
nextacq = LIST_NEXT(acq, chain);
if (now - acq->created > V_key_blockacq_lifetime
&& __LIST_CHAINED(acq)) {
LIST_REMOVE(acq, chain);
free(acq, M_IPSEC_SAQ);
}
}
ACQ_UNLOCK();
}
static void
key_flush_spacq(time_t now)
{
+ INIT_VNET_IPSEC(curvnet);
struct secspacq *acq, *nextacq;
/* SP ACQ tree */
SPACQ_LOCK();
for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) {
nextacq = LIST_NEXT(acq, chain);
if (now - acq->created > V_key_blockacq_lifetime
&& __LIST_CHAINED(acq)) {
LIST_REMOVE(acq, chain);
free(acq, M_IPSEC_SAQ);
}
}
SPACQ_UNLOCK();
}
/*
* time handler.
* scan the SPD and SAD, check the status of each entry,
* and remove or expire entries as appropriate.
* XXX: year 2038 problem may remain.
*/
void
key_timehandler(void)
{
+ VNET_ITERATOR_DECL(vnet_iter);
time_t now = time_second;
- key_flush_spd(now);
- key_flush_sad(now);
- key_flush_acq(now);
- key_flush_spacq(now);
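+ /*
+ * With VIMAGE, each virtual network stack keeps its own SPD, SAD
+ * and ACQ lists, so the periodic scan must visit every vnet.
+ */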
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ key_flush_spd(now);
+ key_flush_sad(now);
+ key_flush_acq(now);
+ key_flush_spacq(now);
+ CURVNET_RESTORE();
+ }
#ifndef IPSEC_DEBUG2
/* reschedule to run again after hz ticks (one second) */
(void)timeout((void *)key_timehandler, (void *)0, hz);
#endif /* IPSEC_DEBUG2 */
}
u_long
key_random()
{
u_long value;
key_randomfill(&value, sizeof(value));
return value;
}
void
key_randomfill(p, l)
void *p;
size_t l;
{
size_t n;
u_long v;
static int warn = 1;
n = 0;
n = (size_t)read_random(p, (u_int)l);
/* last resort */
while (n < l) {
v = random();
bcopy(&v, (u_int8_t *)p + n,
l - n < sizeof(v) ? l - n : sizeof(v));
n += sizeof(v);
if (warn) {
printf("WARNING: pseudo-random number generator "
"used for IPsec processing\n");
warn = 0;
}
}
}
/*
* map SADB_SATYPE_* to IPPROTO_*.
* SADB_SATYPE_UNSPEC is mapped to IPSEC_PROTO_ANY.
* OUT:
* 0: invalid satype.
*/
static u_int16_t
key_satype2proto(satype)
u_int8_t satype;
{
switch (satype) {
case SADB_SATYPE_UNSPEC:
return IPSEC_PROTO_ANY;
case SADB_SATYPE_AH:
return IPPROTO_AH;
case SADB_SATYPE_ESP:
return IPPROTO_ESP;
case SADB_X_SATYPE_IPCOMP:
return IPPROTO_IPCOMP;
case SADB_X_SATYPE_TCPSIGNATURE:
return IPPROTO_TCP;
default:
return 0;
}
/* NOTREACHED */
}
/*
* map IPPROTO_* to SADB_SATYPE_*
* OUT:
* 0: invalid protocol type.
*/
static u_int8_t
key_proto2satype(proto)
u_int16_t proto;
{
switch (proto) {
case IPPROTO_AH:
return SADB_SATYPE_AH;
case IPPROTO_ESP:
return SADB_SATYPE_ESP;
case IPPROTO_IPCOMP:
return SADB_X_SATYPE_IPCOMP;
case IPPROTO_TCP:
return SADB_X_SATYPE_TCPSIGNATURE;
default:
return 0;
}
/* NOTREACHED */
}
/* %%% PF_KEY */
/*
* SADB_GETSPI processing is to receive
* <base, (SA2), src address, dst address, (SPI range)>
* from the IKMPd, to assign a unique spi value, to hang on the INBOUND
* tree with the status of LARVAL, and send
* <base, SA(*), address(SD)>
* to the IKMPd.
*
* IN: mhp: pointer to the pointer to each header.
* OUT: 0 if succeed.
* others: error number.
*/
static int
key_getspi(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *newsah;
struct secasvar *newsav;
u_int8_t proto;
u_int32_t spi;
u_int8_t mode;
u_int32_t reqid;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
} else {
mode = IPSEC_MODE_ANY;
reqid = 0;
}
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/* force the port numbers to zero. */
switch (((struct sockaddr *)(src0 + 1))->sa_family) {
case AF_INET:
if (((struct sockaddr *)(src0 + 1))->sa_len !=
sizeof(struct sockaddr_in))
return key_senderror(so, m, EINVAL);
((struct sockaddr_in *)(src0 + 1))->sin_port = 0;
break;
case AF_INET6:
if (((struct sockaddr *)(src0 + 1))->sa_len !=
sizeof(struct sockaddr_in6))
return key_senderror(so, m, EINVAL);
((struct sockaddr_in6 *)(src0 + 1))->sin6_port = 0;
break;
default:
; /*???*/
}
switch (((struct sockaddr *)(dst0 + 1))->sa_family) {
case AF_INET:
if (((struct sockaddr *)(dst0 + 1))->sa_len !=
sizeof(struct sockaddr_in))
return key_senderror(so, m, EINVAL);
((struct sockaddr_in *)(dst0 + 1))->sin_port = 0;
break;
case AF_INET6:
if (((struct sockaddr *)(dst0 + 1))->sa_len !=
sizeof(struct sockaddr_in6))
return key_senderror(so, m, EINVAL);
((struct sockaddr_in6 *)(dst0 + 1))->sin6_port = 0;
break;
default:
; /*???*/
}
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* SPI allocation */
spi = key_do_getnewspi((struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE],
&saidx);
if (spi == 0)
return key_senderror(so, m, EINVAL);
/* get a SA index */
if ((newsah = key_getsah(&saidx)) == NULL) {
/* create a new SA index */
if ((newsah = key_newsah(&saidx)) == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
return key_senderror(so, m, ENOBUFS);
}
}
/* get a new SA */
/* XXX rewrite */
newsav = KEY_NEWSAV(m, mhp, newsah, &error);
if (newsav == NULL) {
/* XXX don't free new SA index allocated in above. */
return key_senderror(so, m, error);
}
/* set spi */
newsav->spi = htonl(spi);
/* delete the entry in acqtree */
if (mhp->msg->sadb_msg_seq != 0) {
struct secacq *acq;
if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) != NULL) {
/* reset the counter so the timehandler will delete it. */
acq->created = time_second;
acq->count = 0;
}
}
{
struct mbuf *n, *nn;
struct sadb_sa *m_sa;
struct sadb_msg *newmsg;
int off, len;
/* create new sadb_msg to reply. */
len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) +
PFKEY_ALIGN8(sizeof(struct sadb_sa));
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n && len > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off);
m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa));
m_sa->sadb_sa_exttype = SADB_EXT_SA;
m_sa->sadb_sa_spi = htonl(spi);
off += PFKEY_ALIGN8(sizeof(struct sadb_sa));
IPSEC_ASSERT(off == len,
("length inconsistency (off %u len %u)", off, len));
n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC,
SADB_EXT_ADDRESS_DST);
if (!n->m_next) {
m_freem(n);
return key_senderror(so, m, ENOBUFS);
}
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
}
n->m_pkthdr.len = 0;
for (nn = n; nn; nn = nn->m_next)
n->m_pkthdr.len += nn->m_len;
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_seq = newsav->seq;
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
/*
* allocating new SPI
* called by key_getspi().
* OUT:
* 0: failure.
* others: success.
*/
static u_int32_t
key_do_getnewspi(spirange, saidx)
struct sadb_spirange *spirange;
struct secasindex *saidx;
{
+ INIT_VNET_IPSEC(curvnet);
u_int32_t newspi;
u_int32_t min, max;
int count = V_key_spi_trycnt;
/* set spi range to allocate */
if (spirange != NULL) {
min = spirange->sadb_spirange_min;
max = spirange->sadb_spirange_max;
} else {
min = V_key_spi_minval;
max = V_key_spi_maxval;
}
/* IPCOMP needs 2-byte SPI */
if (saidx->proto == IPPROTO_IPCOMP) {
u_int32_t t;
if (min >= 0x10000)
min = 0xffff;
if (max >= 0x10000)
max = 0xffff;
if (min > max) {
t = min; min = max; max = t;
}
}
if (min == max) {
if (key_checkspidup(saidx, min) != NULL) {
ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n",
__func__, min));
return 0;
}
count--; /* taking one cost. */
newspi = min;
} else {
/* init SPI */
newspi = 0;
/* an SPI range was requested; probe for an unused value */
while (count--) {
/* generate a pseudo-random SPI value within the range. */
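/*
* key_random() % (max - min + 1) maps the random value into
* [min, max]; the slight modulo bias is harmless here.
*/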
newspi = min + (key_random() % (max - min + 1));
if (key_checkspidup(saidx, newspi) == NULL)
break;
}
if (count == 0 || newspi == 0) {
ipseclog((LOG_DEBUG, "%s: to allocate spi is failed.\n",
__func__));
return 0;
}
}
/* statistics */
keystat.getspi_count =
(keystat.getspi_count + V_key_spi_trycnt - count) / 2;
return newspi;
}
/*
* SADB_UPDATE processing
* receive
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* key(AE), (identity(SD),) (sensitivity)>
* from the ikmpd, and update a secasvar entry whose status is SADB_SASTATE_LARVAL,
* and send
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_update(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_sa *sa0;
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
struct secasvar *sav;
u_int16_t proto;
u_int8_t mode;
u_int32_t reqid;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_SA] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
mhp->ext[SADB_EXT_KEY_AUTH] == NULL) ||
(mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL &&
mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) ||
(mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL &&
mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
} else {
mode = IPSEC_MODE_ANY;
reqid = 0;
}
/* XXX boundary checking for other extensions */
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* get a SA header */
if ((sah = key_getsah(&saidx)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA index found.\n", __func__));
return key_senderror(so, m, ENOENT);
}
/* set spidx if there */
/* XXX rewrite */
error = key_setident(sah, m, mhp);
if (error)
return key_senderror(so, m, error);
/* find a SA with sequence number. */
#ifdef IPSEC_DOSEQCHECK
if (mhp->msg->sadb_msg_seq != 0
&& (sav = key_getsavbyseq(sah, mhp->msg->sadb_msg_seq)) == NULL) {
ipseclog((LOG_DEBUG, "%s: no larval SA with sequence %u "
"exists.\n", __func__, mhp->msg->sadb_msg_seq));
return key_senderror(so, m, ENOENT);
}
#else
SAHTREE_LOCK();
sav = key_getsavbyspi(sah, sa0->sadb_sa_spi);
SAHTREE_UNLOCK();
if (sav == NULL) {
ipseclog((LOG_DEBUG, "%s: no such a SA found (spi:%u)\n",
__func__, (u_int32_t)ntohl(sa0->sadb_sa_spi)));
return key_senderror(so, m, EINVAL);
}
#endif
/* validity check */
if (sav->sah->saidx.proto != proto) {
ipseclog((LOG_DEBUG, "%s: protocol mismatched "
"(DB=%u param=%u)\n", __func__,
sav->sah->saidx.proto, proto));
return key_senderror(so, m, EINVAL);
}
#ifdef IPSEC_DOSEQCHECK
if (sav->spi != sa0->sadb_sa_spi) {
ipseclog((LOG_DEBUG, "%s: SPI mismatched (DB:%u param:%u)\n",
__func__,
(u_int32_t)ntohl(sav->spi),
(u_int32_t)ntohl(sa0->sadb_sa_spi)));
return key_senderror(so, m, EINVAL);
}
#endif
if (sav->pid != mhp->msg->sadb_msg_pid) {
ipseclog((LOG_DEBUG, "%s: pid mismatched (DB:%u param:%u)\n",
__func__, sav->pid, mhp->msg->sadb_msg_pid));
return key_senderror(so, m, EINVAL);
}
/* copy sav values */
error = key_setsaval(sav, m, mhp);
if (error) {
KEY_FREESAV(&sav);
return key_senderror(so, m, error);
}
/* check SA values to be mature. */
if ((mhp->msg->sadb_msg_errno = key_mature(sav)) != 0) {
KEY_FREESAV(&sav);
return key_senderror(so, m, 0);
}
{
struct mbuf *n;
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* search SAD with sequence for a SA which state is SADB_SASTATE_LARVAL.
* only called by key_update().
* OUT:
* NULL : not found
* others : found, pointer to a SA.
*/
#ifdef IPSEC_DOSEQCHECK
static struct secasvar *
key_getsavbyseq(sah, seq)
struct secashead *sah;
u_int32_t seq;
{
struct secasvar *sav;
u_int state;
state = SADB_SASTATE_LARVAL;
/* search the SAD by sequence number */
LIST_FOREACH(sav, &sah->savtree[state], chain) {
KEY_CHKSASTATE(state, sav->state, __func__);
if (sav->seq == seq) {
sa_addref(sav);
KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
printf("DP %s cause refcnt++:%d SA:%p\n",
__func__, sav->refcnt, sav));
return sav;
}
}
return NULL;
}
#endif
/*
* SADB_ADD processing
* add an entry to SA database, when received
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* key(AE), (identity(SD),) (sensitivity)>
* from the ikmpd,
* and send
* <base, SA, (SA2), (lifetime(HSC),) address(SD), (address(P),)
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* IGNORE identity and sensitivity messages.
*
* m will always be freed.
*/
static int
key_add(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_sa *sa0;
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *newsah;
struct secasvar *newsav;
u_int16_t proto;
u_int8_t mode;
u_int32_t reqid;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_SA] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP &&
mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) ||
(mhp->msg->sadb_msg_satype == SADB_SATYPE_AH &&
mhp->ext[SADB_EXT_KEY_AUTH] == NULL) ||
(mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL &&
mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) ||
(mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL &&
mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
/* XXX need more */
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_X_EXT_SA2] != NULL) {
mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode;
reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid;
} else {
mode = IPSEC_MODE_ANY;
reqid = 0;
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx);
/* get a SA header */
if ((newsah = key_getsah(&saidx)) == NULL) {
/* create a new SA header */
if ((newsah = key_newsah(&saidx)) == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__));
return key_senderror(so, m, ENOBUFS);
}
}
/* set spidx if there */
/* XXX rewrite */
error = key_setident(newsah, m, mhp);
if (error) {
return key_senderror(so, m, error);
}
/* create new SA entry. */
/* We can create a new SA only if the SPI is different. */
SAHTREE_LOCK();
newsav = key_getsavbyspi(newsah, sa0->sadb_sa_spi);
SAHTREE_UNLOCK();
if (newsav != NULL) {
ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__));
return key_senderror(so, m, EEXIST);
}
newsav = KEY_NEWSAV(m, mhp, newsah, &error);
if (newsav == NULL) {
return key_senderror(so, m, error);
}
/* check SA values to be mature. */
if ((error = key_mature(newsav)) != 0) {
KEY_FREESAV(&newsav);
return key_senderror(so, m, error);
}
/*
* don't call key_freesav() here, as we would like to keep the SA
* in the database on success.
*/
{
struct mbuf *n;
/* set msg buf from mhp */
n = key_getmsgbuf_x1(m, mhp);
if (n == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/* m is retained */
static int
key_setident(sah, m, mhp)
struct secashead *sah;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
const struct sadb_ident *idsrc, *iddst;
int idsrclen, iddstlen;
IPSEC_ASSERT(sah != NULL, ("null secashead"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* don't make buffer if not there */
if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL &&
mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) {
sah->idents = NULL;
sah->identd = NULL;
return 0;
}
if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL ||
mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) {
ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__));
return EINVAL;
}
idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC];
iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST];
idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC];
iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST];
/* validity check */
if (idsrc->sadb_ident_type != iddst->sadb_ident_type) {
ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__));
return EINVAL;
}
switch (idsrc->sadb_ident_type) {
case SADB_IDENTTYPE_PREFIX:
case SADB_IDENTTYPE_FQDN:
case SADB_IDENTTYPE_USERFQDN:
default:
/* XXX do nothing */
sah->idents = NULL;
sah->identd = NULL;
return 0;
}
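/* NB: every case above returns, so the code below is currently unreachable. */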
/* make structure */
sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
if (sah->idents == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT);
if (sah->identd == NULL) {
free(sah->idents, M_IPSEC_MISC);
sah->idents = NULL;
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return ENOBUFS;
}
sah->idents->type = idsrc->sadb_ident_type;
sah->idents->id = idsrc->sadb_ident_id;
sah->identd->type = iddst->sadb_ident_type;
sah->identd->id = iddst->sadb_ident_id;
return 0;
}
/*
* m will not be freed on return.
* it is caller's responsibility to free the result.
*/
static struct mbuf *
key_getmsgbuf_x1(m, mhp)
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
struct mbuf *n;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 9, SADB_EXT_RESERVED,
SADB_EXT_SA, SADB_X_EXT_SA2,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST,
SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT,
SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST);
if (!n)
return NULL;
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return NULL;
}
mtod(n, struct sadb_msg *)->sadb_msg_errno = 0;
mtod(n, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(n->m_pkthdr.len);
return n;
}
static int key_delete_all __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *, u_int16_t));
/*
* SADB_DELETE processing
* receive
* <base, SA(*), address(SD)>
* from the ikmpd, and set SADB_SASTATE_DEAD,
* and send,
* <base, SA(*), address(SD)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_delete(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_sa *sa0;
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
struct secasvar *sav = NULL;
u_int16_t proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_SA] == NULL) {
/*
* Caller wants us to delete all non-LARVAL SAs
* that match the src/dst. This is used during
* IKE INITIAL-CONTACT.
*/
ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__));
return key_delete_all(so, m, mhp, proto);
} else if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
/* get a SA header */
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (sah->state == SADB_SASTATE_DEAD)
continue;
if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0)
continue;
/* get a SA with SPI. */
sav = key_getsavbyspi(sah, sa0->sadb_sa_spi);
if (sav)
break;
}
if (sah == NULL) {
SAHTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__));
return key_senderror(so, m, ENOENT);
}
key_sa_chgstate(sav, SADB_SASTATE_DEAD);
SAHTREE_UNLOCK();
KEY_FREESAV(&sav);
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED,
SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* delete all SAs for src/dst. Called from key_delete().
*/
static int
key_delete_all(so, m, mhp, proto)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
u_int16_t proto;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
struct secasvar *sav, *nextsav;
u_int stateidx, state;
src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]);
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (sah->state == SADB_SASTATE_DEAD)
continue;
if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0)
continue;
/* Delete all non-LARVAL SAs. */
for (stateidx = 0;
stateidx < _ARRAYLEN(V_saorder_state_alive);
stateidx++) {
state = V_saorder_state_alive[stateidx];
if (state == SADB_SASTATE_LARVAL)
continue;
for (sav = LIST_FIRST(&sah->savtree[state]);
sav != NULL; sav = nextsav) {
nextsav = LIST_NEXT(sav, chain);
/* sanity check */
if (sav->state != state) {
ipseclog((LOG_DEBUG, "%s: invalid "
"sav->state (queue %d SA %d)\n",
__func__, state, sav->state));
continue;
}
key_sa_chgstate(sav, SADB_SASTATE_DEAD);
KEY_FREESAV(&sav);
}
}
}
SAHTREE_UNLOCK();
{
struct mbuf *n;
struct sadb_msg *newmsg;
/* create new sadb_msg to reply. */
n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED,
SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST);
if (!n)
return key_senderror(so, m, ENOBUFS);
if (n->m_len < sizeof(struct sadb_msg)) {
n = m_pullup(n, sizeof(struct sadb_msg));
if (n == NULL)
return key_senderror(so, m, ENOBUFS);
}
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ALL);
}
}
/*
* SADB_GET processing
* receive
* <base, SA(*), address(SD)>
* from the ikmpd, and get a SP and a SA to respond,
* and send,
* <base, SA, (lifetime(HSC),) address(SD), (address(P),) key(AE),
* (identity(SD),) (sensitivity)>
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_get(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_sa *sa0;
struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
struct secasvar *sav = NULL;
u_int16_t proto;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_SA] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) ||
mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) {
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA];
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
/* get a SA header */
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (sah->state == SADB_SASTATE_DEAD)
continue;
if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0)
continue;
/* get a SA with SPI. */
sav = key_getsavbyspi(sah, sa0->sadb_sa_spi);
if (sav)
break;
}
SAHTREE_UNLOCK();
if (sah == NULL) {
ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__));
return key_senderror(so, m, ENOENT);
}
{
struct mbuf *n;
u_int8_t satype;
/* map proto to satype */
if ((satype = key_proto2satype(sah->saidx.proto)) == 0) {
ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/* create new sadb_msg to reply. */
n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq,
mhp->msg->sadb_msg_pid);
if (!n)
return key_senderror(so, m, ENOBUFS);
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
/* XXX make it sysctl-configurable? */
static void
key_getcomb_setlifetime(comb)
struct sadb_comb *comb;
{
comb->sadb_comb_soft_allocations = 1;
comb->sadb_comb_hard_allocations = 1;
comb->sadb_comb_soft_bytes = 0;
comb->sadb_comb_hard_bytes = 0;
comb->sadb_comb_hard_addtime = 86400; /* 1 day */
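/* soft limits are advertised at 80% of the corresponding hard limit */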
comb->sadb_comb_soft_addtime = comb->sadb_comb_hard_addtime * 80 / 100;
comb->sadb_comb_hard_usetime = 28800; /* 8 hours */
comb->sadb_comb_soft_usetime = comb->sadb_comb_hard_usetime * 80 / 100;
}
/*
* XXX reorder combinations by preference
* XXX no idea if the user wants ESP authentication or not
*/
static struct mbuf *
key_getcomb_esp()
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_comb *comb;
struct enc_xform *algo;
struct mbuf *result = NULL, *m, *n;
int encmin;
int i, off, o;
int totlen;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_EALG_MAX; i++) {
algo = esp_algorithm_lookup(i);
if (algo == NULL)
continue;
/* discard algorithms with key size smaller than system min */
if (_BITS(algo->maxkey) < V_ipsec_esp_keymin)
continue;
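/*
* Advertise as the minimum the larger of the algorithm's own
* minimum key size and the system-wide ipsec_esp_keymin setting.
*/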
if (_BITS(algo->minkey) < V_ipsec_esp_keymin)
encmin = V_ipsec_esp_keymin;
else
encmin = _BITS(algo->minkey);
if (V_ipsec_esp_auth)
m = key_getcomb_ah();
else {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_DONTWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
bzero(mtod(m, caddr_t), m->m_len);
}
}
if (!m)
goto fail;
totlen = 0;
for (n = m; n; n = n->m_next)
totlen += n->m_len;
IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l));
for (off = 0; off < totlen; off += l) {
n = m_pulldown(m, off, l, &o);
if (!n) {
/* m is already freed */
goto fail;
}
comb = (struct sadb_comb *)(mtod(n, caddr_t) + o);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_encrypt = i;
comb->sadb_comb_encrypt_minbits = encmin;
comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey);
}
if (!result)
result = m;
else
m_cat(result, m);
}
return result;
fail:
if (result)
m_freem(result);
return NULL;
}
static void
key_getsizes_ah(
const struct auth_hash *ah,
int alg,
u_int16_t* min,
u_int16_t* max)
{
+ INIT_VNET_IPSEC(curvnet);
+
*min = *max = ah->keysize;
if (ah->keysize == 0) {
/*
* Transform takes arbitrary key size but algorithm
* key size is restricted. Enforce this here.
*/
switch (alg) {
case SADB_X_AALG_MD5: *min = *max = 16; break;
case SADB_X_AALG_SHA: *min = *max = 20; break;
case SADB_X_AALG_NULL: *min = 1; *max = 256; break;
default:
DPRINTF(("%s: unknown AH algorithm %u\n",
__func__, alg));
break;
}
}
}
/*
* XXX reorder combinations by preference
*/
static struct mbuf *
key_getcomb_ah()
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_comb *comb;
struct auth_hash *algo;
struct mbuf *m;
u_int16_t minkeysize, maxkeysize;
int i;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_AALG_MAX; i++) {
#if 1
/* we prefer HMAC algorithms, not old algorithms */
if (i != SADB_AALG_SHA1HMAC && i != SADB_AALG_MD5HMAC)
continue;
#endif
algo = ah_algorithm_lookup(i);
if (!algo)
continue;
key_getsizes_ah(algo, i, &minkeysize, &maxkeysize);
/* discard algorithms with key size smaller than system min */
if (_BITS(minkeysize) < V_ipsec_ah_keymin)
continue;
if (!m) {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_DONTWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
}
} else
M_PREPEND(m, l, M_DONTWAIT);
if (!m)
return NULL;
comb = mtod(m, struct sadb_comb *);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_auth = i;
comb->sadb_comb_auth_minbits = _BITS(minkeysize);
comb->sadb_comb_auth_maxbits = _BITS(maxkeysize);
}
return m;
}
/*
* not really an official behavior. discussed in pf_key@inner.net in Sep2000.
* XXX reorder combinations by preference
*/
static struct mbuf *
key_getcomb_ipcomp()
{
struct sadb_comb *comb;
struct comp_algo *algo;
struct mbuf *m;
int i;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb));
m = NULL;
for (i = 1; i <= SADB_X_CALG_MAX; i++) {
algo = ipcomp_algorithm_lookup(i);
if (!algo)
continue;
if (!m) {
IPSEC_ASSERT(l <= MLEN,
("l=%u > MLEN=%lu", l, (u_long) MLEN));
MGET(m, M_DONTWAIT, MT_DATA);
if (m) {
M_ALIGN(m, l);
m->m_len = l;
m->m_next = NULL;
}
} else
M_PREPEND(m, l, M_DONTWAIT);
if (!m)
return NULL;
comb = mtod(m, struct sadb_comb *);
bzero(comb, sizeof(*comb));
key_getcomb_setlifetime(comb);
comb->sadb_comb_encrypt = i;
/* what should we set into sadb_comb_*_{min,max}bits? */
}
return m;
}
/*
* XXX no way to pass mode (transport/tunnel) to userland
* XXX replay checking?
* XXX sysctl interface to ipsec_{ah,esp}_keymin
*/
static struct mbuf *
key_getprop(saidx)
const struct secasindex *saidx;
{
struct sadb_prop *prop;
struct mbuf *m, *n;
const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop));
int totlen;
switch (saidx->proto) {
case IPPROTO_ESP:
m = key_getcomb_esp();
break;
case IPPROTO_AH:
m = key_getcomb_ah();
break;
case IPPROTO_IPCOMP:
m = key_getcomb_ipcomp();
break;
default:
return NULL;
}
if (!m)
return NULL;
M_PREPEND(m, l, M_DONTWAIT);
if (!m)
return NULL;
totlen = 0;
for (n = m; n; n = n->m_next)
totlen += n->m_len;
prop = mtod(m, struct sadb_prop *);
bzero(prop, sizeof(*prop));
prop->sadb_prop_len = PFKEY_UNIT64(totlen);
prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
prop->sadb_prop_replay = 32; /* XXX */
return m;
}
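/*
 * Illustrative sketch (not part of this change): the proposal payload
 * built above is one struct sadb_prop followed by a whole number of
 * struct sadb_comb records, which is what the (totlen % l) assertion in
 * key_getcomb_esp() relies on. Assuming sizeof(struct sadb_comb) is
 * already 8-byte aligned, a consumer can recover the combination count
 * from the extension length alone; the local macro mirrors the kernel's
 * PFKEY_UNUNIT64().
 */
#define MY_PFKEY_UNUNIT64(a)	((size_t)(a) << 3)

static int
proposal_comb_count(const struct sadb_prop *prop)
{
	size_t bytes = MY_PFKEY_UNUNIT64(prop->sadb_prop_len);

	if (bytes < sizeof(struct sadb_prop))
		return (0);
	/* Everything after the header is back-to-back sadb_comb records. */
	return ((int)((bytes - sizeof(struct sadb_prop)) /
	    sizeof(struct sadb_comb)));
}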
/*
* SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2().
* send
* <base, SA, address(SD), (address(P)), x_policy,
* (identity(SD),) (sensitivity,) proposal>
* to KMD, and expect to receive
* <base> with SADB_ACQUIRE if an error occurred,
* or
* <base, src address, dst address, (SPI range)> with SADB_GETSPI
* from KMD by PF_KEY.
*
* XXX x_policy is outside of RFC2367 (KAME extension).
* XXX sensitivity is not supported.
* XXX for ipcomp, RFC2367 does not define how to fill in proposal.
* see comment for key_getcomb_ipcomp().
*
* OUT:
* 0 : success
* others: error number
*/
static int
key_acquire(const struct secasindex *saidx, struct secpolicy *sp)
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *result = NULL, *m;
struct secacq *newacq;
u_int8_t satype;
int error = -1;
u_int32_t seq;
IPSEC_ASSERT(saidx != NULL, ("null saidx"));
satype = key_proto2satype(saidx->proto);
IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto));
/*
* We never do anything to acquire an SA ourselves. An alternative
* design would be for the kernel to block further SADB_ACQUIRE
* messages until it hears back from the IKE daemon; in that case the
* pending requests would be managed on an ACQUIRING list.
*/
/* Get an entry to decide whether or not to send a message. */
if ((newacq = key_getacq(saidx)) != NULL) {
if (V_key_blockacq_count < newacq->count) {
/* reset counter and do send message. */
newacq->count = 0;
} else {
/* increment counter and do nothing. */
newacq->count++;
return 0;
}
} else {
/* Make a new entry to rate-limit further SADB_ACQUIRE messages. */
if ((newacq = key_newacq(saidx)) == NULL)
return ENOBUFS;
}
seq = newacq->seq;
m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* set sadb_address for saidx's. */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&saidx->src.sa, FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&saidx->dst.sa, FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* XXX proxy address (optional) */
/* set sadb_x_policy */
if (sp) {
m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
}
/* XXX identity (optional) */
#if 0
if (idexttype && fqdn) {
/* create identity extension (FQDN) */
struct sadb_ident *id;
int fqdnlen;
fqdnlen = strlen(fqdn) + 1; /* +1 for terminating-NUL */
id = (struct sadb_ident *)p;
bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen));
id->sadb_ident_exttype = idexttype;
id->sadb_ident_type = SADB_IDENTTYPE_FQDN;
bcopy(fqdn, id + 1, fqdnlen);
p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen);
}
if (idexttype) {
/* create identity extension (USERFQDN) */
struct sadb_ident *id;
int userfqdnlen;
if (userfqdn) {
/* +1 for terminating-NUL */
userfqdnlen = strlen(userfqdn) + 1;
} else
userfqdnlen = 0;
id = (struct sadb_ident *)p;
bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen));
id->sadb_ident_exttype = idexttype;
id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN;
/* XXX is it correct? */
if (curproc && curproc->p_cred)
id->sadb_ident_id = curproc->p_cred->p_ruid;
if (userfqdn && userfqdnlen)
bcopy(userfqdn, id + 1, userfqdnlen);
p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen);
}
#endif
/* XXX sensitivity (optional) */
/* create proposal/combination extension */
m = key_getprop(saidx);
#if 0
/*
* spec conformant: always attach proposal/combination extension,
* the problem is that we have no way to attach it for ipcomp,
* due to the way sadb_comb is declared in RFC2367.
*/
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
#else
/*
* outside of spec; make proposal/combination extension optional.
*/
if (m)
m_cat(result, m);
#endif
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
return error;
}
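/*
 * Illustrative userland sketch (not part of this change; pfkey_fd and
 * nack_acquire are hypothetical names): per the comment above, a key
 * daemon that cannot negotiate an SA reports failure by echoing a bare
 * sadb_msg whose length is exactly sizeof(struct sadb_msg) and whose
 * sequence number matches the request; key_acquire2() then looks the
 * entry up via key_getacqbyseq().
 */
#include <sys/types.h>
#include <net/pfkeyv2.h>
#include <string.h>
#include <unistd.h>

static void
nack_acquire(int pfkey_fd, const struct sadb_msg *req, int err)
{
	struct sadb_msg m;

	memset(&m, 0, sizeof(m));
	m.sadb_msg_version = PF_KEY_V2;
	m.sadb_msg_type = SADB_ACQUIRE;
	m.sadb_msg_errno = (u_int8_t)err;
	m.sadb_msg_satype = req->sadb_msg_satype;
	m.sadb_msg_len = sizeof(m) / 8;		/* PFKEY_UNIT64() */
	m.sadb_msg_seq = req->sadb_msg_seq;	/* must echo the request */
	m.sadb_msg_pid = (u_int32_t)getpid();
	(void)write(pfkey_fd, &m, sizeof(m));
}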
static struct secacq *
key_newacq(const struct secasindex *saidx)
{
+ INIT_VNET_IPSEC(curvnet);
struct secacq *newacq;
/* get new entry */
newacq = malloc(sizeof(struct secacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO);
if (newacq == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return NULL;
}
/* copy secindex */
bcopy(saidx, &newacq->saidx, sizeof(newacq->saidx));
newacq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq);
newacq->created = time_second;
newacq->count = 0;
/* add to acqtree */
ACQ_LOCK();
LIST_INSERT_HEAD(&V_acqtree, newacq, chain);
ACQ_UNLOCK();
return newacq;
}
static struct secacq *
key_getacq(const struct secasindex *saidx)
{
+ INIT_VNET_IPSEC(curvnet);
struct secacq *acq;
ACQ_LOCK();
LIST_FOREACH(acq, &V_acqtree, chain) {
if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY))
break;
}
ACQ_UNLOCK();
return acq;
}
static struct secacq *
key_getacqbyseq(seq)
u_int32_t seq;
{
+ INIT_VNET_IPSEC(curvnet);
struct secacq *acq;
ACQ_LOCK();
LIST_FOREACH(acq, &V_acqtree, chain) {
if (acq->seq == seq)
break;
}
ACQ_UNLOCK();
return acq;
}
static struct secspacq *
key_newspacq(spidx)
struct secpolicyindex *spidx;
{
+ INIT_VNET_IPSEC(curvnet);
struct secspacq *acq;
/* get new entry */
acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO);
if (acq == NULL) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return NULL;
}
/* copy secindex */
bcopy(spidx, &acq->spidx, sizeof(acq->spidx));
acq->created = time_second;
acq->count = 0;
/* add to spacqtree */
SPACQ_LOCK();
LIST_INSERT_HEAD(&V_spacqtree, acq, chain);
SPACQ_UNLOCK();
return acq;
}
static struct secspacq *
key_getspacq(spidx)
struct secpolicyindex *spidx;
{
+ INIT_VNET_IPSEC(curvnet);
struct secspacq *acq;
SPACQ_LOCK();
LIST_FOREACH(acq, &V_spacqtree, chain) {
if (key_cmpspidx_exactly(spidx, &acq->spidx)) {
/* NB: return holding spacq_lock */
return acq;
}
}
SPACQ_UNLOCK();
return NULL;
}
/*
* SADB_ACQUIRE processing.
* In the first situation, we receive
* <base>
* from the ikmpd, and reset the pending acquisition entry matching its
* sequence number.
*
* In the second situation, we receive
* <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
* from a user land process, and return
* <base, address(SD), (address(P),) (identity(SD),) (sensitivity,) proposal>
* to the socket.
*
* m will always be freed.
*/
static int
key_acquire2(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
const struct sadb_address *src0, *dst0;
struct secasindex saidx;
struct secashead *sah;
u_int16_t proto;
int error;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/*
* Error message from the KMD.
* We assume that if an error occurred in the IKE daemon, the length of
* the PFKEY message is exactly the size of the sadb_msg structure.
* We do not raise an error even if one occurs in this function.
*/
if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) {
struct secacq *acq;
/* check sequence number */
if (mhp->msg->sadb_msg_seq == 0) {
ipseclog((LOG_DEBUG, "%s: must specify sequence "
"number.\n", __func__));
m_freem(m);
return 0;
}
if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) == NULL) {
/*
* the specified larval SA is already gone, or we got
* a bogus sequence number. we can silently ignore it.
*/
m_freem(m);
return 0;
}
/* Reset the acq counter so the entry can be deleted by the timehandler. */
acq->created = time_second;
acq->count = 0;
m_freem(m);
return 0;
}
/*
* This message is from user land.
*/
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL ||
mhp->ext[SADB_EXT_ADDRESS_DST] == NULL ||
mhp->ext[SADB_EXT_PROPOSAL] == NULL) {
/* error */
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) ||
mhp->extlen[SADB_EXT_PROPOSAL] < sizeof(struct sadb_prop)) {
/* error */
ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC];
dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST];
/* XXX boundary check against sa_len */
KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx);
/* get a SA index */
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (sah->state == SADB_SASTATE_DEAD)
continue;
if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID))
break;
}
SAHTREE_UNLOCK();
if (sah != NULL) {
ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__));
return key_senderror(so, m, EEXIST);
}
error = key_acquire(&saidx, NULL);
if (error != 0) {
ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n",
__func__, mhp->msg->sadb_msg_errno));
return key_senderror(so, m, error);
}
return key_sendup_mbuf(so, m, KEY_SENDUP_REGISTERED);
}
/*
* SADB_REGISTER processing.
* If SATYPE_UNSPEC has been passed as satype, only return sadb_supported.
* receive
* <base>
* from the ikmpd, and register a socket to send PF_KEY messages,
* and send
* <base, supported>
* to KMD by PF_KEY.
* When the socket is detached, its regnode entries must be freed.
*
* m will always be freed.
*/
static int
key_register(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secreg *reg, *newreg = 0;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* check for invalid register message */
if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0]))
return key_senderror(so, m, EINVAL);
/* When SATYPE_UNSPEC is specified, only return sadb_supported. */
if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC)
goto setmsg;
/* check whether existing or not */
REGTREE_LOCK();
LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) {
if (reg->so == so) {
REGTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: socket exists already.\n",
__func__));
return key_senderror(so, m, EEXIST);
}
}
/* create regnode */
newreg = malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO);
if (newreg == NULL) {
REGTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
newreg->so = so;
((struct keycb *)sotorawcb(so))->kp_registered++;
/* add regnode to regtree. */
LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain);
REGTREE_UNLOCK();
setmsg:
{
struct mbuf *n;
struct sadb_msg *newmsg;
struct sadb_supported *sup;
u_int len, alen, elen;
int off;
int i;
struct sadb_alg *alg;
/* create new sadb_msg to reply. */
alen = 0;
for (i = 1; i <= SADB_AALG_MAX; i++) {
if (ah_algorithm_lookup(i))
alen += sizeof(struct sadb_alg);
}
if (alen)
alen += sizeof(struct sadb_supported);
elen = 0;
for (i = 1; i <= SADB_EALG_MAX; i++) {
if (esp_algorithm_lookup(i))
elen += sizeof(struct sadb_alg);
}
if (elen)
elen += sizeof(struct sadb_supported);
len = sizeof(struct sadb_msg) + alen + elen;
if (len > MCLBYTES)
return key_senderror(so, m, ENOBUFS);
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (len > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return key_senderror(so, m, ENOBUFS);
n->m_pkthdr.len = n->m_len = len;
n->m_next = NULL;
off = 0;
m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off);
newmsg = mtod(n, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(len);
off += PFKEY_ALIGN8(sizeof(struct sadb_msg));
/* for authentication algorithm */
if (alen) {
sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
sup->sadb_supported_len = PFKEY_UNIT64(alen);
sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
off += PFKEY_ALIGN8(sizeof(*sup));
for (i = 1; i <= SADB_AALG_MAX; i++) {
struct auth_hash *aalgo;
u_int16_t minkeysize, maxkeysize;
aalgo = ah_algorithm_lookup(i);
if (!aalgo)
continue;
alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
alg->sadb_alg_id = i;
alg->sadb_alg_ivlen = 0;
key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize);
alg->sadb_alg_minbits = _BITS(minkeysize);
alg->sadb_alg_maxbits = _BITS(maxkeysize);
off += PFKEY_ALIGN8(sizeof(*alg));
}
}
/* for encryption algorithm */
if (elen) {
sup = (struct sadb_supported *)(mtod(n, caddr_t) + off);
sup->sadb_supported_len = PFKEY_UNIT64(elen);
sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT;
off += PFKEY_ALIGN8(sizeof(*sup));
for (i = 1; i <= SADB_EALG_MAX; i++) {
struct enc_xform *ealgo;
ealgo = esp_algorithm_lookup(i);
if (!ealgo)
continue;
alg = (struct sadb_alg *)(mtod(n, caddr_t) + off);
alg->sadb_alg_id = i;
alg->sadb_alg_ivlen = ealgo->blocksize;
alg->sadb_alg_minbits = _BITS(ealgo->minkey);
alg->sadb_alg_maxbits = _BITS(ealgo->maxkey);
off += PFKEY_ALIGN8(sizeof(struct sadb_alg));
}
}
IPSEC_ASSERT(off == len,
("length assumption failed (off %u len %u)", off, len));
m_freem(m);
return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED);
}
}
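/*
 * Illustrative userland counterpart (not part of this change): register
 * for ESP and receive the supported-algorithm reply assembled above.
 * Only the RFC 2367 socket API is assumed.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/pfkeyv2.h>
#include <string.h>
#include <unistd.h>

static int
register_esp(void)
{
	struct sadb_msg msg;
	unsigned char buf[4096];
	int s;

	s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
	if (s < 0)
		return (-1);
	memset(&msg, 0, sizeof(msg));
	msg.sadb_msg_version = PF_KEY_V2;
	msg.sadb_msg_type = SADB_REGISTER;
	msg.sadb_msg_satype = SADB_SATYPE_ESP;
	msg.sadb_msg_len = sizeof(msg) / 8;	/* PFKEY_UNIT64() */
	msg.sadb_msg_pid = (u_int32_t)getpid();
	if (write(s, &msg, sizeof(msg)) != (ssize_t)sizeof(msg) ||
	    read(s, buf, sizeof(buf)) < (ssize_t)sizeof(msg)) {
		close(s);
		return (-1);
	}
	/*
	 * buf now holds SADB_EXT_SUPPORTED_AUTH / _ENCRYPT extensions
	 * listing sadb_alg records; keep the registered socket open.
	 */
	return (s);
}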
/*
* Free the secreg entries registered for a socket.
* XXX: we would like to free the entries as soon as the socket that
* did SADB_REGISTER goes away.
*/
void
key_freereg(struct socket *so)
{
+ INIT_VNET_IPSEC(curvnet);
struct secreg *reg;
int i;
IPSEC_ASSERT(so != NULL, ("NULL so"));
/*
* Check whether the socket is registered at all.
* Check every SA type, because one socket may be registered for
* multiple types of SA.
*/
REGTREE_LOCK();
for (i = 0; i <= SADB_SATYPE_MAX; i++) {
LIST_FOREACH(reg, &V_regtree[i], chain) {
if (reg->so == so && __LIST_CHAINED(reg)) {
LIST_REMOVE(reg, chain);
free(reg, M_IPSEC_SAR);
break;
}
}
}
REGTREE_UNLOCK();
}
/*
* SADB_EXPIRE processing
* send
* <base, SA, SA2, lifetime(C and one of HS), address(SD)>
* to KMD by PF_KEY.
* NOTE: We send only soft lifetime extension.
*
* OUT: 0 : success
* others : error number
*/
static int
key_expire(struct secasvar *sav)
{
int s;
int satype;
struct mbuf *result = NULL, *m;
int len;
int error = -1;
struct sadb_lifetime *lt;
/* XXX: Why do we lock ? */
s = splnet(); /*called from softclock()*/
IPSEC_ASSERT (sav != NULL, ("null sav"));
IPSEC_ASSERT (sav->sah != NULL, ("null sa header"));
/* set msg header */
satype = key_proto2satype(sav->sah->saidx.proto);
IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype));
m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt);
if (!m) {
error = ENOBUFS;
goto fail;
}
result = m;
/* create SA extension */
m = key_setsadbsa(sav);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* create SA2 extension */
m = key_setsadbxsa2(sav->sah->saidx.mode,
sav->replay ? sav->replay->count : 0,
sav->sah->saidx.reqid);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* create lifetime extension (current and soft) */
len = PFKEY_ALIGN8(sizeof(*lt)) * 2;
m = key_alloc_mbuf(len);
if (!m || m->m_next) { /*XXX*/
if (m)
m_freem(m);
error = ENOBUFS;
goto fail;
}
bzero(mtod(m, caddr_t), len);
lt = mtod(m, struct sadb_lifetime *);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT;
lt->sadb_lifetime_allocations = sav->lft_c->allocations;
lt->sadb_lifetime_bytes = sav->lft_c->bytes;
lt->sadb_lifetime_addtime = sav->lft_c->addtime;
lt->sadb_lifetime_usetime = sav->lft_c->usetime;
lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2);
lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime));
lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT;
lt->sadb_lifetime_allocations = sav->lft_s->allocations;
lt->sadb_lifetime_bytes = sav->lft_s->bytes;
lt->sadb_lifetime_addtime = sav->lft_s->addtime;
lt->sadb_lifetime_usetime = sav->lft_s->usetime;
m_cat(result, m);
/* set sadb_address for source */
m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC,
&sav->sah->saidx.src.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
/* set sadb_address for destination */
m = key_setsadbaddr(SADB_EXT_ADDRESS_DST,
&sav->sah->saidx.dst.sa,
FULLMASK, IPSEC_ULPROTO_ANY);
if (!m) {
error = ENOBUFS;
goto fail;
}
m_cat(result, m);
if ((result->m_flags & M_PKTHDR) == 0) {
error = EINVAL;
goto fail;
}
if (result->m_len < sizeof(struct sadb_msg)) {
result = m_pullup(result, sizeof(struct sadb_msg));
if (result == NULL) {
error = ENOBUFS;
goto fail;
}
}
result->m_pkthdr.len = 0;
for (m = result; m; m = m->m_next)
result->m_pkthdr.len += m->m_len;
mtod(result, struct sadb_msg *)->sadb_msg_len =
PFKEY_UNIT64(result->m_pkthdr.len);
splx(s);
return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED);
fail:
if (result)
m_freem(result);
splx(s);
return error;
}
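/*
 * Illustrative listener-side sketch (not part of this change): the
 * SADB_EXPIRE message built above carries two sadb_lifetime extensions
 * back to back, CURRENT first and SOFT second (hard expirations use the
 * same shape with LIFETIME_HARD); they are told apart by exttype.
 */
#include <net/pfkeyv2.h>
#include <stdio.h>

static void
show_lifetime(const struct sadb_lifetime *lt)
{
	const char *kind;

	switch (lt->sadb_lifetime_exttype) {
	case SADB_EXT_LIFETIME_CURRENT:
		kind = "current";
		break;
	case SADB_EXT_LIFETIME_SOFT:
		kind = "soft";
		break;
	case SADB_EXT_LIFETIME_HARD:
		kind = "hard";
		break;
	default:
		return;
	}
	printf("%s: %llu bytes, %u allocations\n", kind,
	    (unsigned long long)lt->sadb_lifetime_bytes,
	    (unsigned)lt->sadb_lifetime_allocations);
}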
/*
* SADB_FLUSH processing
* receive
* <base>
* from the ikmpd, and free all entries in secastree.
* and send,
* <base>
* to the ikmpd.
* NOTE: all we do is mark entries SADB_SASTATE_DEAD.
*
* m will always be freed.
*/
static int
key_flush(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_msg *newmsg;
struct secashead *sah, *nextsah;
struct secasvar *sav, *nextsav;
u_int16_t proto;
u_int8_t state;
u_int stateidx;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/* If no SATYPE was specified, flush SAs of every type; otherwise only the matching proto. */
SAHTREE_LOCK();
for (sah = LIST_FIRST(&V_sahtree);
sah != NULL;
sah = nextsah) {
nextsah = LIST_NEXT(sah, chain);
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC
&& proto != sah->saidx.proto)
continue;
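/*
 * NB: the first _ARRAYLEN(saorder_state_alive) entries of
 * saorder_state_any are exactly the alive states, so this walks
 * every state except DEAD.
 */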
for (stateidx = 0;
stateidx < _ARRAYLEN(V_saorder_state_alive);
stateidx++) {
state = V_saorder_state_any[stateidx];
for (sav = LIST_FIRST(&sah->savtree[state]);
sav != NULL;
sav = nextsav) {
nextsav = LIST_NEXT(sav, chain);
key_sa_chgstate(sav, SADB_SASTATE_DEAD);
KEY_FREESAV(&sav);
}
}
sah->state = SADB_SASTATE_DEAD;
}
SAHTREE_UNLOCK();
if (m->m_len < sizeof(struct sadb_msg) ||
sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) {
ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__));
return key_senderror(so, m, ENOBUFS);
}
if (m->m_next)
m_freem(m->m_next);
m->m_next = NULL;
m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg);
newmsg = mtod(m, struct sadb_msg *);
newmsg->sadb_msg_errno = 0;
newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
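/*
 * Illustrative userland sketch (not part of this change): the sender's
 * side of this exchange is a bare header; SADB_SATYPE_UNSPEC flushes
 * SAs of every type, per the comment above.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <net/pfkeyv2.h>
#include <string.h>
#include <unistd.h>

static int
flush_all_sas(void)
{
	struct sadb_msg msg;
	int s, rv;

	s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
	if (s < 0)
		return (-1);
	memset(&msg, 0, sizeof(msg));
	msg.sadb_msg_version = PF_KEY_V2;
	msg.sadb_msg_type = SADB_FLUSH;
	msg.sadb_msg_satype = SADB_SATYPE_UNSPEC;
	msg.sadb_msg_len = sizeof(msg) / 8;	/* PFKEY_UNIT64() */
	msg.sadb_msg_pid = (u_int32_t)getpid();
	rv = (write(s, &msg, sizeof(msg)) == (ssize_t)sizeof(msg)) ? 0 : -1;
	close(s);
	return (rv);
}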
/*
* SADB_DUMP processing
* dump all entries including status of DEAD in SAD.
* receive
* <base>
* from the ikmpd, and dump all secasvar leaves
* and send,
* <base> .....
* to the ikmpd.
*
* m will always be freed.
*/
static int
key_dump(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah;
struct secasvar *sav;
u_int16_t proto;
u_int stateidx;
u_int8_t satype;
u_int8_t state;
int cnt;
struct sadb_msg *newmsg;
struct mbuf *n;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
/* map satype to proto */
if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) {
ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n",
__func__));
return key_senderror(so, m, EINVAL);
}
/* count sav entries to be sent to the userland. */
cnt = 0;
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC
&& proto != sah->saidx.proto)
continue;
for (stateidx = 0;
stateidx < _ARRAYLEN(V_saorder_state_any);
stateidx++) {
state = V_saorder_state_any[stateidx];
LIST_FOREACH(sav, &sah->savtree[state], chain) {
cnt++;
}
}
}
if (cnt == 0) {
SAHTREE_UNLOCK();
return key_senderror(so, m, ENOENT);
}
/* send this to the userland, one at a time. */
newmsg = NULL;
LIST_FOREACH(sah, &V_sahtree, chain) {
if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC
&& proto != sah->saidx.proto)
continue;
/* map proto to satype */
if ((satype = key_proto2satype(sah->saidx.proto)) == 0) {
SAHTREE_UNLOCK();
ipseclog((LOG_DEBUG, "%s: there was invalid proto in "
"SAD.\n", __func__));
return key_senderror(so, m, EINVAL);
}
for (stateidx = 0;
stateidx < _ARRAYLEN(V_saorder_state_any);
stateidx++) {
state = V_saorder_state_any[stateidx];
LIST_FOREACH(sav, &sah->savtree[state], chain) {
n = key_setdumpsa(sav, SADB_DUMP, satype,
--cnt, mhp->msg->sadb_msg_pid);
if (!n) {
SAHTREE_UNLOCK();
return key_senderror(so, m, ENOBUFS);
}
key_sendup_mbuf(so, n, KEY_SENDUP_ONE);
}
}
}
SAHTREE_UNLOCK();
m_freem(m);
return 0;
}
/*
* SADB_X_PROMISC processing
*
* m will always be freed.
*/
static int
key_promisc(so, m, mhp)
struct socket *so;
struct mbuf *m;
const struct sadb_msghdr *mhp;
{
int olen;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(mhp->msg != NULL, ("null msg"));
olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
if (olen < sizeof(struct sadb_msg)) {
#if 1
return key_senderror(so, m, EINVAL);
#else
m_freem(m);
return 0;
#endif
} else if (olen == sizeof(struct sadb_msg)) {
/* enable/disable promisc mode */
struct keycb *kp;
if ((kp = (struct keycb *)sotorawcb(so)) == NULL)
return key_senderror(so, m, EINVAL);
mhp->msg->sadb_msg_errno = 0;
switch (mhp->msg->sadb_msg_satype) {
case 0:
case 1:
kp->kp_promisc = mhp->msg->sadb_msg_satype;
break;
default:
return key_senderror(so, m, EINVAL);
}
/* send the original message back to everyone */
mhp->msg->sadb_msg_errno = 0;
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
} else {
/* send packet as is */
m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg)));
/* TODO: if sadb_msg_seq is specified, send to specific pid */
return key_sendup_mbuf(so, m, KEY_SENDUP_ALL);
}
}
static int (*key_typesw[]) __P((struct socket *, struct mbuf *,
const struct sadb_msghdr *)) = {
NULL, /* SADB_RESERVED */
key_getspi, /* SADB_GETSPI */
key_update, /* SADB_UPDATE */
key_add, /* SADB_ADD */
key_delete, /* SADB_DELETE */
key_get, /* SADB_GET */
key_acquire2, /* SADB_ACQUIRE */
key_register, /* SADB_REGISTER */
NULL, /* SADB_EXPIRE */
key_flush, /* SADB_FLUSH */
key_dump, /* SADB_DUMP */
key_promisc, /* SADB_X_PROMISC */
NULL, /* SADB_X_PCHANGE */
key_spdadd, /* SADB_X_SPDUPDATE */
key_spdadd, /* SADB_X_SPDADD */
key_spddelete, /* SADB_X_SPDDELETE */
key_spdget, /* SADB_X_SPDGET */
NULL, /* SADB_X_SPDACQUIRE */
key_spddump, /* SADB_X_SPDDUMP */
key_spdflush, /* SADB_X_SPDFLUSH */
key_spdadd, /* SADB_X_SPDSETIDX */
NULL, /* SADB_X_SPDEXPIRE */
key_spddelete2, /* SADB_X_SPDDELETE2 */
};
/*
* Parse an sadb_msg buffer to process PFKEYv2, and create a reply
* if needed. The message is handled directly as an mbuf chain.
* IN:
* m : received message, pulled up so the header is contiguous.
* It is rewritten into the response.
* so : pointer to socket.
* OUT:
* length of the buffer to send to the user process.
*/
int
key_parse(m, so)
struct mbuf *m;
struct socket *so;
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_msg *msg;
struct sadb_msghdr mh;
u_int orglen;
int error;
int target;
IPSEC_ASSERT(so != NULL, ("null socket"));
IPSEC_ASSERT(m != NULL, ("null mbuf"));
#if 0 /*kdebug_sadb assumes msg in linear buffer*/
KEYDEBUG(KEYDEBUG_KEY_DUMP,
ipseclog((LOG_DEBUG, "%s: passed sadb_msg\n", __func__));
kdebug_sadb(msg));
#endif
if (m->m_len < sizeof(struct sadb_msg)) {
m = m_pullup(m, sizeof(struct sadb_msg));
if (!m)
return ENOBUFS;
}
msg = mtod(m, struct sadb_msg *);
orglen = PFKEY_UNUNIT64(msg->sadb_msg_len);
target = KEY_SENDUP_ONE;
if ((m->m_flags & M_PKTHDR) == 0 ||
m->m_pkthdr.len != orglen) {
ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__));
V_pfkeystat.out_invlen++;
error = EINVAL;
goto senderror;
}
if (msg->sadb_msg_version != PF_KEY_V2) {
ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n",
__func__, msg->sadb_msg_version));
V_pfkeystat.out_invver++;
error = EINVAL;
goto senderror;
}
if (msg->sadb_msg_type > SADB_MAX) {
ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
__func__, msg->sadb_msg_type));
V_pfkeystat.out_invmsgtype++;
error = EINVAL;
goto senderror;
}
/* for old-fashioned code - should be nuked */
if (m->m_pkthdr.len > MCLBYTES) {
m_freem(m);
return ENOBUFS;
}
if (m->m_next) {
struct mbuf *n;
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n && m->m_pkthdr.len > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (!n) {
m_freem(m);
return ENOBUFS;
}
m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t));
n->m_pkthdr.len = n->m_len = m->m_pkthdr.len;
n->m_next = NULL;
m_freem(m);
m = n;
}
/* align the mbuf chain so that extensions are in contiguous region. */
error = key_align(m, &mh);
if (error)
return error;
msg = mh.msg;
/* check SA type */
switch (msg->sadb_msg_satype) {
case SADB_SATYPE_UNSPEC:
switch (msg->sadb_msg_type) {
case SADB_GETSPI:
case SADB_UPDATE:
case SADB_ADD:
case SADB_DELETE:
case SADB_GET:
case SADB_ACQUIRE:
case SADB_EXPIRE:
ipseclog((LOG_DEBUG, "%s: must specify satype "
"when msg type=%u.\n", __func__,
msg->sadb_msg_type));
V_pfkeystat.out_invsatype++;
error = EINVAL;
goto senderror;
}
break;
case SADB_SATYPE_AH:
case SADB_SATYPE_ESP:
case SADB_X_SATYPE_IPCOMP:
case SADB_X_SATYPE_TCPSIGNATURE:
switch (msg->sadb_msg_type) {
case SADB_X_SPDADD:
case SADB_X_SPDDELETE:
case SADB_X_SPDGET:
case SADB_X_SPDDUMP:
case SADB_X_SPDFLUSH:
case SADB_X_SPDSETIDX:
case SADB_X_SPDUPDATE:
case SADB_X_SPDDELETE2:
ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n",
__func__, msg->sadb_msg_type));
V_pfkeystat.out_invsatype++;
error = EINVAL;
goto senderror;
}
break;
case SADB_SATYPE_RSVP:
case SADB_SATYPE_OSPFV2:
case SADB_SATYPE_RIPV2:
case SADB_SATYPE_MIP:
ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n",
__func__, msg->sadb_msg_satype));
V_pfkeystat.out_invsatype++;
error = EOPNOTSUPP;
goto senderror;
case 1: /* XXX: What does it do? */
if (msg->sadb_msg_type == SADB_X_PROMISC)
break;
/*FALLTHROUGH*/
default:
ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n",
__func__, msg->sadb_msg_satype));
V_pfkeystat.out_invsatype++;
error = EINVAL;
goto senderror;
}
/* check field of upper layer protocol and address family */
if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL
&& mh.ext[SADB_EXT_ADDRESS_DST] != NULL) {
struct sadb_address *src0, *dst0;
u_int plen;
src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]);
dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]);
/* check upper layer protocol */
if (src0->sadb_address_proto != dst0->sadb_address_proto) {
ipseclog((LOG_DEBUG, "%s: upper layer protocol "
"mismatched.\n", __func__));
V_pfkeystat.out_invaddr++;
error = EINVAL;
goto senderror;
}
/* check family */
if (PFKEY_ADDR_SADDR(src0)->sa_family !=
PFKEY_ADDR_SADDR(dst0)->sa_family) {
ipseclog((LOG_DEBUG, "%s: address family mismatched.\n",
__func__));
V_pfkeystat.out_invaddr++;
error = EINVAL;
goto senderror;
}
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
PFKEY_ADDR_SADDR(dst0)->sa_len) {
ipseclog((LOG_DEBUG, "%s: address struct size "
"mismatched.\n", __func__));
V_pfkeystat.out_invaddr++;
error = EINVAL;
goto senderror;
}
switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
case AF_INET:
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
sizeof(struct sockaddr_in)) {
V_pfkeystat.out_invaddr++;
error = EINVAL;
goto senderror;
}
break;
case AF_INET6:
if (PFKEY_ADDR_SADDR(src0)->sa_len !=
sizeof(struct sockaddr_in6)) {
V_pfkeystat.out_invaddr++;
error = EINVAL;
goto senderror;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: unsupported address family\n",
__func__));
V_pfkeystat.out_invaddr++;
error = EAFNOSUPPORT;
goto senderror;
}
switch (PFKEY_ADDR_SADDR(src0)->sa_family) {
case AF_INET:
plen = sizeof(struct in_addr) << 3;
break;
case AF_INET6:
plen = sizeof(struct in6_addr) << 3;
break;
default:
plen = 0; /*fool gcc*/
break;
}
/* check max prefix length */
if (src0->sadb_address_prefixlen > plen ||
dst0->sadb_address_prefixlen > plen) {
ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n",
__func__));
V_pfkeystat.out_invaddr++;
error = EINVAL;
goto senderror;
}
/*
* prefixlen == 0 is valid because there can be a case when
* all addresses are matched.
*/
}
if (msg->sadb_msg_type >= sizeof(key_typesw)/sizeof(key_typesw[0]) ||
key_typesw[msg->sadb_msg_type] == NULL) {
V_pfkeystat.out_invmsgtype++;
error = EINVAL;
goto senderror;
}
return (*key_typesw[msg->sadb_msg_type])(so, m, &mh);
senderror:
msg->sadb_msg_errno = error;
return key_sendup_mbuf(so, m, target);
}
static int
key_senderror(so, m, code)
struct socket *so;
struct mbuf *m;
int code;
{
struct sadb_msg *msg;
IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
("mbuf too small, len %u", m->m_len));
msg = mtod(m, struct sadb_msg *);
msg->sadb_msg_errno = code;
return key_sendup_mbuf(so, m, KEY_SENDUP_ONE);
}
/*
* set the pointer to each header into message buffer.
* m will be freed on error.
* XXX larger-than-MCLBYTES extension?
*/
static int
key_align(m, mhp)
struct mbuf *m;
struct sadb_msghdr *mhp;
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *n;
struct sadb_ext *ext;
size_t off, end;
int extlen;
int toff;
IPSEC_ASSERT(m != NULL, ("null mbuf"));
IPSEC_ASSERT(mhp != NULL, ("null msghdr"));
IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg),
("mbuf too small, len %u", m->m_len));
/* initialize */
bzero(mhp, sizeof(*mhp));
mhp->msg = mtod(m, struct sadb_msg *);
mhp->ext[0] = (struct sadb_ext *)mhp->msg; /*XXX backward compat */
end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len);
extlen = end; /*just in case extlen is not updated*/
for (off = sizeof(struct sadb_msg); off < end; off += extlen) {
n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff);
if (!n) {
/* m is already freed */
return ENOBUFS;
}
ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
/* set pointer */
switch (ext->sadb_ext_type) {
case SADB_EXT_SA:
case SADB_EXT_ADDRESS_SRC:
case SADB_EXT_ADDRESS_DST:
case SADB_EXT_ADDRESS_PROXY:
case SADB_EXT_LIFETIME_CURRENT:
case SADB_EXT_LIFETIME_HARD:
case SADB_EXT_LIFETIME_SOFT:
case SADB_EXT_KEY_AUTH:
case SADB_EXT_KEY_ENCRYPT:
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
case SADB_EXT_SENSITIVITY:
case SADB_EXT_PROPOSAL:
case SADB_EXT_SUPPORTED_AUTH:
case SADB_EXT_SUPPORTED_ENCRYPT:
case SADB_EXT_SPIRANGE:
case SADB_X_EXT_POLICY:
case SADB_X_EXT_SA2:
/* duplicate check */
/*
* XXX Can there be duplicate payloads of either
* KEY_AUTH or KEY_ENCRYPT?
*/
if (mhp->ext[ext->sadb_ext_type] != NULL) {
ipseclog((LOG_DEBUG, "%s: duplicate ext_type "
"%u\n", __func__, ext->sadb_ext_type));
m_freem(m);
V_pfkeystat.out_dupext++;
return EINVAL;
}
break;
default:
ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n",
__func__, ext->sadb_ext_type));
m_freem(m);
V_pfkeystat.out_invexttype++;
return EINVAL;
}
extlen = PFKEY_UNUNIT64(ext->sadb_ext_len);
if (key_validate_ext(ext, extlen)) {
m_freem(m);
V_pfkeystat.out_invlen++;
return EINVAL;
}
n = m_pulldown(m, off, extlen, &toff);
if (!n) {
/* m is already freed */
return ENOBUFS;
}
ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff);
mhp->ext[ext->sadb_ext_type] = ext;
mhp->extoff[ext->sadb_ext_type] = off;
mhp->extlen[ext->sadb_ext_type] = extlen;
}
if (off != end) {
m_freem(m);
V_pfkeystat.out_invlen++;
return EINVAL;
}
return 0;
}
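/*
 * Illustrative sketch (not part of this change): the same alignment
 * walk key_align() performs, done over a flat userland buffer that has
 * already been read in full and is suitably aligned. The local macro
 * mirrors the kernel's PFKEY_UNUNIT64().
 */
#include <net/pfkeyv2.h>
#include <stddef.h>
#include <stdio.h>

#define MY_PFKEY_UNUNIT64(a)	((size_t)(a) << 3)

static void
walk_exts(const unsigned char *buf, size_t buflen)
{
	const struct sadb_msg *msg = (const struct sadb_msg *)buf;
	size_t off, end, extlen;

	end = MY_PFKEY_UNUNIT64(msg->sadb_msg_len);
	if (end > buflen)
		return;			/* truncated message */
	for (off = sizeof(*msg); off + sizeof(struct sadb_ext) <= end;
	    off += extlen) {
		const struct sadb_ext *ext =
		    (const struct sadb_ext *)(buf + off);

		extlen = MY_PFKEY_UNUNIT64(ext->sadb_ext_len);
		if (extlen < sizeof(*ext) || off + extlen > end)
			return;		/* malformed, mirrors EINVAL above */
		printf("ext type %u, %zu bytes\n",
		    (unsigned)ext->sadb_ext_type, extlen);
	}
}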
static int
key_validate_ext(ext, len)
const struct sadb_ext *ext;
int len;
{
const struct sockaddr *sa;
enum { NONE, ADDR } checktype = NONE;
int baselen = 0;
const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len);
if (len != PFKEY_UNUNIT64(ext->sadb_ext_len))
return EINVAL;
/* if it does not match minimum/maximum length, bail */
if (ext->sadb_ext_type >= sizeof(minsize) / sizeof(minsize[0]) ||
ext->sadb_ext_type >= sizeof(maxsize) / sizeof(maxsize[0]))
return EINVAL;
if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type])
return EINVAL;
if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type])
return EINVAL;
/* more checks based on sadb_ext_type XXX need more */
switch (ext->sadb_ext_type) {
case SADB_EXT_ADDRESS_SRC:
case SADB_EXT_ADDRESS_DST:
case SADB_EXT_ADDRESS_PROXY:
baselen = PFKEY_ALIGN8(sizeof(struct sadb_address));
checktype = ADDR;
break;
case SADB_EXT_IDENTITY_SRC:
case SADB_EXT_IDENTITY_DST:
if (((const struct sadb_ident *)ext)->sadb_ident_type ==
SADB_X_IDENTTYPE_ADDR) {
baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident));
checktype = ADDR;
} else
checktype = NONE;
break;
default:
checktype = NONE;
break;
}
switch (checktype) {
case NONE:
break;
case ADDR:
sa = (const struct sockaddr *)(((const u_int8_t*)ext)+baselen);
if (len < baselen + sal)
return EINVAL;
if (baselen + PFKEY_ALIGN8(sa->sa_len) != len)
return EINVAL;
break;
}
return 0;
}
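/*
 * Worked example for the ADDR branch above (illustrative, not part of
 * this change; the local macro mirrors the kernel's PFKEY_ALIGN8()):
 * an AF_INET sadb_address extension must be exactly 8 (header) +
 * PFKEY_ALIGN8(16) = 24 bytes, and an AF_INET6 one 8 + PFKEY_ALIGN8(28)
 * = 40 bytes; anything shorter fails the sal check, anything longer
 * fails the exact-fit check.
 */
#include <assert.h>
#include <netinet/in.h>
#include <net/pfkeyv2.h>

#define MY_PFKEY_ALIGN8(a)	(1 + (((a) - 1) | (8 - 1)))

static void
addr_ext_size_demo(void)
{
	assert(sizeof(struct sadb_address) +
	    MY_PFKEY_ALIGN8(sizeof(struct sockaddr_in)) == 24);
	assert(sizeof(struct sadb_address) +
	    MY_PFKEY_ALIGN8(sizeof(struct sockaddr_in6)) == 40);
}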
void
key_init(void)
{
+ INIT_VNET_IPSEC(curvnet);
int i;
SPTREE_LOCK_INIT();
REGTREE_LOCK_INIT();
SAHTREE_LOCK_INIT();
ACQ_LOCK_INIT();
SPACQ_LOCK_INIT();
for (i = 0; i < IPSEC_DIR_MAX; i++)
LIST_INIT(&V_sptree[i]);
LIST_INIT(&V_sahtree);
for (i = 0; i <= SADB_SATYPE_MAX; i++)
LIST_INIT(&V_regtree[i]);
LIST_INIT(&V_acqtree);
LIST_INIT(&V_spacqtree);
/* system default */
V_ip4_def_policy.policy = IPSEC_POLICY_NONE;
V_ip4_def_policy.refcnt++; /*never reclaim this*/
#ifndef IPSEC_DEBUG2
timeout((void *)key_timehandler, (void *)0, hz);
#endif /*IPSEC_DEBUG2*/
/* initialize key statistics */
keystat.getspi_count = 1;
printf("IPsec: Initialized Security Association Processing.\n");
return;
}
/*
* XXX: This function is presumably called after INBOUND IPsec processing.
*
* Special check for tunnel-mode packets.
* We must make some checks for consistency between inner and outer IP header.
*
* xxx more checks to be provided
*/
int
key_checktunnelsanity(sav, family, src, dst)
struct secasvar *sav;
u_int family;
caddr_t src;
caddr_t dst;
{
IPSEC_ASSERT(sav->sah != NULL, ("null SA header"));
/* XXX: check inner IP header */
return 1;
}
/* record data transfer on SA, and update timestamps */
void
key_sa_recordxfer(sav, m)
struct secasvar *sav;
struct mbuf *m;
{
IPSEC_ASSERT(sav != NULL, ("Null secasvar"));
IPSEC_ASSERT(m != NULL, ("Null mbuf"));
if (!sav->lft_c)
return;
/*
* XXX Currently the byte counts differ between inbound and
* outbound processing.
*/
sav->lft_c->bytes += m->m_pkthdr.len;
/* The bytes lifetime check is done in key_timehandler(). */
/*
* We use the number of packets as the unit of
* allocations. We increment the variable
* whenever {esp,ah}_{in,out}put is called.
*/
sav->lft_c->allocations++;
/* XXX check for expires? */
/*
* NOTE: We record CURRENT usetime by using wall clock,
* in seconds. HARD and SOFT lifetime are measured by the time
* difference (again in seconds) from usetime.
*
* usetime
* v expire expire
* -----+-----+--------+---> t
* <--------------> HARD
* <-----> SOFT
*/
sav->lft_c->usetime = time_second;
/* XXX check for expires? */
return;
}
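/*
 * Compact restatement of the timeline above (illustrative names, not
 * part of this change): soft and hard usetime lifetimes are offsets in
 * seconds from the recorded wall-clock usetime.
 */
#include <time.h>

/* 0 = alive, 1 = soft-expired (renegotiate), 2 = hard-expired (drop). */
static int
sa_use_state(time_t now, time_t usetime, time_t soft, time_t hard)
{
	if (hard != 0 && now - usetime >= hard)
		return (2);
	if (soft != 0 && now - usetime >= soft)
		return (1);
	return (0);
}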
/* dumb version */
void
key_sa_routechange(dst)
struct sockaddr *dst;
{
+ INIT_VNET_IPSEC(curvnet);
struct secashead *sah;
struct route *ro;
SAHTREE_LOCK();
LIST_FOREACH(sah, &V_sahtree, chain) {
ro = &sah->sa_route;
if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len
&& bcmp(dst, &ro->ro_dst, dst->sa_len) == 0) {
RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)NULL;
}
}
SAHTREE_UNLOCK();
}
static void
key_sa_chgstate(sav, state)
struct secasvar *sav;
u_int8_t state;
{
IPSEC_ASSERT(sav != NULL, ("NULL sav"));
SAHTREE_LOCK_ASSERT();
if (sav->state != state) {
if (__LIST_CHAINED(sav))
LIST_REMOVE(sav, chain);
sav->state = state;
LIST_INSERT_HEAD(&sav->sah->savtree[state], sav, chain);
}
}
void
key_sa_stir_iv(sav)
struct secasvar *sav;
{
IPSEC_ASSERT(sav->iv != NULL, ("null IV"));
key_randomfill(sav->iv, sav->ivlen);
}
/* XXX too much? */
static struct mbuf *
key_alloc_mbuf(l)
int l;
{
struct mbuf *m = NULL, *n;
int len, t;
len = l;
while (len > 0) {
MGET(n, M_DONTWAIT, MT_DATA);
if (n && len > MLEN)
MCLGET(n, M_DONTWAIT);
if (!n) {
m_freem(m);
return NULL;
}
n->m_next = NULL;
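/* NB: m_len must be cleared first; M_TRAILINGSPACE() subtracts it. */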
n->m_len = 0;
n->m_len = M_TRAILINGSPACE(n);
/* use the bottom of mbuf, hoping we can prepend afterwards */
if (n->m_len > len) {
t = (n->m_len - len) & ~(sizeof(long) - 1);
n->m_data += t;
n->m_len = len;
}
len -= n->m_len;
if (m)
m_cat(m, n);
else
m = n;
}
return m;
}
/*
* Take one of the kernel's security keys and convert it into a PF_KEY
* structure within an mbuf, suitable for sending up to a waiting
* application in user land.
*
* IN:
* src: A pointer to a kernel security key.
* exttype: Which type of key this is. Refer to the PF_KEY data structures.
* OUT:
* a valid mbuf or NULL indicating an error
*
*/
static struct mbuf *
key_setkey(struct seckey *src, u_int16_t exttype)
{
struct mbuf *m;
struct sadb_key *p;
int len;
if (src == NULL)
return NULL;
len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src));
m = key_alloc_mbuf(len);
if (m == NULL)
return NULL;
p = mtod(m, struct sadb_key *);
bzero(p, len);
p->sadb_key_len = PFKEY_UNIT64(len);
p->sadb_key_exttype = exttype;
p->sadb_key_bits = src->bits;
bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src));
return m;
}
/*
* Take one of the kernel's lifetime data structures and convert it
* into a PF_KEY structure within an mbuf, suitable for sending up to
* a waiting application in user land.
*
* IN:
* src: A pointer to a kernel lifetime structure.
* exttype: Which type of lifetime this is. Refer to the PF_KEY
* data structures for more information.
* OUT:
* a valid mbuf or NULL indicating an error
*
*/
static struct mbuf *
key_setlifetime(struct seclifetime *src, u_int16_t exttype)
{
struct mbuf *m = NULL;
struct sadb_lifetime *p;
int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime));
if (src == NULL)
return NULL;
m = key_alloc_mbuf(len);
if (m == NULL)
return m;
p = mtod(m, struct sadb_lifetime *);
bzero(p, len);
p->sadb_lifetime_len = PFKEY_UNIT64(len);
p->sadb_lifetime_exttype = exttype;
p->sadb_lifetime_allocations = src->allocations;
p->sadb_lifetime_bytes = src->bytes;
p->sadb_lifetime_addtime = src->addtime;
p->sadb_lifetime_usetime = src->usetime;
return m;
}
Index: head/sys/netipsec/keysock.c
===================================================================
--- head/sys/netipsec/keysock.c (revision 183549)
+++ head/sys/netipsec/keysock.c (revision 183550)
@@ -1,573 +1,585 @@
/* $FreeBSD$ */
/* $KAME: keysock.c,v 1.25 2001/08/13 20:07:41 itojun Exp $ */
/*-
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_ipsec.h"
/* This code is derived from sys/net/rtsock.c on FreeBSD 2.2.5 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vimage.h>
+#include <net/if.h>
#include <net/raw_cb.h>
#include <net/route.h>
+#include <netinet/in.h>
+
#include <net/pfkeyv2.h>
#include <netipsec/key.h>
#include <netipsec/keysock.h>
#include <netipsec/key_debug.h>
+#include <netipsec/ipsec.h>
#include <machine/stdarg.h>
struct key_cb {
int key_count;
int any_count;
};
static struct key_cb key_cb;
static struct sockaddr key_src = { 2, PF_KEY, };
static int key_sendup0 __P((struct rawcb *, struct mbuf *, int));
struct pfkeystat pfkeystat;
/*
* key_output()
*/
int
key_output(struct mbuf *m, struct socket *so)
{
+ INIT_VNET_IPSEC(curvnet);
struct sadb_msg *msg;
int len, error = 0;
if (m == 0)
panic("%s: NULL pointer was passed.\n", __func__);
V_pfkeystat.out_total++;
V_pfkeystat.out_bytes += m->m_pkthdr.len;
len = m->m_pkthdr.len;
if (len < sizeof(struct sadb_msg)) {
V_pfkeystat.out_tooshort++;
error = EINVAL;
goto end;
}
if (m->m_len < sizeof(struct sadb_msg)) {
if ((m = m_pullup(m, sizeof(struct sadb_msg))) == 0) {
V_pfkeystat.out_nomem++;
error = ENOBUFS;
goto end;
}
}
M_ASSERTPKTHDR(m);
KEYDEBUG(KEYDEBUG_KEY_DUMP, kdebug_mbuf(m));
msg = mtod(m, struct sadb_msg *);
V_pfkeystat.out_msgtype[msg->sadb_msg_type]++;
if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) {
V_pfkeystat.out_invlen++;
error = EINVAL;
goto end;
}
error = key_parse(m, so);
m = NULL;
end:
if (m)
m_freem(m);
return error;
}
/*
* send message to the socket.
*/
static int
key_sendup0(rp, m, promisc)
struct rawcb *rp;
struct mbuf *m;
int promisc;
{
+ INIT_VNET_IPSEC(curvnet);
int error;
if (promisc) {
struct sadb_msg *pmsg;
M_PREPEND(m, sizeof(struct sadb_msg), M_DONTWAIT);
if (m && m->m_len < sizeof(struct sadb_msg))
m = m_pullup(m, sizeof(struct sadb_msg));
if (!m) {
V_pfkeystat.in_nomem++;
return ENOBUFS;
}
m->m_pkthdr.len += sizeof(*pmsg);
pmsg = mtod(m, struct sadb_msg *);
bzero(pmsg, sizeof(*pmsg));
pmsg->sadb_msg_version = PF_KEY_V2;
pmsg->sadb_msg_type = SADB_X_PROMISC;
pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
/* pid and seq? */
V_pfkeystat.in_msgtype[pmsg->sadb_msg_type]++;
}
if (!sbappendaddr(&rp->rcb_socket->so_rcv, (struct sockaddr *)&V_key_src,
m, NULL)) {
V_pfkeystat.in_nomem++;
m_freem(m);
error = ENOBUFS;
} else
error = 0;
sorwakeup(rp->rcb_socket);
return error;
}
/* XXX this interface should be obsoleted. */
int
key_sendup(so, msg, len, target)
struct socket *so;
struct sadb_msg *msg;
u_int len;
int target; /*target of the resulting message*/
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *m, *n, *mprev;
int tlen;
/* sanity check */
if (so == 0 || msg == 0)
panic("%s: NULL pointer was passed.\n", __func__);
KEYDEBUG(KEYDEBUG_KEY_DUMP,
printf("%s: \n", __func__);
kdebug_sadb(msg));
/*
* we increment statistics here, just in case we have ENOBUFS
* in this function.
*/
V_pfkeystat.in_total++;
V_pfkeystat.in_bytes += len;
V_pfkeystat.in_msgtype[msg->sadb_msg_type]++;
/*
* Use plain mbufs (not clusters) whenever possible, to conserve
* socket buffer space. We'll be generating many SADB_ACQUIRE
* messages to listening key sockets; if we simply allocated
* clusters, sbappendaddr() would raise ENOBUFS for lack of
* sbspace(), since sbspace() counts both the actual data bytes
* AND the mbuf storage.
*
* TODO: SADB_ACQUIRE filters should be implemented.
*/
tlen = len;
m = mprev = NULL;
while (tlen > 0) {
if (tlen == len) {
MGETHDR(n, M_DONTWAIT, MT_DATA);
if (n == NULL) {
V_pfkeystat.in_nomem++;
return ENOBUFS;
}
n->m_len = MHLEN;
} else {
MGET(n, M_DONTWAIT, MT_DATA);
if (n == NULL) {
V_pfkeystat.in_nomem++;
return ENOBUFS;
}
n->m_len = MLEN;
}
if (tlen >= MCLBYTES) { /*XXX better threshold? */
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
m_freem(m);
V_pfkeystat.in_nomem++;
return ENOBUFS;
}
n->m_len = MCLBYTES;
}
if (tlen < n->m_len)
n->m_len = tlen;
n->m_next = NULL;
if (m == NULL)
m = mprev = n;
else {
mprev->m_next = n;
mprev = n;
}
tlen -= n->m_len;
n = NULL;
}
m->m_pkthdr.len = len;
m->m_pkthdr.rcvif = NULL;
m_copyback(m, 0, len, (caddr_t)msg);
/* avoid duplicated statistics */
V_pfkeystat.in_total--;
V_pfkeystat.in_bytes -= len;
V_pfkeystat.in_msgtype[msg->sadb_msg_type]--;
return key_sendup_mbuf(so, m, target);
}
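/*
 * Rough cost model for the sbspace() remark above (illustrative, not
 * part of this change; the accounting is approximate): a message split
 * across plain mbufs charges the socket buffer far less than the same
 * bytes stored in clusters.
 */
#include <sys/param.h>
#include <sys/mbuf.h>

static size_t
sb_cost_plain_mbufs(size_t len)
{
	size_t nmb = (len + MLEN - 1) / MLEN;

	return (nmb * MSIZE);		/* data lives inside the mbufs */
}

static size_t
sb_cost_clusters(size_t len)
{
	size_t ncl = (len + MCLBYTES - 1) / MCLBYTES;

	return (ncl * (MSIZE + MCLBYTES)); /* header mbuf plus cluster */
}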
/* so can be NULL if target != KEY_SENDUP_ONE */
int
key_sendup_mbuf(so, m, target)
struct socket *so;
struct mbuf *m;
int target;
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *n;
struct keycb *kp;
int sendup;
struct rawcb *rp;
int error = 0;
if (m == NULL)
panic("key_sendup_mbuf: NULL pointer was passed.\n");
if (so == NULL && target == KEY_SENDUP_ONE)
panic("%s: NULL pointer was passed.\n", __func__);
V_pfkeystat.in_total++;
V_pfkeystat.in_bytes += m->m_pkthdr.len;
if (m->m_len < sizeof(struct sadb_msg)) {
m = m_pullup(m, sizeof(struct sadb_msg));
if (m == NULL) {
V_pfkeystat.in_nomem++;
return ENOBUFS;
}
}
if (m->m_len >= sizeof(struct sadb_msg)) {
struct sadb_msg *msg;
msg = mtod(m, struct sadb_msg *);
V_pfkeystat.in_msgtype[msg->sadb_msg_type]++;
}
mtx_lock(&rawcb_mtx);
LIST_FOREACH(rp, &V_rawcb_list, list)
{
if (rp->rcb_proto.sp_family != PF_KEY)
continue;
if (rp->rcb_proto.sp_protocol
&& rp->rcb_proto.sp_protocol != PF_KEY_V2) {
continue;
}
kp = (struct keycb *)rp;
/*
* A socket in promiscuous mode receives its broadcasted reply twice:
* once as a promiscuous copy and once normally.
* (based on pf_key@inner.net message on 14 Oct 1998)
*/
if (((struct keycb *)rp)->kp_promisc) {
if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) {
(void)key_sendup0(rp, n, 1);
n = NULL;
}
}
/* the exact target will be processed later */
if (so && sotorawcb(so) == rp)
continue;
sendup = 0;
switch (target) {
case KEY_SENDUP_ONE:
/* the statement has no effect */
if (so && sotorawcb(so) == rp)
sendup++;
break;
case KEY_SENDUP_ALL:
sendup++;
break;
case KEY_SENDUP_REGISTERED:
if (kp->kp_registered)
sendup++;
break;
}
V_pfkeystat.in_msgtarget[target]++;
if (!sendup)
continue;
if ((n = m_copy(m, 0, (int)M_COPYALL)) == NULL) {
m_freem(m);
V_pfkeystat.in_nomem++;
mtx_unlock(&rawcb_mtx);
return ENOBUFS;
}
if ((error = key_sendup0(rp, n, 0)) != 0) {
m_freem(m);
mtx_unlock(&rawcb_mtx);
return error;
}
n = NULL;
}
if (so) {
error = key_sendup0(sotorawcb(so), m, 0);
m = NULL;
} else {
error = 0;
m_freem(m);
}
mtx_unlock(&rawcb_mtx);
return error;
}
/*
* key_abort()
* derived from net/rtsock.c:rts_abort()
*/
static void
key_abort(struct socket *so)
{
raw_usrreqs.pru_abort(so);
}
/*
* key_attach()
* derived from net/rtsock.c:rts_attach()
*/
static int
key_attach(struct socket *so, int proto, struct thread *td)
{
+ INIT_VNET_IPSEC(curvnet);
struct keycb *kp;
int error;
KASSERT(so->so_pcb == NULL, ("key_attach: so_pcb != NULL"));
if (td != NULL) {
error = priv_check(td, PRIV_NET_RAW);
if (error)
return error;
}
/* XXX */
MALLOC(kp, struct keycb *, sizeof *kp, M_PCB, M_WAITOK | M_ZERO);
if (kp == 0)
return ENOBUFS;
so->so_pcb = (caddr_t)kp;
error = raw_attach(so, proto);
kp = (struct keycb *)sotorawcb(so);
if (error) {
free(kp, M_PCB);
so->so_pcb = (caddr_t) 0;
return error;
}
kp->kp_promisc = kp->kp_registered = 0;
if (kp->kp_raw.rcb_proto.sp_protocol == PF_KEY) /* XXX: AF_KEY */
V_key_cb.key_count++;
V_key_cb.any_count++;
soisconnected(so);
so->so_options |= SO_USELOOPBACK;
return 0;
}
/*
* key_bind()
* derived from net/rtsock.c:rts_bind()
*/
static int
key_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return EINVAL;
}
/*
* key_close()
* derived from net/rtsock.c:rts_close().
*/
static void
key_close(struct socket *so)
{
raw_usrreqs.pru_close(so);
}
/*
* key_connect()
* derived from net/rtsock.c:rts_connect()
*/
static int
key_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
return EINVAL;
}
/*
* key_detach()
* derived from net/rtsock.c:rts_detach()
*/
static void
key_detach(struct socket *so)
{
+ INIT_VNET_IPSEC(curvnet);
struct keycb *kp = (struct keycb *)sotorawcb(so);
KASSERT(kp != NULL, ("key_detach: kp == NULL"));
if (kp->kp_raw.rcb_proto.sp_protocol
== PF_KEY) /* XXX: AF_KEY */
V_key_cb.key_count--;
V_key_cb.any_count--;
key_freereg(so);
raw_usrreqs.pru_detach(so);
}
/*
* key_disconnect()
* derived from net/rtsock.c:rts_disconnect()
*/
static int
key_disconnect(struct socket *so)
{
return(raw_usrreqs.pru_disconnect(so));
}
/*
* key_peeraddr()
* derived from net/rtsock.c:rts_peeraddr()
*/
static int
key_peeraddr(struct socket *so, struct sockaddr **nam)
{
return(raw_usrreqs.pru_peeraddr(so, nam));
}
/*
* key_send()
* derived from net/rtsock.c:rts_send()
*/
static int
key_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
{
return(raw_usrreqs.pru_send(so, flags, m, nam, control, td));
}
/*
* key_shutdown()
* derived from net/rtsock.c:rts_shutdown()
*/
static int
key_shutdown(struct socket *so)
{
return(raw_usrreqs.pru_shutdown(so));
}
/*
* key_sockaddr()
* derived from net/rtsock.c:rts_sockaddr()
*/
static int
key_sockaddr(struct socket *so, struct sockaddr **nam)
{
return(raw_usrreqs.pru_sockaddr(so, nam));
}
struct pr_usrreqs key_usrreqs = {
.pru_abort = key_abort,
.pru_attach = key_attach,
.pru_bind = key_bind,
.pru_connect = key_connect,
.pru_detach = key_detach,
.pru_disconnect = key_disconnect,
.pru_peeraddr = key_peeraddr,
.pru_send = key_send,
.pru_shutdown = key_shutdown,
.pru_sockaddr = key_sockaddr,
.pru_close = key_close,
};
/* sysctl */
SYSCTL_NODE(_net, PF_KEY, key, CTLFLAG_RW, 0, "Key Family");
/*
* Definitions of protocols supported in the KEY domain.
*/
extern struct domain keydomain;
struct protosw keysw[] = {
{
.pr_type = SOCK_RAW,
.pr_domain = &keydomain,
.pr_protocol = PF_KEY_V2,
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_output = key_output,
.pr_ctlinput = raw_ctlinput,
.pr_init = raw_init,
.pr_usrreqs = &key_usrreqs
}
};
static void
key_init0(void)
{
+ INIT_VNET_IPSEC(curvnet);
bzero((caddr_t)&V_key_cb, sizeof(V_key_cb));
key_init();
}
struct domain keydomain = {
.dom_family = PF_KEY,
.dom_name = "key",
.dom_init = key_init0,
.dom_protosw = keysw,
.dom_protoswNPROTOSW = &keysw[sizeof(keysw)/sizeof(keysw[0])]
};
DOMAIN_SET(key);
Index: head/sys/netipsec/vipsec.h
===================================================================
--- head/sys/netipsec/vipsec.h (nonexistent)
+++ head/sys/netipsec/vipsec.h (revision 183550)
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2007-2008 University of Zagreb
+ * Copyright (c) 2007-2008 FreeBSD Foundation
+ *
+ * This software was developed by the University of Zagreb and the
+ * FreeBSD Foundation under sponsorship by the Stichting NLnet and the
+ * FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETIPSEC_VIPSEC_H_
+#define _NETIPSEC_VIPSEC_H_
+
+#ifdef VIMAGE
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+
+#include <netipsec/ipsec.h>
+#include <netipsec/esp_var.h>
+#include <netipsec/ah_var.h>
+#include <netipsec/ipcomp_var.h>
+#include <netipsec/ipip_var.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/raw_cb.h>
+
+#include <netipsec/keysock.h>
+
+struct vnet_ipsec {
+ int _ipsec_debug;
+ struct ipsecstat _ipsec4stat;
+ struct secpolicy _ip4_def_policy;
+
+ int _ip4_esp_trans_deflev;
+ int _ip4_esp_net_deflev;
+ int _ip4_ah_trans_deflev;
+ int _ip4_ah_net_deflev;
+ int _ip4_ah_offsetmask;
+ int _ip4_ipsec_dfbit;
+ int _ip4_ipsec_ecn;
+ int _ip4_esp_randpad;
+
+ int _ipsec_replay;
+ int _ipsec_integrity;
+ int _crypto_support;
+
+ u_int32_t _key_debug_level;
+ u_int _key_spi_trycnt;
+ u_int32_t _key_spi_minval;
+ u_int32_t _key_spi_maxval;
+ u_int32_t _policy_id;
+ u_int _key_int_random;
+ u_int _key_larval_lifetime;
+ int _key_blockacq_count;
+ int _key_blockacq_lifetime;
+ int _key_preferred_oldsa;
+ u_int32_t _acq_seq;
+
+ u_int _saorder_state_alive[3];
+ u_int _saorder_state_any[4];
+ int _esp_enable;
+ struct espstat _espstat;
+ int _esp_max_ivlen;
+ int _ipsec_esp_keymin;
+ int _ipsec_esp_auth;
+ int _ipsec_ah_keymin;
+ int _ipip_allow;
+ struct ipipstat _ipipstat;
+
+ struct ipsecstat _ipsec6stat;
+ int _ip6_esp_trans_deflev;
+ int _ip6_esp_net_deflev;
+ int _ip6_ah_trans_deflev;
+ int _ip6_ah_net_deflev;
+ int _ip6_ipsec_ecn;
+ int _ip6_esp_randpad;
+
+ int _ah_enable;
+ int _ah_cleartos;
+ struct ahstat _ahstat;
+
+ int _ipcomp_enable;
+ struct ipcompstat _ipcompstat;
+
+ struct pfkeystat _pfkeystat;
+ struct key_cb _key_cb;
+ struct sockaddr _key_dst;
+ struct sockaddr _key_src;
+
+ LIST_HEAD(, secpolicy) _sptree[IPSEC_DIR_MAX];
+ LIST_HEAD(, secashead) _sahtree;
+ LIST_HEAD(, secreg) _regtree[SADB_SATYPE_MAX + 1];
+ LIST_HEAD(, secacq) _acqtree;
+ LIST_HEAD(, secspacq) _spacqtree;
+};
+#endif
+
+/*
+ * Symbol translation macros
+ */
+#define INIT_VNET_IPSEC(vnet) \
+ INIT_FROM_VNET(vnet, VNET_MOD_IPSEC, struct vnet_ipsec, vnet_ipsec)
+
+#define VNET_IPSEC(sym) VSYM(vnet_ipsec, sym)
+
+#define V_acq_seq VNET_IPSEC(acq_seq)
+#define V_acqtree VNET_IPSEC(acqtree)
+#define V_ah_cleartos VNET_IPSEC(ah_cleartos)
+#define V_ah_enable VNET_IPSEC(ah_enable)
+#define V_ahstat VNET_IPSEC(ahstat)
+#define V_crypto_support VNET_IPSEC(crypto_support)
+#define V_esp_enable VNET_IPSEC(esp_enable)
+#define V_esp_max_ivlen VNET_IPSEC(esp_max_ivlen)
+#define V_espstat VNET_IPSEC(espstat)
+#define V_ip4_ah_net_deflev VNET_IPSEC(ip4_ah_net_deflev)
+#define V_ip4_ah_offsetmask VNET_IPSEC(ip4_ah_offsetmask)
+#define V_ip4_ah_trans_deflev VNET_IPSEC(ip4_ah_trans_deflev)
+#define V_ip4_def_policy VNET_IPSEC(ip4_def_policy)
+#define V_ip4_esp_net_deflev VNET_IPSEC(ip4_esp_net_deflev)
+#define V_ip4_esp_randpad VNET_IPSEC(ip4_esp_randpad)
+#define V_ip4_esp_trans_deflev VNET_IPSEC(ip4_esp_trans_deflev)
+#define V_ip4_ipsec_dfbit VNET_IPSEC(ip4_ipsec_dfbit)
+#define V_ip4_ipsec_ecn VNET_IPSEC(ip4_ipsec_ecn)
+#define V_ip6_ah_net_deflev VNET_IPSEC(ip6_ah_net_deflev)
+#define V_ip6_ah_trans_deflev VNET_IPSEC(ip6_ah_trans_deflev)
+#define V_ip6_esp_net_deflev VNET_IPSEC(ip6_esp_net_deflev)
+#define V_ip6_esp_randpad VNET_IPSEC(ip6_esp_randpad)
+#define V_ip6_esp_trans_deflev VNET_IPSEC(ip6_esp_trans_deflev)
+#define V_ip6_ipsec_ecn VNET_IPSEC(ip6_ipsec_ecn)
+#define V_ipcomp_enable VNET_IPSEC(ipcomp_enable)
+#define V_ipcompstat VNET_IPSEC(ipcompstat)
+#define V_ipip_allow VNET_IPSEC(ipip_allow)
+#define V_ipipstat VNET_IPSEC(ipipstat)
+#define V_ipsec4stat VNET_IPSEC(ipsec4stat)
+#define V_ipsec6stat VNET_IPSEC(ipsec6stat)
+#define V_ipsec_ah_keymin VNET_IPSEC(ipsec_ah_keymin)
+#define V_ipsec_debug VNET_IPSEC(ipsec_debug)
+#define V_ipsec_esp_auth VNET_IPSEC(ipsec_esp_auth)
+#define V_ipsec_esp_keymin VNET_IPSEC(ipsec_esp_keymin)
+#define V_ipsec_integrity VNET_IPSEC(ipsec_integrity)
+#define V_ipsec_replay VNET_IPSEC(ipsec_replay)
+#define V_key_blockacq_count VNET_IPSEC(key_blockacq_count)
+#define V_key_blockacq_lifetime VNET_IPSEC(key_blockacq_lifetime)
+#define V_key_cb VNET_IPSEC(key_cb)
+#define V_key_debug_level VNET_IPSEC(key_debug_level)
+#define V_key_dst VNET_IPSEC(key_dst)
+#define V_key_int_random VNET_IPSEC(key_int_random)
+#define V_key_larval_lifetime VNET_IPSEC(key_larval_lifetime)
+#define V_key_preferred_oldsa VNET_IPSEC(key_preferred_oldsa)
+#define V_key_spi_maxval VNET_IPSEC(key_spi_maxval)
+#define V_key_spi_minval VNET_IPSEC(key_spi_minval)
+#define V_key_spi_trycnt VNET_IPSEC(key_spi_trycnt)
+#define V_key_src VNET_IPSEC(key_src)
+#define V_pfkeystat VNET_IPSEC(pfkeystat)
+#define V_policy_id VNET_IPSEC(policy_id)
+#define V_regtree VNET_IPSEC(regtree)
+#define V_sahtree VNET_IPSEC(sahtree)
+#define V_saorder_state_alive VNET_IPSEC(saorder_state_alive)
+#define V_saorder_state_any VNET_IPSEC(saorder_state_any)
+#define V_spacqtree VNET_IPSEC(spacqtree)
+#define V_sptree VNET_IPSEC(sptree)
+
+#endif /* !_NETIPSEC_VIPSEC_H_ */
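A minimal usage sketch (hypothetical function, not part of this commit) of the
pattern the xform_*.c hunks below adopt: resolve the per-vnet instance once at
function entry, then reach formerly global symbols only through their V_
aliases:

static int
example_vnet_access(void)
{
	INIT_VNET_IPSEC(curvnet);	/* bind this vnet's vnet_ipsec */

	/*
	 * V_ah_enable resolves, via VNET_IPSEC()/VSYM(), to the
	 * _ah_enable field of the current vnet's struct vnet_ipsec;
	 * on kernels built without VIMAGE the V_ macros are expected
	 * to fall back to the plain globals.
	 */
	if (!V_ah_enable)
		return (0);
	V_ahstat.ahs_output++;
	return (1);
}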
Property changes on: head/sys/netipsec/vipsec.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/sys/netipsec/xform_ah.c
===================================================================
--- head/sys/netipsec/xform_ah.c (revision 183549)
+++ head/sys/netipsec/xform_ah.c (revision 183550)
@@ -1,1215 +1,1222 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ah.c,v 1.63 2001/06/26 06:18:58 angelos Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist.
*
* Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 1999 Niklas Hallqvist.
* Copyright (c) 2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip6.h>
#include <net/route.h>
#include <netipsec/ipsec.h>
#include <netipsec/ah.h>
#include <netipsec/ah_var.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
/*
* Return header size in bytes. The old protocol did not support
* the replay counter; the new protocol always includes the counter.
*/
#define HDRSIZE(sav) \
(((sav)->flags & SADB_X_EXT_OLD) ? \
sizeof (struct ah) : sizeof (struct ah) + sizeof (u_int32_t))
/*
* Return authenticator size in bytes. The old protocol is known
* to use a fixed 16-byte authenticator. The new algorithms use a
* 12-byte authenticator.
*/
#define AUTHSIZE(sav) \
((sav->flags & SADB_X_EXT_OLD) ? 16 : AH_HMAC_HASHLEN)
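/*
 * Worked example (a sketch, using this file's usual sizes: struct ah
 * is 8 bytes, struct newah adds a 4-byte ah_seq, AH_HMAC_HASHLEN is
 * 12): a new-format SA gets HDRSIZE = 12 and AUTHSIZE = 12, i.e. 24
 * bytes of AH overhead; an SADB_X_EXT_OLD SA gets HDRSIZE = 8 plus
 * the fixed 16-byte authenticator.
 */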
int ah_enable = 1; /* control flow of packets with AH */
int ah_cleartos = 1; /* clear ip_tos when doing AH calc */
struct ahstat ahstat;
SYSCTL_DECL(_net_inet_ah);
-SYSCTL_INT(_net_inet_ah, OID_AUTO,
- ah_enable, CTLFLAG_RW, &ah_enable, 0, "");
-SYSCTL_INT(_net_inet_ah, OID_AUTO,
- ah_cleartos, CTLFLAG_RW, &ah_cleartos, 0, "");
-SYSCTL_STRUCT(_net_inet_ah, IPSECCTL_STATS,
- stats, CTLFLAG_RD, &ahstat, ahstat, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO,
+ ah_enable, CTLFLAG_RW, ah_enable, 0, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ah, OID_AUTO,
+ ah_cleartos, CTLFLAG_RW, ah_cleartos, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ah, IPSECCTL_STATS,
+ stats, CTLFLAG_RD, ahstat, ahstat, "");
static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */
static int ah_input_cb(struct cryptop*);
static int ah_output_cb(struct cryptop*);
/*
* NB: this is public for use by the PF_KEY support.
*/
struct auth_hash *
ah_algorithm_lookup(int alg)
{
if (alg > SADB_AALG_MAX)
return NULL;
switch (alg) {
case SADB_X_AALG_NULL:
return &auth_hash_null;
case SADB_AALG_MD5HMAC:
return &auth_hash_hmac_md5;
case SADB_AALG_SHA1HMAC:
return &auth_hash_hmac_sha1;
case SADB_X_AALG_RIPEMD160HMAC:
return &auth_hash_hmac_ripemd_160;
case SADB_X_AALG_MD5:
return &auth_hash_key_md5;
case SADB_X_AALG_SHA:
return &auth_hash_key_sha1;
case SADB_X_AALG_SHA2_256:
return &auth_hash_hmac_sha2_256;
case SADB_X_AALG_SHA2_384:
return &auth_hash_hmac_sha2_384;
case SADB_X_AALG_SHA2_512:
return &auth_hash_hmac_sha2_512;
}
return NULL;
}
size_t
ah_hdrsiz(struct secasvar *sav)
{
size_t size;
if (sav != NULL) {
int authsize;
IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null xform"));
/*XXX not right for null algorithm--does it matter??*/
authsize = AUTHSIZE(sav);
size = roundup(authsize, sizeof (u_int32_t)) + HDRSIZE(sav);
} else {
/* default guess */
size = sizeof (struct ah) + sizeof (u_int32_t) + 16;
}
return size;
}
/*
* NB: public for use by esp_init.
*/
int
ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria)
{
+ INIT_VNET_IPSEC(curvnet);
struct auth_hash *thash;
int keylen;
thash = ah_algorithm_lookup(sav->alg_auth);
if (thash == NULL) {
DPRINTF(("%s: unsupported authentication algorithm %u\n",
__func__, sav->alg_auth));
return EINVAL;
}
/*
* Verify the replay state block allocation is consistent with
* the protocol type. We check here so we can make assumptions
* later during protocol processing.
*/
/* NB: replay state is setup elsewhere (sigh) */
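/*
 * The XOR below fires when exactly one of "new-format SA" and
 * "replay state allocated" holds, i.e. it enforces the invariant
 * that replay state is present iff SADB_X_EXT_OLD is clear.
 */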
if (((sav->flags&SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)) {
DPRINTF(("%s: replay state block inconsistency, "
"%s algorithm %s replay state\n", __func__,
(sav->flags & SADB_X_EXT_OLD) ? "old" : "new",
sav->replay == NULL ? "without" : "with"));
return EINVAL;
}
if (sav->key_auth == NULL) {
DPRINTF(("%s: no authentication key for %s algorithm\n",
__func__, thash->name));
return EINVAL;
}
keylen = _KEYLEN(sav->key_auth);
if (keylen != thash->keysize && thash->keysize != 0) {
DPRINTF(("%s: invalid keylength %d, algorithm %s requires "
"keysize %d\n", __func__,
keylen, thash->name, thash->keysize));
return EINVAL;
}
sav->tdb_xform = xsp;
sav->tdb_authalgxform = thash;
/* Initialize crypto session. */
bzero(cria, sizeof (*cria));
cria->cri_alg = sav->tdb_authalgxform->type;
cria->cri_klen = _KEYBITS(sav->key_auth);
cria->cri_key = sav->key_auth->key_data;
cria->cri_mlen = AUTHSIZE(sav);
return 0;
}
/*
* ah_init() is called when an SPI is being set up.
*/
static int
ah_init(struct secasvar *sav, struct xformsw *xsp)
{
+ INIT_VNET_IPSEC(curvnet);
struct cryptoini cria;
int error;
error = ah_init0(sav, xsp, &cria);
return error ? error :
crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support);
}
/*
* Paranoia.
*
* NB: public for use by esp_zeroize (XXX).
*/
int
ah_zeroize(struct secasvar *sav)
{
int err;
if (sav->key_auth)
bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth));
err = crypto_freesession(sav->tdb_cryptoid);
sav->tdb_cryptoid = 0;
sav->tdb_authalgxform = NULL;
sav->tdb_xform = NULL;
return err;
}
/*
* Massage IPv4/IPv6 headers for AH processing.
*/
static int
ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out)
{
+ INIT_VNET_IPSEC(curvnet);
struct mbuf *m = *m0;
unsigned char *ptr;
int off, count;
#ifdef INET
struct ip *ip;
#endif /* INET */
#ifdef INET6
struct ip6_ext *ip6e;
struct ip6_hdr ip6;
int alloc, len, ad;
#endif /* INET6 */
switch (proto) {
#ifdef INET
case AF_INET:
/*
* This is the least painful way of dealing with IPv4 header
* and option processing -- just make sure they're in
* contiguous memory.
*/
*m0 = m = m_pullup(m, skip);
if (m == NULL) {
DPRINTF(("%s: m_pullup failed\n", __func__));
return ENOBUFS;
}
/* Fix the IP header */
ip = mtod(m, struct ip *);
if (V_ah_cleartos)
ip->ip_tos = 0;
ip->ip_ttl = 0;
ip->ip_sum = 0;
/*
* On input, fix ip_len which has been byte-swapped
* at ip_input().
*/
if (!out) {
ip->ip_len = htons(ip->ip_len + skip);
if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK)
ip->ip_off = htons(ip->ip_off & IP_DF);
else
ip->ip_off = 0;
} else {
if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK)
ip->ip_off = htons(ntohs(ip->ip_off) & IP_DF);
else
ip->ip_off = 0;
}
ptr = mtod(m, unsigned char *) + sizeof(struct ip);
/* IPv4 option processing */
for (off = sizeof(struct ip); off < skip;) {
if (ptr[off] == IPOPT_EOL || ptr[off] == IPOPT_NOP ||
off + 1 < skip)
;
else {
DPRINTF(("%s: illegal IPv4 option length for "
"option %d\n", __func__, ptr[off]));
m_freem(m);
return EINVAL;
}
switch (ptr[off]) {
case IPOPT_EOL:
off = skip; /* End the loop. */
break;
case IPOPT_NOP:
off++;
break;
case IPOPT_SECURITY: /* 0x82 */
case 0x85: /* Extended security. */
case 0x86: /* Commercial security. */
case 0x94: /* Router alert */
case 0x95: /* RFC1770 */
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
off += ptr[off + 1];
break;
case IPOPT_LSRR:
case IPOPT_SSRR:
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
/*
* On output, if we have either of the
* source routing options, we should
* swap the destination address of the
* IP header with the last address
* specified in the option, as that is
* what the destination's IP header
* will look like.
*/
if (out)
bcopy(ptr + off + ptr[off + 1] -
sizeof(struct in_addr),
&(ip->ip_dst), sizeof(struct in_addr));
/* Fall through */
default:
/* Sanity check for option length. */
if (ptr[off + 1] < 2) {
DPRINTF(("%s: illegal IPv4 option "
"length for option %d\n",
__func__, ptr[off]));
m_freem(m);
return EINVAL;
}
/* Zeroize all other options. */
count = ptr[off + 1];
bcopy(ipseczeroes, ptr, count);
off += count;
break;
}
/* Sanity check. */
if (off > skip) {
DPRINTF(("%s: malformed IPv4 options header\n",
__func__));
m_freem(m);
return EINVAL;
}
}
break;
#endif /* INET */
#ifdef INET6
case AF_INET6: /* Ugly... */
/* Copy and "cook" the IPv6 header. */
m_copydata(m, 0, sizeof(ip6), (caddr_t) &ip6);
/* We don't do IPv6 Jumbograms. */
if (ip6.ip6_plen == 0) {
DPRINTF(("%s: unsupported IPv6 jumbogram\n", __func__));
m_freem(m);
return EMSGSIZE;
}
ip6.ip6_flow = 0;
ip6.ip6_hlim = 0;
ip6.ip6_vfc &= ~IPV6_VERSION_MASK;
ip6.ip6_vfc |= IPV6_VERSION;
/* Scoped address handling. */
if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_src))
ip6.ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_dst))
ip6.ip6_dst.s6_addr16[1] = 0;
/* Done with IPv6 header. */
m_copyback(m, 0, sizeof(struct ip6_hdr), (caddr_t) &ip6);
/* Let's deal with the remaining headers (if any). */
if (skip - sizeof(struct ip6_hdr) > 0) {
if (m->m_len <= skip) {
ptr = (unsigned char *) malloc(
skip - sizeof(struct ip6_hdr),
M_XDATA, M_NOWAIT);
if (ptr == NULL) {
DPRINTF(("%s: failed to allocate memory"
"for IPv6 headers\n",__func__));
m_freem(m);
return ENOBUFS;
}
/*
* Copy all the protocol headers after
* the IPv6 header.
*/
m_copydata(m, sizeof(struct ip6_hdr),
skip - sizeof(struct ip6_hdr), ptr);
alloc = 1;
} else {
/* No need to allocate memory. */
ptr = mtod(m, unsigned char *) +
sizeof(struct ip6_hdr);
alloc = 0;
}
} else
break;
off = ip6.ip6_nxt & 0xff; /* Next header type. */
for (len = 0; len < skip - sizeof(struct ip6_hdr);)
switch (off) {
case IPPROTO_HOPOPTS:
case IPPROTO_DSTOPTS:
ip6e = (struct ip6_ext *) (ptr + len);
/*
* Process the mutable/immutable
* options -- borrows heavily from the
* KAME code.
*/
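/*
 * NB: per RFC 2460 the ip6e_len field counts 8-octet units not
 * including the first 8 octets, so (ip6e_len + 1) << 3 is the
 * full extension header length in bytes (ip6e_len == 1 -> 16).
 */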
for (count = len + sizeof(struct ip6_ext);
count < len + ((ip6e->ip6e_len + 1) << 3);) {
if (ptr[count] == IP6OPT_PAD1) {
count++;
continue; /* Skip padding. */
}
/* Sanity check. */
if (count > len +
((ip6e->ip6e_len + 1) << 3)) {
m_freem(m);
/* Free, if we allocated. */
if (alloc)
FREE(ptr, M_XDATA);
return EINVAL;
}
ad = ptr[count + 1];
/* If mutable option, zeroize. */
if (ptr[count] & IP6OPT_MUTABLE)
bcopy(ipseczeroes, ptr + count,
ptr[count + 1]);
count += ad;
/* Sanity check. */
if (count >
skip - sizeof(struct ip6_hdr)) {
m_freem(m);
/* Free, if we allocated. */
if (alloc)
FREE(ptr, M_XDATA);
return EINVAL;
}
}
/* Advance. */
len += ((ip6e->ip6e_len + 1) << 3);
off = ip6e->ip6e_nxt;
break;
case IPPROTO_ROUTING:
/*
* Always include routing headers in
* computation.
*/
ip6e = (struct ip6_ext *) (ptr + len);
len += ((ip6e->ip6e_len + 1) << 3);
off = ip6e->ip6e_nxt;
break;
default:
DPRINTF(("%s: unexpected IPv6 header type %d",
__func__, off));
if (alloc)
FREE(ptr, M_XDATA);
m_freem(m);
return EINVAL;
}
/* Copyback and free, if we allocated. */
if (alloc) {
m_copyback(m, sizeof(struct ip6_hdr),
skip - sizeof(struct ip6_hdr), ptr);
free(ptr, M_XDATA);
}
break;
#endif /* INET6 */
}
return 0;
}
/*
* ah_input() gets called to verify that an input packet
* passes authentication.
*/
static int
ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
+ INIT_VNET_IPSEC(curvnet);
struct auth_hash *ahx;
struct tdb_ident *tdbi;
struct tdb_crypto *tc;
struct m_tag *mtag;
struct newah *ah;
int hl, rplen, authsize;
struct cryptodesc *crda;
struct cryptop *crp;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->key_auth != NULL, ("null authentication key"));
IPSEC_ASSERT(sav->tdb_authalgxform != NULL,
("null authentication xform"));
/* Figure out header size. */
rplen = HDRSIZE(sav);
/* XXX don't pullup, just copy header */
IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen);
if (ah == NULL) {
DPRINTF(("ah_input: cannot pullup header\n"));
V_ahstat.ahs_hdrops++; /*XXX*/
m_freem(m);
return ENOBUFS;
}
/* Check replay window, if applicable. */
if (sav->replay && !ipsec_chkreplay(ntohl(ah->ah_seq), sav)) {
V_ahstat.ahs_replay++;
DPRINTF(("%s: packet replay failure: %s\n", __func__,
ipsec_logsastr(sav)));
m_freem(m);
return ENOBUFS;
}
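/*
 * NB: this is only the pre-crypto window check; the window itself is
 * advanced in ah_input_cb() once the authenticator has verified, so
 * unauthenticated packets cannot move it.
 */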
/* Verify AH header length. */
hl = ah->ah_len * sizeof (u_int32_t);
ahx = sav->tdb_authalgxform;
authsize = AUTHSIZE(sav);
if (hl != authsize + rplen - sizeof (struct ah)) {
DPRINTF(("%s: bad authenticator length %u (expecting %lu)"
" for packet in SA %s/%08lx\n", __func__,
hl, (u_long) (authsize + rplen - sizeof (struct ah)),
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
V_ahstat.ahs_badauthl++;
m_freem(m);
return EACCES;
}
V_ahstat.ahs_ibytes += m->m_pkthdr.len - skip - hl;
/* Get crypto descriptors. */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
V_ahstat.ahs_crypto++;
m_freem(m);
return ENOBUFS;
}
crda = crp->crp_desc;
IPSEC_ASSERT(crda != NULL, ("null crypto descriptor"));
crda->crd_skip = 0;
crda->crd_len = m->m_pkthdr.len;
crda->crd_inject = skip + rplen;
/* Authentication operation. */
crda->crd_alg = ahx->type;
crda->crd_klen = _KEYBITS(sav->key_auth);
crda->crd_key = sav->key_auth->key_data;
/* Find out if we've already done crypto. */
for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL);
mtag != NULL;
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) {
tdbi = (struct tdb_ident *) (mtag + 1);
if (tdbi->proto == sav->sah->saidx.proto &&
tdbi->spi == sav->spi &&
!bcmp(&tdbi->dst, &sav->sah->saidx.dst,
sizeof (union sockaddr_union)))
break;
}
/* Allocate IPsec-specific opaque crypto info. */
if (mtag == NULL) {
tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto) +
skip + rplen + authsize, M_XDATA, M_NOWAIT|M_ZERO);
} else {
/* Hash verification has already been done successfully. */
tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto),
M_XDATA, M_NOWAIT|M_ZERO);
}
if (tc == NULL) {
DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
V_ahstat.ahs_crypto++;
crypto_freereq(crp);
m_freem(m);
return ENOBUFS;
}
/* Only save information if crypto processing is needed. */
if (mtag == NULL) {
int error;
/*
* Save the authenticator, the skipped portion of the packet,
* and the AH header.
*/
m_copydata(m, 0, skip + rplen + authsize, (caddr_t)(tc+1));
/* Zeroize the authenticator on the packet. */
m_copyback(m, skip + rplen, authsize, ipseczeroes);
/* "Massage" the packet headers for crypto processing. */
error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
skip, ahx->type, 0);
if (error != 0) {
/* NB: mbuf is free'd by ah_massage_headers */
V_ahstat.ahs_hdrops++;
free(tc, M_XDATA);
crypto_freereq(crp);
return error;
}
}
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ah_input_cb;
crp->crp_sid = sav->tdb_cryptoid;
crp->crp_opaque = (caddr_t) tc;
/* These are passed as-is to the callback. */
tc->tc_spi = sav->spi;
tc->tc_dst = sav->sah->saidx.dst;
tc->tc_proto = sav->sah->saidx.proto;
tc->tc_nxt = ah->ah_nxt;
tc->tc_protoff = protoff;
tc->tc_skip = skip;
tc->tc_ptr = (caddr_t) mtag; /* Save the mtag we've identified. */
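/*
 * If an IPsec-aware NIC already verified the hash (mtag found above),
 * there is nothing to queue: run the callback inline rather than
 * going through crypto_dispatch().
 */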
if (mtag == NULL)
return crypto_dispatch(crp);
else
return ah_input_cb(crp);
}
#ifdef INET6
#define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) do { \
if (saidx->dst.sa.sa_family == AF_INET6) { \
error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); \
} else { \
error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); \
} \
} while (0)
#else
#define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) \
(error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag))
#endif
/*
* AH input callback from the crypto driver.
*/
static int
ah_input_cb(struct cryptop *crp)
{
+ INIT_VNET_IPSEC(curvnet);
int rplen, error, skip, protoff;
unsigned char calc[AH_ALEN_MAX];
struct mbuf *m;
struct cryptodesc *crd;
struct auth_hash *ahx;
struct tdb_crypto *tc;
struct m_tag *mtag;
struct secasvar *sav;
struct secasindex *saidx;
u_int8_t nxt;
caddr_t ptr;
int authsize;
crd = crp->crp_desc;
tc = (struct tdb_crypto *) crp->crp_opaque;
IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!"));
skip = tc->tc_skip;
nxt = tc->tc_nxt;
protoff = tc->tc_protoff;
mtag = (struct m_tag *) tc->tc_ptr;
m = (struct mbuf *) crp->crp_buf;
sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi);
if (sav == NULL) {
V_ahstat.ahs_notdb++;
DPRINTF(("%s: SA expired while in crypto\n", __func__));
error = ENOBUFS; /*XXX*/
goto bad;
}
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
ahx = (struct auth_hash *) sav->tdb_authalgxform;
/* Check for crypto errors. */
if (crp->crp_etype) {
if (sav->tdb_cryptoid != 0)
sav->tdb_cryptoid = crp->crp_sid;
if (crp->crp_etype == EAGAIN) {
error = crypto_dispatch(crp);
return error;
}
V_ahstat.ahs_noxform++;
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
} else {
V_ahstat.ahs_hist[sav->alg_auth]++;
crypto_freereq(crp); /* No longer needed. */
crp = NULL;
}
/* Shouldn't happen... */
if (m == NULL) {
V_ahstat.ahs_crypto++;
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
/* Figure out header size. */
rplen = HDRSIZE(sav);
authsize = AUTHSIZE(sav);
/* Copy authenticator off the packet. */
m_copydata(m, skip + rplen, authsize, calc);
/*
* If we have an mtag, we don't need to verify the authenticator --
* it has been verified by an IPsec-aware NIC.
*/
if (mtag == NULL) {
ptr = (caddr_t) (tc + 1);
/* Verify authenticator. */
if (bcmp(ptr + skip + rplen, calc, authsize)) {
DPRINTF(("%s: authentication hash mismatch for packet "
"in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
V_ahstat.ahs_badauth++;
error = EACCES;
goto bad;
}
/* Fix the Next Protocol field. */
((u_int8_t *) ptr)[protoff] = nxt;
/* Copyback the saved (uncooked) network headers. */
m_copyback(m, 0, skip, ptr);
} else {
/* Fix the Next Protocol field. */
m_copyback(m, protoff, sizeof(u_int8_t), &nxt);
}
free(tc, M_XDATA), tc = NULL; /* No longer needed */
/*
* Header is now authenticated.
*/
m->m_flags |= M_AUTHIPHDR|M_AUTHIPDGM;
/*
* Update replay sequence number, if appropriate.
*/
if (sav->replay) {
u_int32_t seq;
m_copydata(m, skip + offsetof(struct newah, ah_seq),
sizeof (seq), (caddr_t) &seq);
if (ipsec_updatereplay(ntohl(seq), sav)) {
V_ahstat.ahs_replay++;
error = ENOBUFS; /*XXX as above*/
goto bad;
}
}
/*
* Remove the AH header and authenticator from the mbuf.
*/
error = m_striphdr(m, skip, rplen + authsize);
if (error) {
DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi)));
V_ahstat.ahs_hdrops++;
goto bad;
}
IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag);
KEY_FREESAV(&sav);
return error;
bad:
if (sav)
KEY_FREESAV(&sav);
if (m != NULL)
m_freem(m);
if (tc != NULL)
free(tc, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
/*
* AH output routine, called by ipsec[46]_process_packet().
*/
static int
ah_output(
struct mbuf *m,
struct ipsecrequest *isr,
struct mbuf **mp,
int skip,
int protoff)
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav;
struct auth_hash *ahx;
struct cryptodesc *crda;
struct tdb_crypto *tc;
struct mbuf *mi;
struct cryptop *crp;
u_int16_t iplen;
int error, rplen, authsize, maxpacketsize, roff;
u_int8_t prot;
struct newah *ah;
sav = isr->sav;
IPSEC_ASSERT(sav != NULL, ("null SA"));
ahx = sav->tdb_authalgxform;
IPSEC_ASSERT(ahx != NULL, ("null authentication xform"));
V_ahstat.ahs_output++;
/* Figure out header size. */
rplen = HDRSIZE(sav);
/* Check for maximum packet size violations. */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown/unsupported protocol family %u, "
"SA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
V_ahstat.ahs_nopf++;
error = EPFNOSUPPORT;
goto bad;
}
authsize = AUTHSIZE(sav);
if (rplen + authsize + m->m_pkthdr.len > maxpacketsize) {
DPRINTF(("%s: packet in SA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi),
rplen + authsize + m->m_pkthdr.len, maxpacketsize));
V_ahstat.ahs_toobig++;
error = EMSGSIZE;
goto bad;
}
/* Update the counters. */
V_ahstat.ahs_obytes += m->m_pkthdr.len - skip;
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
V_ahstat.ahs_hdrops++;
error = ENOBUFS;
goto bad;
}
/* Inject AH header. */
mi = m_makespace(m, skip, rplen + authsize, &roff);
if (mi == NULL) {
DPRINTF(("%s: failed to inject %u byte AH header for SA "
"%s/%08lx\n", __func__,
rplen + authsize,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
V_ahstat.ahs_hdrops++; /*XXX differs from openbsd */
error = ENOBUFS;
goto bad;
}
/*
* The AH header is guaranteed by m_makespace() to be in
* contiguous memory, at roff bytes offset into the returned mbuf.
*/
ah = (struct newah *)(mtod(mi, caddr_t) + roff);
/* Initialize the AH header. */
m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nxt);
ah->ah_len = (rplen + authsize - sizeof(struct ah)) / sizeof(u_int32_t);
ah->ah_reserve = 0;
ah->ah_spi = sav->spi;
/* Zeroize authenticator. */
m_copyback(m, skip + rplen, authsize, ipseczeroes);
/* Insert packet replay counter, as requested. */
if (sav->replay) {
if (sav->replay->count == ~0 &&
(sav->flags & SADB_X_EXT_CYCSEQ) == 0) {
DPRINTF(("%s: replay counter wrapped for SA %s/%08lx\n",
__func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
V_ahstat.ahs_wrap++;
error = EINVAL;
goto bad;
}
#ifdef REGRESSION
/* Emulate replay attack when ipsec_replay is TRUE. */
if (!V_ipsec_replay)
#endif
sav->replay->count++;
ah->ah_seq = htonl(sav->replay->count);
}
/* Get crypto descriptors. */
crp = crypto_getreq(1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
V_ahstat.ahs_crypto++;
error = ENOBUFS;
goto bad;
}
crda = crp->crp_desc;
crda->crd_skip = 0;
crda->crd_inject = skip + rplen;
crda->crd_len = m->m_pkthdr.len;
/* Authentication operation. */
crda->crd_alg = ahx->type;
crda->crd_key = sav->key_auth->key_data;
crda->crd_klen = _KEYBITS(sav->key_auth);
/* Allocate IPsec-specific opaque crypto info. */
tc = (struct tdb_crypto *) malloc(
sizeof(struct tdb_crypto) + skip, M_XDATA, M_NOWAIT|M_ZERO);
if (tc == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
V_ahstat.ahs_crypto++;
error = ENOBUFS;
goto bad;
}
/* Save the skipped portion of the packet. */
m_copydata(m, 0, skip, (caddr_t) (tc + 1));
/*
* Fix IP header length on the header used for
* authentication. We don't need to fix the original
* header length as it will be fixed by our caller.
*/
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
bcopy(((caddr_t)(tc + 1)) +
offsetof(struct ip, ip_len),
(caddr_t) &iplen, sizeof(u_int16_t));
iplen = htons(ntohs(iplen) + rplen + authsize);
m_copyback(m, offsetof(struct ip, ip_len),
sizeof(u_int16_t), (caddr_t) &iplen);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
bcopy(((caddr_t)(tc + 1)) +
offsetof(struct ip6_hdr, ip6_plen),
(caddr_t) &iplen, sizeof(u_int16_t));
iplen = htons(ntohs(iplen) + rplen + authsize);
m_copyback(m, offsetof(struct ip6_hdr, ip6_plen),
sizeof(u_int16_t), (caddr_t) &iplen);
break;
#endif /* INET6 */
}
/* Fix the Next Header field in saved header. */
((u_int8_t *) (tc + 1))[protoff] = IPPROTO_AH;
/* Update the Next Protocol field in the IP header. */
prot = IPPROTO_AH;
m_copyback(m, protoff, sizeof(u_int8_t), (caddr_t) &prot);
/* "Massage" the packet headers for crypto processing. */
error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family,
skip, ahx->type, 1);
if (error != 0) {
m = NULL; /* mbuf was free'd by ah_massage_headers. */
free(tc, M_XDATA);
crypto_freereq(crp);
goto bad;
}
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ah_output_cb;
crp->crp_sid = sav->tdb_cryptoid;
crp->crp_opaque = (caddr_t) tc;
/* These are passed as-is to the callback. */
tc->tc_isr = isr;
tc->tc_spi = sav->spi;
tc->tc_dst = sav->sah->saidx.dst;
tc->tc_proto = sav->sah->saidx.proto;
tc->tc_skip = skip;
tc->tc_protoff = protoff;
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
return (error);
}
/*
* AH output callback from the crypto driver.
*/
static int
ah_output_cb(struct cryptop *crp)
{
+ INIT_VNET_IPSEC(curvnet);
int skip, protoff, error;
struct tdb_crypto *tc;
struct ipsecrequest *isr;
struct secasvar *sav;
struct mbuf *m;
caddr_t ptr;
int err;
tc = (struct tdb_crypto *) crp->crp_opaque;
IPSEC_ASSERT(tc != NULL, ("null opaque data area!"));
skip = tc->tc_skip;
protoff = tc->tc_protoff;
ptr = (caddr_t) (tc + 1);
m = (struct mbuf *) crp->crp_buf;
isr = tc->tc_isr;
IPSECREQUEST_LOCK(isr);
sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi);
if (sav == NULL) {
V_ahstat.ahs_notdb++;
DPRINTF(("%s: SA expired while in crypto\n", __func__));
error = ENOBUFS; /*XXX*/
goto bad;
}
IPSEC_ASSERT(isr->sav == sav, ("SA changed\n"));
/* Check for crypto errors. */
if (crp->crp_etype) {
if (sav->tdb_cryptoid != 0)
sav->tdb_cryptoid = crp->crp_sid;
if (crp->crp_etype == EAGAIN) {
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
error = crypto_dispatch(crp);
return error;
}
V_ahstat.ahs_noxform++;
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
V_ahstat.ahs_crypto++;
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
V_ahstat.ahs_hist[sav->alg_auth]++;
/*
* Copy original headers (with the new protocol number) back
* in place.
*/
m_copyback(m, 0, skip, ptr);
/* No longer needed. */
free(tc, M_XDATA);
crypto_freereq(crp);
#ifdef REGRESSION
/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
if (V_ipsec_integrity) {
int alen;
/*
* Corrupt HMAC if we want to test integrity verification of
* the other side.
*/
alen = AUTHSIZE(sav);
m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes);
}
#endif
/* NB: m is reclaimed by ipsec_process_done. */
err = ipsec_process_done(m, isr);
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
return err;
bad:
if (sav)
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
if (m)
m_freem(m);
free(tc, M_XDATA);
crypto_freereq(crp);
return error;
}
static struct xformsw ah_xformsw = {
XF_AH, XFT_AUTH, "IPsec AH",
ah_init, ah_zeroize, ah_input, ah_output,
};
static void
ah_attach(void)
{
xform_register(&ah_xformsw);
}
SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL);
Index: head/sys/netipsec/xform_esp.c
===================================================================
--- head/sys/netipsec/xform_esp.c (revision 183549)
+++ head/sys/netipsec/xform_esp.c (revision 183550)
@@ -1,1003 +1,1009 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_esp.c,v 1.69 2001/06/26 06:18:59 angelos Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis.
*
* Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 2001 Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/random.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip6.h>
#include <net/route.h>
#include <netipsec/ipsec.h>
#include <netipsec/ah.h>
#include <netipsec/ah_var.h>
#include <netipsec/esp.h>
#include <netipsec/esp_var.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/xform.h>
int esp_enable = 1;
struct espstat espstat;
SYSCTL_DECL(_net_inet_esp);
-SYSCTL_INT(_net_inet_esp, OID_AUTO,
- esp_enable, CTLFLAG_RW, &esp_enable, 0, "");
-SYSCTL_STRUCT(_net_inet_esp, IPSECCTL_STATS,
- stats, CTLFLAG_RD, &espstat, espstat, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_esp, OID_AUTO,
+ esp_enable, CTLFLAG_RW, esp_enable, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_esp, IPSECCTL_STATS,
+ stats, CTLFLAG_RD, espstat, espstat, "");
static int esp_max_ivlen; /* max iv length over all algorithms */
static int esp_input_cb(struct cryptop *op);
static int esp_output_cb(struct cryptop *crp);
/*
* NB: this is public for use by the PF_KEY support.
* NB: if you add support here, be sure to add code to esp_attach below!
*/
struct enc_xform *
esp_algorithm_lookup(int alg)
{
if (alg >= ESP_ALG_MAX)
return NULL;
switch (alg) {
case SADB_EALG_DESCBC:
return &enc_xform_des;
case SADB_EALG_3DESCBC:
return &enc_xform_3des;
case SADB_X_EALG_AES:
return &enc_xform_rijndael128;
case SADB_X_EALG_BLOWFISHCBC:
return &enc_xform_blf;
case SADB_X_EALG_CAST128CBC:
return &enc_xform_cast5;
case SADB_X_EALG_SKIPJACK:
return &enc_xform_skipjack;
case SADB_EALG_NULL:
return &enc_xform_null;
case SADB_X_EALG_CAMELLIACBC:
return &enc_xform_camellia;
}
return NULL;
}
size_t
esp_hdrsiz(struct secasvar *sav)
{
+ INIT_VNET_IPSEC(curvnet);
size_t size;
if (sav != NULL) {
/*XXX not right for null algorithm--does it matter??*/
IPSEC_ASSERT(sav->tdb_encalgxform != NULL,
("SA with null xform"));
if (sav->flags & SADB_X_EXT_OLD)
size = sizeof (struct esp);
else
size = sizeof (struct newesp);
size += sav->tdb_encalgxform->blocksize + 9;
/*XXX need alg check???*/
if (sav->tdb_authalgxform != NULL && sav->replay)
size += ah_hdrsiz(sav);
} else {
/*
* base header size
* + max iv length for CBC mode
* + max pad length
* + sizeof (pad length field)
* + sizeof (next header field)
* + max icv supported.
*/
size = sizeof (struct newesp) + V_esp_max_ivlen + 9 + 16;
}
return size;
}
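/*
 * Worked example for the sav == NULL guess (a sketch, assuming
 * esp_attach() leaves V_esp_max_ivlen at 16 via the AES/Camellia
 * blocksize and struct newesp is 8 bytes): 8 + 16 + 9 + 16 = 49,
 * where the 9 is up to 7 pad bytes plus the pad-length and
 * next-header bytes of an 8-byte-block cipher.
 */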
/*
* esp_init() is called when an SPI is being set up.
*/
static int
esp_init(struct secasvar *sav, struct xformsw *xsp)
{
+ INIT_VNET_IPSEC(curvnet);
struct enc_xform *txform;
struct cryptoini cria, crie;
int keylen;
int error;
txform = esp_algorithm_lookup(sav->alg_enc);
if (txform == NULL) {
DPRINTF(("%s: unsupported encryption algorithm %d\n",
__func__, sav->alg_enc));
return EINVAL;
}
if (sav->key_enc == NULL) {
DPRINTF(("%s: no encoding key for %s algorithm\n",
__func__, txform->name));
return EINVAL;
}
if ((sav->flags&(SADB_X_EXT_OLD|SADB_X_EXT_IV4B)) == SADB_X_EXT_IV4B) {
DPRINTF(("%s: 4-byte IV not supported with protocol\n",
__func__));
return EINVAL;
}
keylen = _KEYLEN(sav->key_enc);
if (txform->minkey > keylen || keylen > txform->maxkey) {
DPRINTF(("%s: invalid key length %u, must be in the range "
"[%u..%u] for algorithm %s\n", __func__,
keylen, txform->minkey, txform->maxkey,
txform->name));
return EINVAL;
}
/*
* NB: The null xform needs a non-zero blocksize to keep the
* crypto code happy but if we use it to set ivlen then
* the ESP header will be processed incorrectly. The
* compromise is to force it to zero here.
*/
sav->ivlen = (txform == &enc_xform_null ? 0 : txform->blocksize);
sav->iv = (caddr_t) malloc(sav->ivlen, M_XDATA, M_WAITOK);
if (sav->iv == NULL) {
DPRINTF(("%s: no memory for IV\n", __func__));
return EINVAL;
}
key_randomfill(sav->iv, sav->ivlen); /*XXX*/
/*
* Setup AH-related state.
*/
if (sav->alg_auth != 0) {
error = ah_init0(sav, xsp, &cria);
if (error)
return error;
}
/* NB: override anything set in ah_init0 */
sav->tdb_xform = xsp;
sav->tdb_encalgxform = txform;
/* Initialize crypto session. */
bzero(&crie, sizeof (crie));
crie.cri_alg = sav->tdb_encalgxform->type;
crie.cri_klen = _KEYBITS(sav->key_enc);
crie.cri_key = sav->key_enc->key_data;
/* XXX Rounds ? */
if (sav->tdb_authalgxform && sav->tdb_encalgxform) {
/* init both auth & enc */
crie.cri_next = &cria;
error = crypto_newsession(&sav->tdb_cryptoid,
&crie, V_crypto_support);
} else if (sav->tdb_encalgxform) {
error = crypto_newsession(&sav->tdb_cryptoid,
&crie, V_crypto_support);
} else if (sav->tdb_authalgxform) {
error = crypto_newsession(&sav->tdb_cryptoid,
&cria, V_crypto_support);
} else {
/* XXX cannot happen? */
DPRINTF(("%s: no encoding OR authentication xform!\n",
__func__));
error = EINVAL;
}
return error;
}
/*
* Paranoia.
*/
static int
esp_zeroize(struct secasvar *sav)
{
/* NB: ah_zeroize frees the crypto session state */
int error = ah_zeroize(sav);
if (sav->key_enc)
bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc));
if (sav->iv) {
free(sav->iv, M_XDATA);
sav->iv = NULL;
}
sav->tdb_encalgxform = NULL;
sav->tdb_xform = NULL;
return error;
}
/*
* ESP input processing, called (eventually) through the protocol switch.
*/
static int
esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
+ INIT_VNET_IPSEC(curvnet);
struct auth_hash *esph;
struct enc_xform *espx;
struct tdb_ident *tdbi;
struct tdb_crypto *tc;
int plen, alen, hlen;
struct m_tag *mtag;
struct newesp *esp;
struct cryptodesc *crde;
struct cryptop *crp;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("null encoding xform"));
IPSEC_ASSERT((skip&3) == 0 && (m->m_pkthdr.len&3) == 0,
("misaligned packet, skip %u pkt len %u",
skip, m->m_pkthdr.len));
/* XXX don't pullup, just copy header */
IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
/* Determine the ESP header length */
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
/* Authenticator hash size */
alen = esph ? AH_HMAC_HASHLEN : 0;
/*
* Verify payload length is multiple of encryption algorithm
* block size.
*
* NB: This works for the null algorithm because the blocksize
* is 4 and all packets must be 4-byte aligned regardless
* of the algorithm.
*/
plen = m->m_pkthdr.len - (skip + hlen + alen);
if ((plen & (espx->blocksize - 1)) || (plen <= 0)) {
DPRINTF(("%s: payload of %d octets not a multiple of %d octets,"
" SA %s/%08lx\n", __func__,
plen, espx->blocksize,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
V_espstat.esps_badilen++;
m_freem(m);
return EINVAL;
}
/*
* Check sequence number.
*/
if (esph && sav->replay && !ipsec_chkreplay(ntohl(esp->esp_seq), sav)) {
DPRINTF(("%s: packet replay check for %s\n", __func__,
ipsec_logsastr(sav))); /*XXX*/
V_espstat.esps_replay++;
m_freem(m);
return ENOBUFS; /*XXX*/
}
/* Update the counters */
V_espstat.esps_ibytes += m->m_pkthdr.len - (skip + hlen + alen);
/* Find out if we've already done crypto */
for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL);
mtag != NULL;
mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) {
tdbi = (struct tdb_ident *) (mtag + 1);
if (tdbi->proto == sav->sah->saidx.proto &&
tdbi->spi == sav->spi &&
!bcmp(&tdbi->dst, &sav->sah->saidx.dst,
sizeof(union sockaddr_union)))
break;
}
/* Get crypto descriptors */
crp = crypto_getreq(esph && espx ? 2 : 1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
V_espstat.esps_crypto++;
m_freem(m);
return ENOBUFS;
}
/* Get IPsec-specific opaque pointer */
if (esph == NULL || mtag != NULL)
tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto),
M_XDATA, M_NOWAIT|M_ZERO);
else
tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto) + alen,
M_XDATA, M_NOWAIT|M_ZERO);
if (tc == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
V_espstat.esps_crypto++;
m_freem(m);
return ENOBUFS;
}
tc->tc_ptr = (caddr_t) mtag;
if (esph) {
struct cryptodesc *crda = crp->crp_desc;
IPSEC_ASSERT(crda != NULL, ("null ah crypto descriptor"));
/* Authentication descriptor */
crda->crd_skip = skip;
crda->crd_len = m->m_pkthdr.len - (skip + alen);
crda->crd_inject = m->m_pkthdr.len - alen;
crda->crd_alg = esph->type;
crda->crd_key = sav->key_auth->key_data;
crda->crd_klen = _KEYBITS(sav->key_auth);
/* Copy the authenticator */
if (mtag == NULL)
m_copydata(m, m->m_pkthdr.len - alen, alen,
(caddr_t) (tc + 1));
/* Chain authentication request */
crde = crda->crd_next;
} else {
crde = crp->crp_desc;
}
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = esp_input_cb;
crp->crp_sid = sav->tdb_cryptoid;
crp->crp_opaque = (caddr_t) tc;
/* These are passed as-is to the callback */
tc->tc_spi = sav->spi;
tc->tc_dst = sav->sah->saidx.dst;
tc->tc_proto = sav->sah->saidx.proto;
tc->tc_protoff = protoff;
tc->tc_skip = skip;
/* Decryption descriptor */
if (espx) {
IPSEC_ASSERT(crde != NULL, ("null esp crypto descriptor"));
crde->crd_skip = skip + hlen;
crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
crde->crd_inject = skip + hlen - sav->ivlen;
crde->crd_alg = espx->type;
crde->crd_key = sav->key_enc->key_data;
crde->crd_klen = _KEYBITS(sav->key_enc);
/* XXX Rounds ? */
}
if (mtag == NULL)
return crypto_dispatch(crp);
else
return esp_input_cb(crp);
}
#ifdef INET6
#define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) do { \
if (saidx->dst.sa.sa_family == AF_INET6) { \
error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); \
} else { \
error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); \
} \
} while (0)
#else
#define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) \
(error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag))
#endif
/*
* ESP input callback from the crypto driver.
*/
static int
esp_input_cb(struct cryptop *crp)
{
+ INIT_VNET_IPSEC(curvnet);
u_int8_t lastthree[3], aalg[AH_HMAC_HASHLEN];
int hlen, skip, protoff, error;
struct mbuf *m;
struct cryptodesc *crd;
struct auth_hash *esph;
struct enc_xform *espx;
struct tdb_crypto *tc;
struct m_tag *mtag;
struct secasvar *sav;
struct secasindex *saidx;
caddr_t ptr;
crd = crp->crp_desc;
IPSEC_ASSERT(crd != NULL, ("null crypto descriptor!"));
tc = (struct tdb_crypto *) crp->crp_opaque;
IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!"));
skip = tc->tc_skip;
protoff = tc->tc_protoff;
mtag = (struct m_tag *) tc->tc_ptr;
m = (struct mbuf *) crp->crp_buf;
sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi);
if (sav == NULL) {
V_espstat.esps_notdb++;
DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n",
__func__, ipsec_address(&tc->tc_dst),
(u_long) ntohl(tc->tc_spi), tc->tc_proto));
error = ENOBUFS; /*XXX*/
goto bad;
}
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
/* Check for crypto errors */
if (crp->crp_etype) {
/* Reset the session ID */
if (sav->tdb_cryptoid != 0)
sav->tdb_cryptoid = crp->crp_sid;
if (crp->crp_etype == EAGAIN) {
KEY_FREESAV(&sav);
error = crypto_dispatch(crp);
return error;
}
V_espstat.esps_noxform++;
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
V_espstat.esps_crypto++;
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
V_espstat.esps_hist[sav->alg_enc]++;
/* If authentication was performed, check now. */
if (esph != NULL) {
/*
* If we have a tag, it means an IPsec-aware NIC did
* the verification for us. Otherwise we need to
* check the authentication calculation.
*/
V_ahstat.ahs_hist[sav->alg_auth]++;
if (mtag == NULL) {
/* Copy the authenticator from the packet */
m_copydata(m, m->m_pkthdr.len - AH_HMAC_HASHLEN,
AH_HMAC_HASHLEN, aalg);
ptr = (caddr_t) (tc + 1);
/* Verify authenticator */
if (bcmp(ptr, aalg, AH_HMAC_HASHLEN) != 0) {
DPRINTF(("%s: "
"authentication hash mismatch for packet in SA %s/%08lx\n",
__func__,
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
V_espstat.esps_badauth++;
error = EACCES;
goto bad;
}
}
/* Remove trailing authenticator */
m_adj(m, -AH_HMAC_HASHLEN);
}
/* Release the crypto descriptors */
free(tc, M_XDATA), tc = NULL;
crypto_freereq(crp), crp = NULL;
/*
* Packet is now decrypted.
*/
m->m_flags |= M_DECRYPTED;
/*
* Update replay sequence number, if appropriate.
*/
if (sav->replay) {
u_int32_t seq;
m_copydata(m, skip + offsetof(struct newesp, esp_seq),
sizeof (seq), (caddr_t) &seq);
if (ipsec_updatereplay(ntohl(seq), sav)) {
DPRINTF(("%s: packet replay check for %s\n", __func__,
ipsec_logsastr(sav)));
V_espstat.esps_replay++;
error = ENOBUFS;
goto bad;
}
}
/* Determine the ESP header length */
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
/* Remove the ESP header and IV from the mbuf. */
error = m_striphdr(m, skip, hlen);
if (error) {
V_espstat.esps_hdrops++;
DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
goto bad;
}
/* Save the last three bytes of decrypted data */
m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree);
/* Verify pad length */
if (lastthree[1] + 2 > m->m_pkthdr.len - skip) {
V_espstat.esps_badilen++;
DPRINTF(("%s: invalid padding length %d for %u byte packet "
"in SA %s/%08lx\n", __func__,
lastthree[1], m->m_pkthdr.len - skip,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
error = EINVAL;
goto bad;
}
/* Verify correct decryption by checking the last padding bytes */
if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) {
if (lastthree[1] != lastthree[0] && lastthree[1] != 0) {
V_espstat.esps_badenc++;
DPRINTF(("%s: decryption failed for packet in "
"SA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
error = EINVAL;
goto bad;
}
}
/* Trim the mbuf chain to remove trailing authenticator and padding */
m_adj(m, -(lastthree[1] + 2));
/* Restore the Next Protocol field */
m_copyback(m, protoff, sizeof (u_int8_t), lastthree + 2);
IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag);
KEY_FREESAV(&sav);
return error;
bad:
if (sav)
KEY_FREESAV(&sav);
if (m != NULL)
m_freem(m);
if (tc != NULL)
free(tc, M_XDATA);
if (crp != NULL)
crypto_freereq(crp);
return error;
}
/*
* ESP output routine, called by ipsec[46]_process_packet().
*/
static int
esp_output(
struct mbuf *m,
struct ipsecrequest *isr,
struct mbuf **mp,
int skip,
int protoff
)
{
+ INIT_VNET_IPSEC(curvnet);
struct enc_xform *espx;
struct auth_hash *esph;
int hlen, rlen, plen, padding, blks, alen, i, roff;
struct mbuf *mo = (struct mbuf *) NULL;
struct tdb_crypto *tc;
struct secasvar *sav;
struct secasindex *saidx;
unsigned char *pad;
u_int8_t prot;
int error, maxpacketsize;
struct cryptodesc *crde = NULL, *crda = NULL;
struct cryptop *crp;
sav = isr->sav;
IPSEC_ASSERT(sav != NULL, ("null SA"));
esph = sav->tdb_authalgxform;
espx = sav->tdb_encalgxform;
IPSEC_ASSERT(espx != NULL, ("null encoding xform"));
if (sav->flags & SADB_X_EXT_OLD)
hlen = sizeof (struct esp) + sav->ivlen;
else
hlen = sizeof (struct newesp) + sav->ivlen;
rlen = m->m_pkthdr.len - skip; /* Raw payload length. */
/*
* NB: The null encoding transform has a blocksize of 4
* so that headers are properly aligned.
*/
blks = espx->blocksize; /* IV blocksize */
/* XXX clamp padding length a la KAME??? */
padding = ((blks - ((rlen + 2) % blks)) % blks) + 2;
plen = rlen + padding; /* Padded payload length. */
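/*
 * Worked example: rlen = 100, blks = 8 gives
 * padding = ((8 - (102 % 8)) % 8) + 2 = 4 and plen = 104, a multiple
 * of 8 once the two trailer bytes (pad length, next header) counted
 * inside "padding" are included.
 */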
if (esph)
alen = AH_HMAC_HASHLEN;
else
alen = 0;
V_espstat.esps_output++;
saidx = &sav->sah->saidx;
/* Check for maximum packet size violations. */
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
DPRINTF(("%s: unknown/unsupported protocol "
"family %d, SA %s/%08lx\n", __func__,
saidx->dst.sa.sa_family, ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
V_espstat.esps_nopf++;
error = EPFNOSUPPORT;
goto bad;
}
if (skip + hlen + rlen + padding + alen > maxpacketsize) {
DPRINTF(("%s: packet in SA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi),
skip + hlen + rlen + padding + alen, maxpacketsize));
V_espstat.esps_toobig++;
error = EMSGSIZE;
goto bad;
}
/* Update the counters. */
V_espstat.esps_obytes += m->m_pkthdr.len - skip;
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi)));
V_espstat.esps_hdrops++;
error = ENOBUFS;
goto bad;
}
/* Inject ESP header. */
mo = m_makespace(m, skip, hlen, &roff);
if (mo == NULL) {
DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n",
__func__, hlen, ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
V_espstat.esps_hdrops++; /* XXX diffs from openbsd */
error = ENOBUFS;
goto bad;
}
/* Initialize ESP header. */
bcopy((caddr_t) &sav->spi, mtod(mo, caddr_t) + roff, sizeof(u_int32_t));
if (sav->replay) {
u_int32_t replay;
#ifdef REGRESSION
/* Emulate replay attack when ipsec_replay is TRUE. */
if (!V_ipsec_replay)
#endif
sav->replay->count++;
replay = htonl(sav->replay->count);
bcopy((caddr_t) &replay,
mtod(mo, caddr_t) + roff + sizeof(u_int32_t),
sizeof(u_int32_t));
}
/*
* Add padding -- better to do it ourselves than use the crypto engine,
* although if/when we support compression, we'd have to do that.
*/
pad = (u_char *) m_pad(m, padding + alen);
if (pad == NULL) {
DPRINTF(("%s: m_pad failed for SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi)));
m = NULL; /* NB: free'd by m_pad */
error = ENOBUFS;
goto bad;
}
/*
* Add padding: random, zero, or self-describing.
* XXX catch unexpected setting
*/
switch (sav->flags & SADB_X_EXT_PMASK) {
case SADB_X_EXT_PRAND:
(void) read_random(pad, padding - 2);
break;
case SADB_X_EXT_PZERO:
bzero(pad, padding - 2);
break;
case SADB_X_EXT_PSEQ:
for (i = 0; i < padding - 2; i++)
pad[i] = i+1;
break;
}
/* Fix padding length and Next Protocol in padding itself. */
pad[padding - 2] = padding - 2;
m_copydata(m, protoff, sizeof(u_int8_t), pad + padding - 1);
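/*
 * Resulting ESP trailer layout (sketch):
 * [pad bytes (padding - 2)] [pad-length byte] [next-header byte]
 * with the authenticator appended after it when esph != NULL.
 */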
/* Fix Next Protocol in IPv4/IPv6 header. */
prot = IPPROTO_ESP;
m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot);
/* Get crypto descriptors. */
crp = crypto_getreq(esph && espx ? 2 : 1);
if (crp == NULL) {
DPRINTF(("%s: failed to acquire crypto descriptors\n",
__func__));
V_espstat.esps_crypto++;
error = ENOBUFS;
goto bad;
}
if (espx) {
crde = crp->crp_desc;
crda = crde->crd_next;
/* Encryption descriptor. */
crde->crd_skip = skip + hlen;
crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen);
crde->crd_flags = CRD_F_ENCRYPT;
crde->crd_inject = skip + hlen - sav->ivlen;
/* Encryption operation. */
crde->crd_alg = espx->type;
crde->crd_key = sav->key_enc->key_data;
crde->crd_klen = _KEYBITS(sav->key_enc);
/* XXX Rounds ? */
} else
crda = crp->crp_desc;
/* IPsec-specific opaque crypto info. */
tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto),
M_XDATA, M_NOWAIT|M_ZERO);
if (tc == NULL) {
crypto_freereq(crp);
DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
V_espstat.esps_crypto++;
error = ENOBUFS;
goto bad;
}
/* Callback parameters */
tc->tc_isr = isr;
tc->tc_spi = sav->spi;
tc->tc_dst = saidx->dst;
tc->tc_proto = saidx->proto;
/* Crypto operation descriptor. */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = esp_output_cb;
crp->crp_opaque = (caddr_t) tc;
crp->crp_sid = sav->tdb_cryptoid;
if (esph) {
/* Authentication descriptor. */
crda->crd_skip = skip;
crda->crd_len = m->m_pkthdr.len - (skip + alen);
crda->crd_inject = m->m_pkthdr.len - alen;
/* Authentication operation. */
crda->crd_alg = esph->type;
crda->crd_key = sav->key_auth->key_data;
crda->crd_klen = _KEYBITS(sav->key_auth);
}
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
return (error);
}
/*
* ESP output callback from the crypto driver.
*/
static int
esp_output_cb(struct cryptop *crp)
{
+ INIT_VNET_IPSEC(curvnet);
struct tdb_crypto *tc;
struct ipsecrequest *isr;
struct secasvar *sav;
struct mbuf *m;
int err, error;
tc = (struct tdb_crypto *) crp->crp_opaque;
IPSEC_ASSERT(tc != NULL, ("null opaque data area!"));
m = (struct mbuf *) crp->crp_buf;
isr = tc->tc_isr;
IPSECREQUEST_LOCK(isr);
sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi);
if (sav == NULL) {
V_espstat.esps_notdb++;
DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n",
__func__, ipsec_address(&tc->tc_dst),
(u_long) ntohl(tc->tc_spi), tc->tc_proto));
error = ENOBUFS; /*XXX*/
goto bad;
}
IPSEC_ASSERT(isr->sav == sav,
("SA changed was %p now %p\n", isr->sav, sav));
/* Check for crypto errors. */
if (crp->crp_etype) {
/* Reset session ID. */
if (sav->tdb_cryptoid != 0)
sav->tdb_cryptoid = crp->crp_sid;
if (crp->crp_etype == EAGAIN) {
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
error = crypto_dispatch(crp);
return error;
}
V_espstat.esps_noxform++;
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
V_espstat.esps_crypto++;
DPRINTF(("%s: bogus returned buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
V_espstat.esps_hist[sav->alg_enc]++;
if (sav->tdb_authalgxform != NULL)
V_ahstat.ahs_hist[sav->alg_auth]++;
/* Release crypto descriptors. */
free(tc, M_XDATA);
crypto_freereq(crp);
#ifdef REGRESSION
/* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */
if (V_ipsec_integrity) {
static unsigned char ipseczeroes[AH_HMAC_HASHLEN];
struct auth_hash *esph;
/*
* Corrupt HMAC if we want to test integrity verification of
* the other side.
*/
esph = sav->tdb_authalgxform;
if (esph != NULL) {
m_copyback(m, m->m_pkthdr.len - AH_HMAC_HASHLEN,
AH_HMAC_HASHLEN, ipseczeroes);
}
}
#endif
/* NB: m is reclaimed by ipsec_process_done. */
err = ipsec_process_done(m, isr);
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
return err;
bad:
if (sav)
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
if (m)
m_freem(m);
free(tc, M_XDATA);
crypto_freereq(crp);
return error;
}
static struct xformsw esp_xformsw = {
XF_ESP, XFT_CONF|XFT_AUTH, "IPsec ESP",
esp_init, esp_zeroize, esp_input,
esp_output
};
static void
esp_attach(void)
{
#define MAXIV(xform) \
if (xform.blocksize > V_esp_max_ivlen) \
V_esp_max_ivlen = xform.blocksize

V_esp_max_ivlen = 0;
MAXIV(enc_xform_des); /* SADB_EALG_DESCBC */
MAXIV(enc_xform_3des); /* SADB_EALG_3DESCBC */
MAXIV(enc_xform_rijndael128); /* SADB_X_EALG_AES */
MAXIV(enc_xform_blf); /* SADB_X_EALG_BLOWFISHCBC */
MAXIV(enc_xform_cast5); /* SADB_X_EALG_CAST128CBC */
MAXIV(enc_xform_skipjack); /* SADB_X_EALG_SKIPJACK */
MAXIV(enc_xform_null); /* SADB_EALG_NULL */
MAXIV(enc_xform_camellia); /* SADB_X_EALG_CAMELLIACBC */
xform_register(&esp_xformsw);
#undef MAXIV
}
SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL);
Index: head/sys/netipsec/xform_ipcomp.c
===================================================================
--- head/sys/netipsec/xform_ipcomp.c (revision 183549)
+++ head/sys/netipsec/xform_ipcomp.c (revision 183550)
@@ -1,597 +1,602 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */
/*-
* Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* IP payload compression protocol (IPComp), see RFC 2393 */
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <net/route.h>
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/ipcomp.h>
#include <netipsec/ipcomp_var.h>
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <opencrypto/cryptodev.h>
#include <opencrypto/deflate.h>
#include <opencrypto/xform.h>
int ipcomp_enable = 0;
struct ipcompstat ipcompstat;
SYSCTL_DECL(_net_inet_ipcomp);
-SYSCTL_INT(_net_inet_ipcomp, OID_AUTO,
- ipcomp_enable, CTLFLAG_RW, &ipcomp_enable, 0, "");
-SYSCTL_STRUCT(_net_inet_ipcomp, IPSECCTL_STATS,
- stats, CTLFLAG_RD, &ipcompstat, ipcompstat, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipcomp, OID_AUTO,
+ ipcomp_enable, CTLFLAG_RW, ipcomp_enable, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipcomp, IPSECCTL_STATS,
+ stats, CTLFLAG_RD, ipcompstat, ipcompstat, "");
static int ipcomp_input_cb(struct cryptop *crp);
static int ipcomp_output_cb(struct cryptop *crp);
struct comp_algo *
ipcomp_algorithm_lookup(int alg)
{
if (alg >= IPCOMP_ALG_MAX)
return NULL;
switch (alg) {
case SADB_X_CALG_DEFLATE:
return &comp_algo_deflate;
}
return NULL;
}
/*
* ipcomp_init() is called when a CPI is being set up.
*/
static int
ipcomp_init(struct secasvar *sav, struct xformsw *xsp)
{
+ INIT_VNET_IPSEC(curvnet);
struct comp_algo *tcomp;
struct cryptoini cric;
/* NB: algorithm really comes in alg_enc and not alg_comp! */
tcomp = ipcomp_algorithm_lookup(sav->alg_enc);
if (tcomp == NULL) {
DPRINTF(("%s: unsupported compression algorithm %d\n", __func__,
sav->alg_comp));
return EINVAL;
}
sav->alg_comp = sav->alg_enc; /* set for doing histogram */
sav->tdb_xform = xsp;
sav->tdb_compalgxform = tcomp;
/* Initialize crypto session */
bzero(&cric, sizeof (cric));
cric.cri_alg = sav->tdb_compalgxform->type;
return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support);
}
/*
* ipcomp_zeroize() is used when an IPCA is deleted.
*/
static int
ipcomp_zeroize(struct secasvar *sav)
{
int err;
err = crypto_freesession(sav->tdb_cryptoid);
sav->tdb_cryptoid = 0;
return err;
}
/*
* ipcomp_input() gets called to uncompress an input packet
*/
static int
ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
+ INIT_VNET_IPSEC(curvnet);
struct tdb_crypto *tc;
struct cryptodesc *crdc;
struct cryptop *crp;
int hlen = IPCOMP_HLENGTH;
/* Get crypto descriptors */
crp = crypto_getreq(1);
if (crp == NULL) {
m_freem(m);
DPRINTF(("%s: no crypto descriptors\n", __func__));
V_ipcompstat.ipcomps_crypto++;
return ENOBUFS;
}
/* Get IPsec-specific opaque pointer */
tc = (struct tdb_crypto *) malloc(sizeof (*tc), M_XDATA, M_NOWAIT|M_ZERO);
if (tc == NULL) {
m_freem(m);
crypto_freereq(crp);
DPRINTF(("%s: cannot allocate tdb_crypto\n", __func__));
V_ipcompstat.ipcomps_crypto++;
return ENOBUFS;
}
crdc = crp->crp_desc;
crdc->crd_skip = skip + hlen;
crdc->crd_len = m->m_pkthdr.len - (skip + hlen);
crdc->crd_inject = skip;
tc->tc_ptr = 0;
/* Decompression operation */
crdc->crd_alg = sav->tdb_compalgxform->type;
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len - (skip + hlen);
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ipcomp_input_cb;
crp->crp_sid = sav->tdb_cryptoid;
crp->crp_opaque = (caddr_t) tc;
/* These are passed as-is to the callback */
tc->tc_spi = sav->spi;
tc->tc_dst = sav->sah->saidx.dst;
tc->tc_proto = sav->sah->saidx.proto;
tc->tc_protoff = protoff;
tc->tc_skip = skip;
return crypto_dispatch(crp);
}
#ifdef INET6
#define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) do { \
if (saidx->dst.sa.sa_family == AF_INET6) { \
error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); \
} else { \
error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); \
} \
} while (0)
#else
#define IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, mtag) \
(error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag))
#endif
/*
* IPComp input callback from the crypto driver.
*/
static int
ipcomp_input_cb(struct cryptop *crp)
{
+ INIT_VNET_IPSEC(curvnet);
struct cryptodesc *crd;
struct tdb_crypto *tc;
int skip, protoff;
struct mtag *mtag;
struct mbuf *m;
struct secasvar *sav;
struct secasindex *saidx;
int hlen = IPCOMP_HLENGTH, error, clen;
u_int8_t nproto;
caddr_t addr;
crd = crp->crp_desc;
tc = (struct tdb_crypto *) crp->crp_opaque;
IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!"));
skip = tc->tc_skip;
protoff = tc->tc_protoff;
mtag = (struct mtag *) tc->tc_ptr;
m = (struct mbuf *) crp->crp_buf;
sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi);
if (sav == NULL) {
V_ipcompstat.ipcomps_notdb++;
DPRINTF(("%s: SA expired while in crypto\n", __func__));
error = ENOBUFS; /*XXX*/
goto bad;
}
saidx = &sav->sah->saidx;
IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET ||
saidx->dst.sa.sa_family == AF_INET6,
("unexpected protocol family %u", saidx->dst.sa.sa_family));
/* Check for crypto errors */
if (crp->crp_etype) {
/* Reset the session ID */
if (sav->tdb_cryptoid != 0)
sav->tdb_cryptoid = crp->crp_sid;
if (crp->crp_etype == EAGAIN) {
KEY_FREESAV(&sav);
error = crypto_dispatch(crp);
return error;
}
V_ipcompstat.ipcomps_noxform++;
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
V_ipcompstat.ipcomps_crypto++;
DPRINTF(("%s: null mbuf returned from crypto\n", __func__));
error = EINVAL;
goto bad;
}
V_ipcompstat.ipcomps_hist[sav->alg_comp]++;
clen = crp->crp_olen; /* Length of data after processing */
/* Release the crypto descriptors */
free(tc, M_XDATA), tc = NULL;
crypto_freereq(crp), crp = NULL;
/* In case it's not done already, adjust the size of the mbuf chain */
m->m_pkthdr.len = clen + hlen + skip;
if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == 0) {
V_ipcompstat.ipcomps_hdrops++; /*XXX*/
DPRINTF(("%s: m_pullup failed\n", __func__));
error = EINVAL; /*XXX*/
goto bad;
}
/* Keep the next protocol field */
addr = (caddr_t) mtod(m, struct ip *) + skip;
nproto = ((struct ipcomp *) addr)->comp_nxt;
/* Remove the IPCOMP header */
error = m_striphdr(m, skip, hlen);
if (error) {
V_ipcompstat.ipcomps_hdrops++;
DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
goto bad;
}
/* Restore the Next Protocol field */
m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto);
IPSEC_COMMON_INPUT_CB(m, sav, skip, protoff, NULL);
KEY_FREESAV(&sav);
return error;
bad:
if (sav)
KEY_FREESAV(&sav);
if (m)
m_freem(m);
if (tc != NULL)
free(tc, M_XDATA);
if (crp)
crypto_freereq(crp);
return error;
}
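The splice performed by the callback above — save comp_nxt, strip the IPComp header, patch the saved protocol back in at protoff — reduces to a memmove on a flat buffer. A sketch under that simplification (offsets illustrative; the real m_striphdr()/m_copyback() operate on mbuf chains, not arrays):

/*
 * Flat-buffer sketch of the header strip above; offsets illustrative.
 * The real code uses m_striphdr()/m_copyback() on an mbuf chain.
 */
#include <stdio.h>
#include <string.h>

#define IPCOMP_HLENGTH	4

int
main(void)
{
	unsigned char pkt[64] = { 0 };	/* 20-byte "IP hdr" + IPComp hdr */
	int skip = 20, protoff = 9, len = 32;
	unsigned char nproto;

	pkt[protoff] = 108;		/* IPPROTO_IPCOMP */
	pkt[skip] = 6;			/* comp_nxt: inner protocol */

	nproto = pkt[skip];		/* keep next protocol */
	memmove(pkt + skip, pkt + skip + IPCOMP_HLENGTH,
	    len - skip - IPCOMP_HLENGTH);	/* strip the IPComp header */
	len -= IPCOMP_HLENGTH;
	pkt[protoff] = nproto;		/* restore next protocol */

	printf("proto now %d, len %d\n", pkt[protoff], len);	/* 6, 28 */
	return (0);
}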
/*
* IPComp output routine, called by ipsec[46]_process_packet()
*/
static int
ipcomp_output(
struct mbuf *m,
struct ipsecrequest *isr,
struct mbuf **mp,
int skip,
int protoff
)
{
+ INIT_VNET_IPSEC(curvnet);
struct secasvar *sav;
struct comp_algo *ipcompx;
int error, ralen, hlen, maxpacketsize, roff;
u_int8_t prot;
struct cryptodesc *crdc;
struct cryptop *crp;
struct tdb_crypto *tc;
struct mbuf *mo;
struct ipcomp *ipcomp;
sav = isr->sav;
IPSEC_ASSERT(sav != NULL, ("null SA"));
ipcompx = sav->tdb_compalgxform;
IPSEC_ASSERT(ipcompx != NULL, ("null compression xform"));
ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. */
hlen = IPCOMP_HLENGTH;
V_ipcompstat.ipcomps_output++;
/* Check for maximum packet size violations. */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
maxpacketsize = IP_MAXPACKET;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
maxpacketsize = IPV6_MAXPACKET;
break;
#endif /* INET6 */
default:
V_ipcompstat.ipcomps_nopf++;
DPRINTF(("%s: unknown/unsupported protocol family %d, "
"IPCA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
error = EPFNOSUPPORT;
goto bad;
}
if (skip + hlen + ralen > maxpacketsize) {
V_ipcompstat.ipcomps_toobig++;
DPRINTF(("%s: packet in IPCA %s/%08lx got too big "
"(len %u, max len %u)\n", __func__,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi),
skip + hlen + ralen, maxpacketsize));
error = EMSGSIZE;
goto bad;
}
/* Update the counters */
V_ipcompstat.ipcomps_obytes += m->m_pkthdr.len - skip;
m = m_unshare(m, M_NOWAIT);
if (m == NULL) {
V_ipcompstat.ipcomps_hdrops++;
DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
error = ENOBUFS;
goto bad;
}
/* Inject IPCOMP header */
mo = m_makespace(m, skip, hlen, &roff);
if (mo == NULL) {
V_ipcompstat.ipcomps_wrap++;
DPRINTF(("%s: IPCOMP header inject failed for IPCA %s/%08lx\n",
__func__, ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
error = ENOBUFS;
goto bad;
}
ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff);
/* Initialize the IPCOMP header */
/* XXX alignment always correct? */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt;
break;
#endif
}
ipcomp->comp_flags = 0;
ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi));
/* Fix Next Protocol in IPv4/IPv6 header */
prot = IPPROTO_IPCOMP;
m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot);
/* Ok now, we can pass to the crypto processing */
/* Get crypto descriptors */
crp = crypto_getreq(1);
if (crp == NULL) {
V_ipcompstat.ipcomps_crypto++;
DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__));
error = ENOBUFS;
goto bad;
}
crdc = crp->crp_desc;
/* Compression descriptor */
crdc->crd_skip = skip + hlen;
crdc->crd_len = m->m_pkthdr.len - (skip + hlen);
crdc->crd_flags = CRD_F_COMP;
crdc->crd_inject = skip + hlen;
/* Compression operation */
crdc->crd_alg = ipcompx->type;
/* IPsec-specific opaque crypto info */
tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto),
M_XDATA, M_NOWAIT|M_ZERO);
if (tc == NULL) {
V_ipcompstat.ipcomps_crypto++;
DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__));
crypto_freereq(crp);
error = ENOBUFS;
goto bad;
}
tc->tc_isr = isr;
tc->tc_spi = sav->spi;
tc->tc_dst = sav->sah->saidx.dst;
tc->tc_proto = sav->sah->saidx.proto;
tc->tc_skip = skip + hlen;
/* Crypto operation descriptor */
crp->crp_ilen = m->m_pkthdr.len; /* Total input length */
crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC;
crp->crp_buf = (caddr_t) m;
crp->crp_callback = ipcomp_output_cb;
crp->crp_opaque = (caddr_t) tc;
crp->crp_sid = sav->tdb_cryptoid;
return crypto_dispatch(crp);
bad:
if (m)
m_freem(m);
return (error);
}
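The CPI stored by ipcomp_output() above is simply the low 16 bits of the SPI, byte-swapped for the wire. An isolated check with an illustrative SPI value:

/*
 * CPI derivation in isolation: low 16 bits of the SPI, byte-swapped
 * for the wire.  The SPI value is illustrative.
 */
#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
	unsigned int spi = htonl(0x00011234);		/* SPI as stored */
	unsigned short cpi = htons((unsigned short)ntohl(spi));

	printf("cpi on wire = 0x%04x\n", ntohs(cpi));	/* 0x1234 */
	return (0);
}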
/*
* IPComp output callback from the crypto driver.
*/
static int
ipcomp_output_cb(struct cryptop *crp)
{
+ INIT_VNET_IPSEC(curvnet);
struct tdb_crypto *tc;
struct ipsecrequest *isr;
struct secasvar *sav;
struct mbuf *m;
int error, skip, rlen;
tc = (struct tdb_crypto *) crp->crp_opaque;
IPSEC_ASSERT(tc != NULL, ("null opaque data area!"));
m = (struct mbuf *) crp->crp_buf;
skip = tc->tc_skip;
rlen = crp->crp_ilen - skip;
isr = tc->tc_isr;
IPSECREQUEST_LOCK(isr);
sav = KEY_ALLOCSA(&tc->tc_dst, tc->tc_proto, tc->tc_spi);
if (sav == NULL) {
V_ipcompstat.ipcomps_notdb++;
DPRINTF(("%s: SA expired while in crypto\n", __func__));
error = ENOBUFS; /*XXX*/
goto bad;
}
IPSEC_ASSERT(isr->sav == sav, ("SA changed\n"));
/* Check for crypto errors */
if (crp->crp_etype) {
/* Reset session ID */
if (sav->tdb_cryptoid != 0)
sav->tdb_cryptoid = crp->crp_sid;
if (crp->crp_etype == EAGAIN) {
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
error = crypto_dispatch(crp);
return error;
}
V_ipcompstat.ipcomps_noxform++;
DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype));
error = crp->crp_etype;
goto bad;
}
/* Shouldn't happen... */
if (m == NULL) {
V_ipcompstat.ipcomps_crypto++;
DPRINTF(("%s: bogus return buffer from crypto\n", __func__));
error = EINVAL;
goto bad;
}
V_ipcompstat.ipcomps_hist[sav->alg_comp]++;
if (rlen > crp->crp_olen) {
/* Adjust the length in the IP header */
switch (sav->sah->saidx.dst.sa.sa_family) {
#ifdef INET
case AF_INET:
mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len);
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
mtod(m, struct ip6_hdr *)->ip6_plen =
htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
break;
#endif /* INET6 */
default:
V_ipcompstat.ipcomps_nopf++;
DPRINTF(("%s: unknown/unsupported protocol "
"family %d, IPCA %s/%08lx\n", __func__,
sav->sah->saidx.dst.sa.sa_family,
ipsec_address(&sav->sah->saidx.dst),
(u_long) ntohl(sav->spi)));
error = EPFNOSUPPORT;
goto bad;
}
} else {
/* compression was useless, we have lost time */
/* XXX add statistic */
}
/* Release the crypto descriptor */
free(tc, M_XDATA);
crypto_freereq(crp);
/* NB: m is reclaimed by ipsec_process_done. */
error = ipsec_process_done(m, isr);
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
return error;
bad:
if (sav)
KEY_FREESAV(&sav);
IPSECREQUEST_UNLOCK(isr);
if (m)
m_freem(m);
free(tc, M_XDATA);
crypto_freereq(crp);
return error;
}
static struct xformsw ipcomp_xformsw = {
XF_IPCOMP, XFT_COMP, "IPcomp",
ipcomp_init, ipcomp_zeroize, ipcomp_input,
ipcomp_output
};
static void
ipcomp_attach(void)
{
xform_register(&ipcomp_xformsw);
}
SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL);
Index: head/sys/netipsec/xform_ipip.c
===================================================================
--- head/sys/netipsec/xform_ipip.c (revision 183549)
+++ head/sys/netipsec/xform_ipip.c (revision 183550)
@@ -1,702 +1,708 @@
/* $FreeBSD$ */
/* $OpenBSD: ip_ipip.c,v 1.25 2002/06/10 18:04:55 itojun Exp $ */
/*-
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
* Niels Provos (provos@physnet.uni-hamburg.de).
*
* The original version of this code was written by John Ioannidis
* for BSD/OS in Athens, Greece, in November 1995.
*
* Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
* by Angelos D. Keromytis.
*
* Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
* and Niels Provos.
*
* Additional features in 1999 by Angelos D. Keromytis.
*
* Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
* Angelos D. Keromytis and Niels Provos.
* Copyright (c) 2001, Angelos D. Keromytis.
*
* Permission to use, copy, and modify this software with or without fee
* is hereby granted, provided that this entire notice is included in
* all copies of any software which is or includes a copy or
* modification of this software.
* You may use this code under the GNU public license if you so wish. Please
* contribute changes back to the authors under this freer than GPL license
* so that we may further the use of strong encryption without limitations to
* all.
*
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
* MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
* PURPOSE.
*/
/*
* IP-inside-IP processing
*/
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_enc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <net/if.h>
#include <net/pfil.h>
#include <net/route.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_ecn.h>
#include <netinet/ip_var.h>
#include <netinet/ip_encap.h>
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#include <netipsec/ipip_var.h>
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif
#ifdef INET6
#include <netinet/ip6.h>
#include <netipsec/ipsec6.h>
#include <netinet6/ip6_ecn.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6protosw.h>
#endif
#include <netipsec/key.h>
#include <netipsec/key_debug.h>
#include <machine/stdarg.h>
/*
* We can control the acceptance of IP4 packets by altering the sysctl
* net.inet.ipip.allow value. Zero means drop them; any other value
* means accept them.
*/
int ipip_allow = 0;
struct ipipstat ipipstat;
SYSCTL_DECL(_net_inet_ipip);
-SYSCTL_INT(_net_inet_ipip, OID_AUTO,
- ipip_allow, CTLFLAG_RW, &ipip_allow, 0, "");
-SYSCTL_STRUCT(_net_inet_ipip, IPSECCTL_STATS,
- stats, CTLFLAG_RD, &ipipstat, ipipstat, "");
+SYSCTL_V_INT(V_NET, vnet_ipsec, _net_inet_ipip, OID_AUTO,
+ ipip_allow, CTLFLAG_RW, ipip_allow, 0, "");
+SYSCTL_V_STRUCT(V_NET, vnet_ipsec, _net_inet_ipip, IPSECCTL_STATS,
+ stats, CTLFLAG_RD, ipipstat, ipipstat, "");
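At run time this knob is reached through sysctl(8); for example, to start accepting IP-in-IP packets (the value shown is illustrative):

	# sysctl net.inet.ipip.allow=1
	net.inet.ipip.allow: 0 -> 1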
/* XXX IPCOMP */
#define M_IPSEC (M_AUTHIPHDR|M_AUTHIPDGM|M_DECRYPTED)
static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp);
#ifdef INET6
/*
* Really only a wrapper for ipip_input(), for use with IPv6.
*/
int
ip4_input6(struct mbuf **m, int *offp, int proto)
{
#if 0
/* If we do not accept IP-in-IP explicitly, drop. */
if (!V_ipip_allow && ((*m)->m_flags & M_IPSEC) == 0) {
DPRINTF(("%s: dropped due to policy\n", __func__));
V_ipipstat.ipips_pdrops++;
m_freem(*m);
return IPPROTO_DONE;
}
#endif
_ipip_input(*m, *offp, NULL);
return IPPROTO_DONE;
}
#endif /* INET6 */
#ifdef INET
/*
* Really only a wrapper for ipip_input(), for use with IPv4.
*/
void
ip4_input(struct mbuf *m, int off)
{
#if 0
/* If we do not accept IP-in-IP explicitly, drop. */
if (!V_ipip_allow && (m->m_flags & M_IPSEC) == 0) {
DPRINTF(("%s: dropped due to policy\n", __func__));
V_ipipstat.ipips_pdrops++;
m_freem(m);
return;
}
#endif
_ipip_input(m, off, NULL);
}
#endif /* INET */
/*
* ipip_input gets called when we receive an IP{46} encapsulated packet,
* either because we got it at a real interface, or because AH or ESP
* were being used in tunnel mode (in which case the rcvif element will
* contain the address of the encX interface associated with the tunnel).
*/
static void
_ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp)
{
+ INIT_VNET_NET(curvnet);
+ INIT_VNET_IPSEC(curvnet);
register struct sockaddr_in *sin;
register struct ifnet *ifp;
register struct ifaddr *ifa;
struct ip *ipo;
#ifdef INET6
register struct sockaddr_in6 *sin6;
struct ip6_hdr *ip6 = NULL;
u_int8_t itos;
#endif
u_int8_t nxt;
int isr;
u_int8_t otos;
u_int8_t v;
int hlen;
V_ipipstat.ipips_ipackets++;
m_copydata(m, 0, 1, &v);
switch (v >> 4) {
#ifdef INET
case 4:
hlen = sizeof(struct ip);
break;
#endif /* INET */
#ifdef INET6
case 6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
V_ipipstat.ipips_family++;
m_freem(m);
return /* EAFNOSUPPORT */;
}
/* Bring the IP header in the first mbuf, if not there already */
if (m->m_len < hlen) {
if ((m = m_pullup(m, hlen)) == NULL) {
DPRINTF(("%s: m_pullup (1) failed\n", __func__));
V_ipipstat.ipips_hdrops++;
return;
}
}
ipo = mtod(m, struct ip *);
#ifdef MROUTING
if (ipo->ip_v == IPVERSION && ipo->ip_p == IPPROTO_IPV4) {
if (IN_MULTICAST(((struct ip *)((char *) ipo + iphlen))->ip_dst.s_addr)) {
ipip_mroute_input (m, iphlen);
return;
}
}
#endif /* MROUTING */
/* Keep outer ecn field. */
switch (v >> 4) {
#ifdef INET
case 4:
otos = ipo->ip_tos;
break;
#endif /* INET */
#ifdef INET6
case 6:
otos = (ntohl(mtod(m, struct ip6_hdr *)->ip6_flow) >> 20) & 0xff;
break;
#endif
default:
panic("ipip_input: unknown ip version %u (outer)", v>>4);
}
/* Remove outer IP header */
m_adj(m, iphlen);
/* Sanity check */
if (m->m_pkthdr.len < sizeof(struct ip)) {
V_ipipstat.ipips_hdrops++;
m_freem(m);
return;
}
m_copydata(m, 0, 1, &v);
switch (v >> 4) {
#ifdef INET
case 4:
hlen = sizeof(struct ip);
break;
#endif /* INET */
#ifdef INET6
case 6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
V_ipipstat.ipips_family++;
m_freem(m);
return; /* EAFNOSUPPORT */
}
/*
* Bring the inner IP header in the first mbuf, if not there already.
*/
if (m->m_len < hlen) {
if ((m = m_pullup(m, hlen)) == NULL) {
DPRINTF(("%s: m_pullup (2) failed\n", __func__));
V_ipipstat.ipips_hdrops++;
return;
}
}
/*
* RFC 1853 specifies that the inner TTL should not be touched on
* decapsulation. There's no reason this comment should be here, but
* this is as good as any a position.
*/
/* Some sanity checks in the inner IP header */
switch (v >> 4) {
#ifdef INET
case 4:
ipo = mtod(m, struct ip *);
nxt = ipo->ip_p;
ip_ecn_egress(V_ip4_ipsec_ecn, &otos, &ipo->ip_tos);
break;
#endif /* INET */
#ifdef INET6
case 6:
ip6 = (struct ip6_hdr *) ipo;
nxt = ip6->ip6_nxt;
itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
ip_ecn_egress(V_ip6_ipsec_ecn, &otos, &itos);
ip6->ip6_flow &= ~htonl(0xff << 20);
ip6->ip6_flow |= htonl((u_int32_t) itos << 20);
break;
#endif
default:
panic("ipip_input: unknown ip version %u (inner)", v>>4);
}
/* Check for local address spoofing. */
if ((m->m_pkthdr.rcvif == NULL ||
!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) &&
V_ipip_allow != 2) {
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
#ifdef INET
if (ipo) {
if (ifa->ifa_addr->sa_family !=
AF_INET)
continue;
sin = (struct sockaddr_in *) ifa->ifa_addr;
if (sin->sin_addr.s_addr ==
ipo->ip_src.s_addr) {
V_ipipstat.ipips_spoof++;
m_freem(m);
IFNET_RUNLOCK();
return;
}
}
#endif /* INET */
#ifdef INET6
if (ip6) {
if (ifa->ifa_addr->sa_family !=
AF_INET6)
continue;
sin6 = (struct sockaddr_in6 *) ifa->ifa_addr;
if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_src)) {
V_ipipstat.ipips_spoof++;
m_freem(m);
IFNET_RUNLOCK();
return;
}
}
#endif /* INET6 */
}
}
IFNET_RUNLOCK();
}
/* Statistics */
V_ipipstat.ipips_ibytes += m->m_pkthdr.len - iphlen;
#ifdef DEV_ENC
switch (v >> 4) {
#ifdef INET
case 4:
ipsec_bpf(m, NULL, AF_INET, ENC_IN|ENC_AFTER);
break;
#endif
#ifdef INET6
case 6:
ipsec_bpf(m, NULL, AF_INET6, ENC_IN|ENC_AFTER);
break;
#endif
default:
panic("%s: bogus ip version %u", __func__, v>>4);
}
/* pass the mbuf to enc0 for packet filtering */
if (ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_AFTER) != 0)
return;
#endif
/*
* Interface pointer stays the same; if no IPsec processing has
* been done (or will be done), this will point to a normal
* interface. Otherwise, it'll point to an enc interface, which
* will allow a packet filter to distinguish between secure and
* untrusted packets.
*/
switch (v >> 4) {
#ifdef INET
case 4:
isr = NETISR_IP;
break;
#endif
#ifdef INET6
case 6:
isr = NETISR_IPV6;
break;
#endif
default:
panic("%s: bogus ip version %u", __func__, v>>4);
}
if (netisr_queue(isr, m)) { /* (0) on success. */
V_ipipstat.ipips_qfull++;
DPRINTF(("%s: packet dropped because of full queue\n",
__func__));
}
}
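The traffic-class handling inside _ipip_input() above extracts the 8-bit class from bits 20..27 of the host-order ip6_flow word and writes it back the same way after ip_ecn_egress(). In isolation, with an illustrative flow word:

/*
 * The traffic-class handling above, stand-alone: bits 20..27 of the
 * host-order ip6_flow word hold the 8-bit class.  The flow value is
 * illustrative (version 6, traffic class 0xab, flow label 0).
 */
#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
	unsigned int ip6_flow = htonl(0x6ab00000);
	unsigned int itos = (ntohl(ip6_flow) >> 20) & 0xff;

	printf("traffic class = 0x%02x\n", itos);	/* 0xab */

	/* Write it back, as done after ip_ecn_egress(). */
	ip6_flow &= ~htonl(0xff << 20);
	ip6_flow |= htonl(itos << 20);
	return (0);
}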
int
ipip_output(
struct mbuf *m,
struct ipsecrequest *isr,
struct mbuf **mp,
int skip,
int protoff
)
{
+ INIT_VNET_IPSEC(curvnet);
+#ifdef INET
+ INIT_VNET_INET(curvnet);
+#endif /* INET */
struct secasvar *sav;
u_int8_t tp, otos;
struct secasindex *saidx;
int error;
#ifdef INET
u_int8_t itos;
struct ip *ipo;
#endif /* INET */
#ifdef INET6
struct ip6_hdr *ip6, *ip6o;
#endif /* INET6 */
sav = isr->sav;
IPSEC_ASSERT(sav != NULL, ("null SA"));
IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
/* XXX Deal with empty TDB source/destination addresses. */
m_copydata(m, 0, 1, &tp);
tp = (tp >> 4) & 0xff; /* Get the IP version number. */
saidx = &sav->sah->saidx;
switch (saidx->dst.sa.sa_family) {
#ifdef INET
case AF_INET:
if (saidx->src.sa.sa_family != AF_INET ||
saidx->src.sin.sin_addr.s_addr == INADDR_ANY ||
saidx->dst.sin.sin_addr.s_addr == INADDR_ANY) {
DPRINTF(("%s: unspecified tunnel endpoint "
"address in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
V_ipipstat.ipips_unspec++;
error = EINVAL;
goto bad;
}
M_PREPEND(m, sizeof(struct ip), M_DONTWAIT);
if (m == 0) {
DPRINTF(("%s: M_PREPEND failed\n", __func__));
V_ipipstat.ipips_hdrops++;
error = ENOBUFS;
goto bad;
}
ipo = mtod(m, struct ip *);
ipo->ip_v = IPVERSION;
ipo->ip_hl = 5;
ipo->ip_len = htons(m->m_pkthdr.len);
ipo->ip_ttl = V_ip_defttl;
ipo->ip_sum = 0;
ipo->ip_src = saidx->src.sin.sin_addr;
ipo->ip_dst = saidx->dst.sin.sin_addr;
ipo->ip_id = ip_newid();
/* If the inner protocol is IP... */
if (tp == IPVERSION) {
/* Save ECN notification */
m_copydata(m, sizeof(struct ip) +
offsetof(struct ip, ip_tos),
sizeof(u_int8_t), (caddr_t) &itos);
ipo->ip_p = IPPROTO_IPIP;
/*
* We should be keeping tunnel soft-state and
* send back ICMPs if needed.
*/
m_copydata(m, sizeof(struct ip) +
offsetof(struct ip, ip_off),
sizeof(u_int16_t), (caddr_t) &ipo->ip_off);
ipo->ip_off = ntohs(ipo->ip_off);
ipo->ip_off &= ~(IP_DF | IP_MF | IP_OFFMASK);
ipo->ip_off = htons(ipo->ip_off);
}
#ifdef INET6
else if (tp == (IPV6_VERSION >> 4)) {
u_int32_t itos32;
/* Save ECN notification. */
m_copydata(m, sizeof(struct ip) +
offsetof(struct ip6_hdr, ip6_flow),
sizeof(u_int32_t), (caddr_t) &itos32);
itos = ntohl(itos32) >> 20;
ipo->ip_p = IPPROTO_IPV6;
ipo->ip_off = 0;
}
#endif /* INET6 */
else {
goto nofamily;
}
otos = 0;
ip_ecn_ingress(ECN_ALLOWED, &otos, &itos);
ipo->ip_tos = otos;
break;
#endif /* INET */
#ifdef INET6
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&saidx->dst.sin6.sin6_addr) ||
saidx->src.sa.sa_family != AF_INET6 ||
IN6_IS_ADDR_UNSPECIFIED(&saidx->src.sin6.sin6_addr)) {
DPRINTF(("%s: unspecified tunnel endpoint "
"address in SA %s/%08lx\n", __func__,
ipsec_address(&saidx->dst),
(u_long) ntohl(sav->spi)));
V_ipipstat.ipips_unspec++;
error = ENOBUFS;
goto bad;
}
/* scoped address handling */
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src))
ip6->ip6_src.s6_addr16[1] = 0;
if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst))
ip6->ip6_dst.s6_addr16[1] = 0;
M_PREPEND(m, sizeof(struct ip6_hdr), M_DONTWAIT);
if (m == 0) {
DPRINTF(("%s: M_PREPEND failed\n", __func__));
V_ipipstat.ipips_hdrops++;
error = ENOBUFS;
goto bad;
}
/* Initialize IPv6 header */
ip6o = mtod(m, struct ip6_hdr *);
ip6o->ip6_flow = 0;
ip6o->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6o->ip6_vfc |= IPV6_VERSION;
ip6o->ip6_plen = htons(m->m_pkthdr.len);
ip6o->ip6_hlim = V_ip_defttl;
ip6o->ip6_dst = saidx->dst.sin6.sin6_addr;
ip6o->ip6_src = saidx->src.sin6.sin6_addr;
#ifdef INET
if (tp == IPVERSION) {
/* Save ECN notification */
m_copydata(m, sizeof(struct ip6_hdr) +
offsetof(struct ip, ip_tos), sizeof(u_int8_t),
(caddr_t) &itos);
/* This is really IPVERSION. */
ip6o->ip6_nxt = IPPROTO_IPIP;
} else
#endif /* INET */
if (tp == (IPV6_VERSION >> 4)) {
u_int32_t itos32;
/* Save ECN notification. */
m_copydata(m, sizeof(struct ip6_hdr) +
offsetof(struct ip6_hdr, ip6_flow),
sizeof(u_int32_t), (caddr_t) &itos32);
itos = ntohl(itos32) >> 20;
ip6o->ip6_nxt = IPPROTO_IPV6;
} else {
goto nofamily;
}
otos = 0;
ip_ecn_ingress(ECN_ALLOWED, &otos, &itos);
ip6o->ip6_flow |= htonl((u_int32_t) otos << 20);
break;
#endif /* INET6 */
default:
nofamily:
DPRINTF(("%s: unsupported protocol family %u\n", __func__,
saidx->dst.sa.sa_family));
V_ipipstat.ipips_family++;
error = EAFNOSUPPORT; /* XXX diffs from openbsd */
goto bad;
}
V_ipipstat.ipips_opackets++;
*mp = m;
#ifdef INET
if (saidx->dst.sa.sa_family == AF_INET) {
#if 0
if (sav->tdb_xform->xf_type == XF_IP4)
tdb->tdb_cur_bytes +=
m->m_pkthdr.len - sizeof(struct ip);
#endif
V_ipipstat.ipips_obytes += m->m_pkthdr.len - sizeof(struct ip);
}
#endif /* INET */
#ifdef INET6
if (saidx->dst.sa.sa_family == AF_INET6) {
#if 0
if (sav->tdb_xform->xf_type == XF_IP4)
tdb->tdb_cur_bytes +=
m->m_pkthdr.len - sizeof(struct ip6_hdr);
#endif
V_ipipstat.ipips_obytes +=
m->m_pkthdr.len - sizeof(struct ip6_hdr);
}
#endif /* INET6 */
return 0;
bad:
if (m)
m_freem(m);
*mp = NULL;
return (error);
}
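The ip_off manipulation in the AF_INET arm of ipip_output() above swaps the field to host order, clears DF/MF and the fragment offset, and swaps back. A stand-alone check (flag values as in netinet/ip.h; the sample offset is made up):

/*
 * The ip_off dance above, stand-alone: swap to host order, clear
 * DF/MF/offset, swap back.  Flag values as in netinet/ip.h; the
 * sample offset is made up.
 */
#include <arpa/inet.h>
#include <stdio.h>

#define IP_DF		0x4000
#define IP_MF		0x2000
#define IP_OFFMASK	0x1fff

int
main(void)
{
	unsigned short ip_off = htons(IP_DF | 0x0123);	/* DF + offset */

	ip_off = ntohs(ip_off);
	ip_off &= ~(IP_DF | IP_MF | IP_OFFMASK);
	ip_off = htons(ip_off);
	printf("ip_off = 0x%04x\n", ntohs(ip_off));	/* 0x0000 */
	return (0);
}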
#ifdef IPSEC
static int
ipe4_init(struct secasvar *sav, struct xformsw *xsp)
{
sav->tdb_xform = xsp;
return 0;
}
static int
ipe4_zeroize(struct secasvar *sav)
{
sav->tdb_xform = NULL;
return 0;
}
static int
ipe4_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff)
{
/* This is a rather serious mistake, so no conditional printing. */
printf("%s: should never be called\n", __func__);
if (m)
m_freem(m);
return EOPNOTSUPP;
}
static struct xformsw ipe4_xformsw = {
XF_IP4, 0, "IPv4 Simple Encapsulation",
ipe4_init, ipe4_zeroize, ipe4_input, ipip_output,
};
extern struct domain inetdomain;
static struct protosw ipe4_protosw =
{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
ip4_input,
0, 0, rip_ctloutput,
0,
0, 0, 0, 0,
&rip_usrreqs
};
#ifdef INET6
static struct ip6protosw ipe6_protosw =
{ SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
ip4_input6,
0, 0, rip_ctloutput,
0,
0, 0, 0, 0,
&rip_usrreqs
};
#endif
/*
* Check the encapsulated packet to see if we want it
*/
static int
ipe4_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
/*
* Only take packets coming from IPSEC tunnels; the rest
* must be handled by the gif tunnel code. Note that we
* also return a minimum priority when we want the packet
* so any explicit gif tunnels take precedence.
*/
return ((m->m_flags & M_IPSEC) != 0 ? 1 : 0);
}
static void
ipe4_attach(void)
{
xform_register(&ipe4_xformsw);
/* attach to encapsulation framework */
/* XXX save return cookie for detach on module remove */
(void) encap_attach_func(AF_INET, -1,
ipe4_encapcheck, &ipe4_protosw, NULL);
#ifdef INET6
(void) encap_attach_func(AF_INET6, -1,
ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL);
#endif
}
SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL);
#endif /* IPSEC */
Index: head/sys/nfsclient/nfs_diskless.c
===================================================================
--- head/sys/nfsclient/nfs_diskless.c (revision 183549)
+++ head/sys/nfsclient/nfs_diskless.c (revision 183550)
@@ -1,338 +1,339 @@
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)autoconf.c 7.1 (Berkeley) 5/9/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bootp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
-
#include <sys/socket.h>
#include <sys/vimage.h>
+
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <rpc/rpcclnt.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsdiskless.h>
static int inaddr_to_sockaddr(char *ev, struct sockaddr_in *sa);
static int hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa);
static int decode_nfshandle(char *ev, u_char *fh);
/*
* Validate/sanity check a rsize/wsize parameter.
*/
static int
checkrwsize(unsigned long v, const char *name)
{
/*
* 32K is used as an upper bound because most servers
* limit block size to satisfy IPv4's limit of
* 64K/reassembled packet. The lower bound is pretty
* much arbitrary.
*/
if (!(4 <= v && v <= 32*1024)) {
printf("nfs_parse_options: invalid %s %lu ignored\n", name, v);
return 0;
} else
return 1;
}
/*
* Parse mount options and apply them to the supplied
* nfs_diskless state. Used also by bootp/dhcp support.
*/
void
nfs_parse_options(const char *envopts, struct nfs_args *nd)
{
char *opts, *o, *otmp;
unsigned long v;
opts = strdup(envopts, M_TEMP);
otmp = opts;
while ((o = strsep(&otmp, ":;, ")) != NULL) {
if (*o == '\0')
; /* Skip empty options. */
else if (strcmp(o, "soft") == 0)
nd->flags |= NFSMNT_SOFT;
else if (strcmp(o, "intr") == 0)
nd->flags |= NFSMNT_INT;
else if (strcmp(o, "conn") == 0)
nd->flags |= NFSMNT_NOCONN;
else if (strcmp(o, "nolockd") == 0)
nd->flags |= NFSMNT_NOLOCKD;
else if (strcmp(o, "nfsv2") == 0)
nd->flags &= ~(NFSMNT_NFSV3 | NFSMNT_NFSV4);
else if (strcmp(o, "nfsv3") == 0) {
nd->flags &= ~NFSMNT_NFSV4;
nd->flags |= NFSMNT_NFSV3;
} else if (strcmp(o, "tcp") == 0)
nd->sotype = SOCK_STREAM;
else if (strcmp(o, "udp") == 0)
nd->sotype = SOCK_DGRAM;
else if (strncmp(o, "rsize=", 6) == 0) {
v = strtoul(o+6, NULL, 10);
if (checkrwsize(v, "rsize")) {
nd->rsize = (int) v;
nd->flags |= NFSMNT_RSIZE;
}
} else if (strncmp(o, "wsize=", 6) == 0) {
v = strtoul(o+6, NULL, 10);
if (checkrwsize(v, "wsize")) {
nd->wsize = (int) v;
nd->flags |= NFSMNT_WSIZE;
}
} else
printf("%s: skipping unknown option \"%s\"\n",
__func__, o);
}
free(opts, M_TEMP);
}
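The option string accepted by nfs_parse_options() above is a list of tokens separated by any of ':', ';', ',' or space. The same strsep() walk in isolation, over an illustrative string:

/*
 * The option-string walk above, stand-alone: tokens separated by any
 * of ":;, ".  The option string is illustrative.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char opts[] = "nfsv3:tcp,rsize=16384 wsize=16384";
	char *otmp = opts, *o;

	while ((o = strsep(&otmp, ":;, ")) != NULL)
		if (*o != '\0')
			printf("option: %s\n", o);
	return (0);
}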
/*
* Populate the essential fields in the nfsv3_diskless structure.
*
* The loader is expected to export the following environment variables:
*
* boot.netif.name name of boot interface
* boot.netif.ip IP address on boot interface
* boot.netif.netmask netmask on boot interface
* boot.netif.gateway default gateway (optional)
* boot.netif.hwaddr hardware address of boot interface
* boot.nfsroot.server IP address of root filesystem server
* boot.nfsroot.path path of the root filesystem on server
* boot.nfsroot.nfshandle NFS handle for root filesystem on server
* boot.nfsroot.options NFS options for the root filesystem
*/
void
nfs_setup_diskless(void)
{
+ INIT_VNET_NET(curvnet);
struct nfs_diskless *nd = &nfs_diskless;
struct ifnet *ifp;
struct ifaddr *ifa;
struct sockaddr_dl *sdl, ourdl;
struct sockaddr_in myaddr, netmask;
char *cp;
if (nfs_diskless_valid)
return;
/* set up interface */
if (inaddr_to_sockaddr("boot.netif.ip", &myaddr))
return;
if (inaddr_to_sockaddr("boot.netif.netmask", &netmask)) {
printf("nfs_diskless: no netmask\n");
return;
}
bcopy(&myaddr, &nd->myif.ifra_addr, sizeof(myaddr));
bcopy(&myaddr, &nd->myif.ifra_broadaddr, sizeof(myaddr));
((struct sockaddr_in *) &nd->myif.ifra_broadaddr)->sin_addr.s_addr =
myaddr.sin_addr.s_addr | ~ netmask.sin_addr.s_addr;
bcopy(&netmask, &nd->myif.ifra_mask, sizeof(netmask));
if (hwaddr_to_sockaddr("boot.netif.hwaddr", &ourdl)) {
printf("nfs_diskless: no hardware address\n");
return;
}
ifa = NULL;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == AF_LINK) {
sdl = (struct sockaddr_dl *)ifa->ifa_addr;
if ((sdl->sdl_type == ourdl.sdl_type) &&
(sdl->sdl_alen == ourdl.sdl_alen) &&
!bcmp(LLADDR(sdl),
LLADDR(&ourdl),
sdl->sdl_alen)) {
IFNET_RUNLOCK();
goto match_done;
}
}
}
}
IFNET_RUNLOCK();
printf("nfs_diskless: no interface\n");
return; /* no matching interface */
match_done:
setenv("boot.netif.name", ifp->if_xname);
strlcpy(nd->myif.ifra_name, ifp->if_xname, sizeof(nd->myif.ifra_name));
/* set up gateway */
inaddr_to_sockaddr("boot.netif.gateway", &nd->mygateway);
/* set up root mount */
nd->root_args.rsize = 8192; /* XXX tunable? */
nd->root_args.wsize = 8192;
nd->root_args.sotype = SOCK_STREAM;
nd->root_args.flags = (NFSMNT_NFSV3 | NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_RESVPORT);
if (inaddr_to_sockaddr("boot.nfsroot.server", &nd->root_saddr)) {
printf("nfs_diskless: no server\n");
return;
}
nd->root_saddr.sin_port = htons(NFS_PORT);
if (decode_nfshandle("boot.nfsroot.nfshandle", &nd->root_fh[0]) == 0) {
printf("nfs_diskless: no NFS handle\n");
return;
}
if ((cp = getenv("boot.nfsroot.path")) != NULL) {
strncpy(nd->root_hostnam, cp, MNAMELEN - 1);
freeenv(cp);
}
if ((cp = getenv("boot.nfsroot.options")) != NULL) {
struct nfs_args args;
/* XXX yech, convert between old and current arg format */
args.flags = nd->root_args.flags;
args.sotype = nd->root_args.sotype;
args.rsize = nd->root_args.rsize;
args.wsize = nd->root_args.wsize;
nfs_parse_options(cp, &args);
nd->root_args.flags = args.flags;
nd->root_args.sotype = args.sotype;
nd->root_args.rsize = args.rsize;
nd->root_args.wsize = args.wsize;
freeenv(cp);
}
nfs_diskless_valid = 1;
}
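The loader variables listed in the comment before nfs_setup_diskless() would typically come from the boot environment; an illustrative set (all addresses hypothetical, drawn from the 192.0.2.0/24 documentation range; boot.netif.name is omitted because the function derives it from the hardware address):

	boot.netif.ip="192.0.2.10"
	boot.netif.netmask="255.255.255.0"
	boot.netif.gateway="192.0.2.1"
	boot.netif.hwaddr="00:11:22:33:44:55"
	boot.nfsroot.server="192.0.2.2"
	boot.nfsroot.path="/export/diskless"
	boot.nfsroot.options="nfsv3,tcp"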
static int
inaddr_to_sockaddr(char *ev, struct sockaddr_in *sa)
{
u_int32_t a[4];
char *cp;
int count;
bzero(sa, sizeof(*sa));
sa->sin_len = sizeof(*sa);
sa->sin_family = AF_INET;
if ((cp = getenv(ev)) == NULL)
return (1);
count = sscanf(cp, "%d.%d.%d.%d", &a[0], &a[1], &a[2], &a[3]);
freeenv(cp);
if (count != 4)
return (1);
sa->sin_addr.s_addr =
htonl((a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]);
return (0);
}
static int
hwaddr_to_sockaddr(char *ev, struct sockaddr_dl *sa)
{
char *cp;
u_int32_t a[6];
int count;
bzero(sa, sizeof(*sa));
sa->sdl_len = sizeof(*sa);
sa->sdl_family = AF_LINK;
sa->sdl_type = IFT_ETHER;
sa->sdl_alen = ETHER_ADDR_LEN;
if ((cp = getenv(ev)) == NULL)
return (1);
count = sscanf(cp, "%x:%x:%x:%x:%x:%x",
&a[0], &a[1], &a[2], &a[3], &a[4], &a[5]);
freeenv(cp);
if (count != 6)
return (1);
sa->sdl_data[0] = a[0];
sa->sdl_data[1] = a[1];
sa->sdl_data[2] = a[2];
sa->sdl_data[3] = a[3];
sa->sdl_data[4] = a[4];
sa->sdl_data[5] = a[5];
return (0);
}
static int
decode_nfshandle(char *ev, u_char *fh)
{
u_char *cp, *ep;
int len, val;
ep = cp = getenv(ev);
if (cp == NULL)
return (0);
if ((strlen(cp) < 2) || (*cp != 'X')) {
freeenv(ep);
return (0);
}
len = 0;
cp++;
for (;;) {
if (*cp == 'X') {
freeenv(ep);
return (len);
}
if ((sscanf(cp, "%2x", &val) != 1) || (val > 0xff)) {
freeenv(ep);
return (0);
}
*(fh++) = val;
len++;
cp += 2;
if (len > NFSX_V2FH) {
freeenv(ep);
return (0);
}
}
}
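decode_nfshandle() above expects an 'X', a run of hex byte pairs, and a closing 'X', and returns the decoded length. A stand-alone decoder over an illustrative handle string:

/*
 * Stand-alone decoder for the 'X'-delimited hex handle format that
 * decode_nfshandle() parses; the handle string is illustrative.
 */
#include <stdio.h>

int
main(void)
{
	const char *cp = "X0a1b2cX";
	unsigned char fh[32];
	unsigned int val;
	int len = 0;

	if (*cp++ != 'X')
		return (1);
	while (*cp != 'X') {
		if (sscanf(cp, "%2x", &val) != 1 || len >= (int)sizeof(fh))
			return (1);
		fh[len++] = val;
		cp += 2;
	}
	printf("%d bytes: %02x %02x %02x\n", len, fh[0], fh[1], fh[2]);
	return (0);
}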
#if !defined(BOOTP_NFSROOT)
static void
nfs_rootconf(void)
{
nfs_setup_diskless();
if (nfs_diskless_valid)
rootdevnames[0] = "nfs:";
}
SYSINIT(cpu_rootconf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, nfs_rootconf, NULL);
#endif
Index: head/sys/nfsclient/nfs_vfsops.c
===================================================================
--- head/sys/nfsclient/nfs_vfsops.c (revision 183549)
+++ head/sys/nfsclient/nfs_vfsops.c (revision 183550)
@@ -1,1411 +1,1412 @@
/*-
* Copyright (c) 1989, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)nfs_vfsops.c 8.12 (Berkeley) 5/20/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bootp.h"
#include "opt_nfsroot.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/signalvar.h>
#include <sys/vimage.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <rpc/rpcclnt.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsnode.h>
#include <nfsclient/nfsmount.h>
#include <nfs/xdr_subs.h>
#include <nfsclient/nfsm_subs.h>
#include <nfsclient/nfsdiskless.h>
MALLOC_DEFINE(M_NFSREQ, "nfsclient_req", "NFS request header");
MALLOC_DEFINE(M_NFSBIGFH, "nfsclient_bigfh", "NFS version 3 file handle");
MALLOC_DEFINE(M_NFSDIROFF, "nfsclient_diroff", "NFS directory offset data");
MALLOC_DEFINE(M_NFSHASH, "nfsclient_hash", "NFS hash tables");
MALLOC_DEFINE(M_NFSDIRECTIO, "nfsclient_directio", "NFS Direct IO async write state");
uma_zone_t nfsmount_zone;
struct nfsstats nfsstats;
SYSCTL_NODE(_vfs, OID_AUTO, nfs, CTLFLAG_RW, 0, "NFS filesystem");
SYSCTL_STRUCT(_vfs_nfs, NFS_NFSSTATS, nfsstats, CTLFLAG_RW,
&nfsstats, nfsstats, "S,nfsstats");
static int nfs_ip_paranoia = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW,
&nfs_ip_paranoia, 0, "");
#ifdef NFS_DEBUG
int nfs_debug;
SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, "");
#endif
static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY;
SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY,
downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, "");
/* how long between console messages "nfs server foo not responding" */
static int nfs_tprintf_delay = NFS_TPRINTF_DELAY;
SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY,
downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, "");
static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp,
struct nfs_args *argp, const char *hostname);
static int mountnfs(struct nfs_args *, struct mount *,
struct sockaddr *, char *, struct vnode **,
struct ucred *cred);
static vfs_mount_t nfs_mount;
static vfs_cmount_t nfs_cmount;
static vfs_unmount_t nfs_unmount;
static vfs_root_t nfs_root;
static vfs_statfs_t nfs_statfs;
static vfs_sync_t nfs_sync;
static vfs_sysctl_t nfs_sysctl;
static int fake_wchan;
/*
* nfs vfs operations.
*/
static struct vfsops nfs_vfsops = {
.vfs_init = nfs_init,
.vfs_mount = nfs_mount,
.vfs_cmount = nfs_cmount,
.vfs_root = nfs_root,
.vfs_statfs = nfs_statfs,
.vfs_sync = nfs_sync,
.vfs_uninit = nfs_uninit,
.vfs_unmount = nfs_unmount,
.vfs_sysctl = nfs_sysctl,
};
VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK);
/* So that loader and kldload(2) can find us, wherever we are. */
MODULE_VERSION(nfs, 1);
static struct nfs_rpcops nfs_rpcops = {
nfs_readrpc,
nfs_writerpc,
nfs_writebp,
nfs_readlinkrpc,
nfs_invaldir,
nfs_commit,
};
/*
* This structure must be filled in by a primary bootstrap or bootstrap
* server for a diskless/dataless machine. It is initialized below just
* to ensure that it is allocated to initialized data (.data not .bss).
*/
struct nfs_diskless nfs_diskless = { { { 0 } } };
struct nfsv3_diskless nfsv3_diskless = { { { 0 } } };
int nfs_diskless_valid = 0;
SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD,
&nfs_diskless_valid, 0, "");
SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD,
nfsv3_diskless.root_hostnam, 0, "");
SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD,
&nfsv3_diskless.root_saddr, sizeof nfsv3_diskless.root_saddr,
"%Ssockaddr_in", "");
void nfsargs_ntoh(struct nfs_args *);
static int nfs_mountdiskless(char *,
struct sockaddr_in *, struct nfs_args *,
struct thread *, struct vnode **, struct mount *);
static void nfs_convert_diskless(void);
static void nfs_convert_oargs(struct nfs_args *args,
struct onfs_args *oargs);
int
nfs_iosize(struct nfsmount *nmp)
{
int iosize;
/*
* Calculate the size used for io buffers. Use the larger
* of the two sizes to minimise nfs requests but make sure
* that it is at least one VM page to avoid wasting buffer
* space.
*/
iosize = imax(nmp->nm_rsize, nmp->nm_wsize);
iosize = imax(iosize, PAGE_SIZE);
return (iosize);
}
static void
nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs)
{
args->version = NFS_ARGSVERSION;
args->addr = oargs->addr;
args->addrlen = oargs->addrlen;
args->sotype = oargs->sotype;
args->proto = oargs->proto;
args->fh = oargs->fh;
args->fhsize = oargs->fhsize;
args->flags = oargs->flags;
args->wsize = oargs->wsize;
args->rsize = oargs->rsize;
args->readdirsize = oargs->readdirsize;
args->timeo = oargs->timeo;
args->retrans = oargs->retrans;
args->maxgrouplist = oargs->maxgrouplist;
args->readahead = oargs->readahead;
args->deadthresh = oargs->deadthresh;
args->hostname = oargs->hostname;
}
static void
nfs_convert_diskless(void)
{
bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif,
sizeof(struct ifaliasreq));
bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway,
sizeof(struct sockaddr_in));
nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args);
if (nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) {
nfsv3_diskless.root_fhsize = NFSX_V3FH;
bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V3FH);
} else {
nfsv3_diskless.root_fhsize = NFSX_V2FH;
bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH);
}
bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr,
sizeof(struct sockaddr_in));
bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN);
nfsv3_diskless.root_time = nfs_diskless.root_time;
bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam,
MAXHOSTNAMELEN);
nfs_diskless_valid = 3;
}
/*
* nfs statfs call
*/
static int
nfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
{
struct vnode *vp;
struct nfs_statfs *sfp;
caddr_t bpos, dpos;
struct nfsmount *nmp = VFSTONFS(mp);
int error = 0, v3 = (nmp->nm_flag & NFSMNT_NFSV3), retattr;
struct mbuf *mreq, *mrep, *md, *mb;
struct nfsnode *np;
u_quad_t tquad;
#ifndef nolint
sfp = NULL;
#endif
error = vfs_busy(mp, LK_NOWAIT, NULL);
if (error)
return (error);
error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
if (error) {
vfs_unbusy(mp);
return (error);
}
vp = NFSTOV(np);
mtx_lock(&nmp->nm_mtx);
if (v3 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
mtx_unlock(&nmp->nm_mtx);
(void)nfs_fsinfo(nmp, vp, td->td_ucred, td);
} else
mtx_unlock(&nmp->nm_mtx);
nfsstats.rpccnt[NFSPROC_FSSTAT]++;
mreq = nfsm_reqhead(vp, NFSPROC_FSSTAT, NFSX_FH(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
nfsm_request(vp, NFSPROC_FSSTAT, td, td->td_ucred);
if (v3)
nfsm_postop_attr(vp, retattr);
if (error) {
if (mrep != NULL)
m_freem(mrep);
goto nfsmout;
}
sfp = nfsm_dissect(struct nfs_statfs *, NFSX_STATFS(v3));
mtx_lock(&nmp->nm_mtx);
sbp->f_iosize = nfs_iosize(nmp);
mtx_unlock(&nmp->nm_mtx);
if (v3) {
sbp->f_bsize = NFS_FABLKSIZE;
tquad = fxdr_hyper(&sfp->sf_tbytes);
sbp->f_blocks = tquad / NFS_FABLKSIZE;
tquad = fxdr_hyper(&sfp->sf_fbytes);
sbp->f_bfree = tquad / NFS_FABLKSIZE;
tquad = fxdr_hyper(&sfp->sf_abytes);
sbp->f_bavail = tquad / NFS_FABLKSIZE;
sbp->f_files = (fxdr_unsigned(int32_t,
sfp->sf_tfiles.nfsuquad[1]) & 0x7fffffff);
sbp->f_ffree = (fxdr_unsigned(int32_t,
sfp->sf_ffiles.nfsuquad[1]) & 0x7fffffff);
} else {
sbp->f_bsize = fxdr_unsigned(int32_t, sfp->sf_bsize);
sbp->f_blocks = fxdr_unsigned(int32_t, sfp->sf_blocks);
sbp->f_bfree = fxdr_unsigned(int32_t, sfp->sf_bfree);
sbp->f_bavail = fxdr_unsigned(int32_t, sfp->sf_bavail);
sbp->f_files = 0;
sbp->f_ffree = 0;
}
m_freem(mrep);
nfsmout:
vput(vp);
vfs_unbusy(mp);
return (error);
}
/*
* nfs version 3 fsinfo rpc call
*/
int
nfs_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred,
struct thread *td)
{
struct nfsv3_fsinfo *fsp;
u_int32_t pref, max;
caddr_t bpos, dpos;
int error = 0, retattr;
struct mbuf *mreq, *mrep, *md, *mb;
u_int64_t maxfsize;
nfsstats.rpccnt[NFSPROC_FSINFO]++;
mreq = nfsm_reqhead(vp, NFSPROC_FSINFO, NFSX_FH(1));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, 1);
nfsm_request(vp, NFSPROC_FSINFO, td, cred);
nfsm_postop_attr(vp, retattr);
if (!error) {
fsp = nfsm_dissect(struct nfsv3_fsinfo *, NFSX_V3FSINFO);
pref = fxdr_unsigned(u_int32_t, fsp->fs_wtpref);
mtx_lock(&nmp->nm_mtx);
if (pref < nmp->nm_wsize && pref >= NFS_FABLKSIZE)
nmp->nm_wsize = (pref + NFS_FABLKSIZE - 1) &
~(NFS_FABLKSIZE - 1);
max = fxdr_unsigned(u_int32_t, fsp->fs_wtmax);
if (max < nmp->nm_wsize && max > 0) {
nmp->nm_wsize = max & ~(NFS_FABLKSIZE - 1);
if (nmp->nm_wsize == 0)
nmp->nm_wsize = max;
}
pref = fxdr_unsigned(u_int32_t, fsp->fs_rtpref);
if (pref < nmp->nm_rsize && pref >= NFS_FABLKSIZE)
nmp->nm_rsize = (pref + NFS_FABLKSIZE - 1) &
~(NFS_FABLKSIZE - 1);
max = fxdr_unsigned(u_int32_t, fsp->fs_rtmax);
if (max < nmp->nm_rsize && max > 0) {
nmp->nm_rsize = max & ~(NFS_FABLKSIZE - 1);
if (nmp->nm_rsize == 0)
nmp->nm_rsize = max;
}
pref = fxdr_unsigned(u_int32_t, fsp->fs_dtpref);
if (pref < nmp->nm_readdirsize && pref >= NFS_DIRBLKSIZ)
nmp->nm_readdirsize = (pref + NFS_DIRBLKSIZ - 1) &
~(NFS_DIRBLKSIZ - 1);
if (max < nmp->nm_readdirsize && max > 0) {
nmp->nm_readdirsize = max & ~(NFS_DIRBLKSIZ - 1);
if (nmp->nm_readdirsize == 0)
nmp->nm_readdirsize = max;
}
maxfsize = fxdr_hyper(&fsp->fs_maxfilesize);
if (maxfsize > 0 && maxfsize < nmp->nm_maxfilesize)
nmp->nm_maxfilesize = maxfsize;
nmp->nm_mountp->mnt_stat.f_iosize = nfs_iosize(nmp);
nmp->nm_state |= NFSSTA_GOTFSINFO;
mtx_unlock(&nmp->nm_mtx);
}
m_freem(mrep);
nfsmout:
return (error);
}
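The mask arithmetic used throughout nfs_fsinfo() above rounds a preferred size up, or a hard maximum down, to a multiple of a power-of-two block size. With illustrative numbers (NFS_FABLKSIZE is 512 in the tree):

/*
 * The mask arithmetic used above: round a preferred size up, or a
 * hard maximum down, to a multiple of a power-of-two block size.
 * The sample value is illustrative.
 */
#include <stdio.h>

#define NFS_FABLKSIZE	512

int
main(void)
{
	unsigned int pref = 5000;
	unsigned int up = (pref + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1);
	unsigned int down = pref & ~(NFS_FABLKSIZE - 1);

	printf("pref %u -> up %u, down %u\n", pref, up, down);	/* 5120 4608 */
	return (0);
}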
/*
* Mount a remote root fs via NFS. This depends on the info in the
* nfs_diskless structure that has been filled in properly by some primary
* bootstrap.
* It goes something like this:
* - do enough of "ifconfig" by calling ifioctl() so that the system
* can talk to the server
* - If nfs_diskless.mygateway is filled in, use that address as
* a default gateway.
* - build the rootfs mount point and call mountnfs() to do the rest.
*
* It is assumed to be safe to read, modify, and write the nfsv3_diskless
* structure, as well as other global NFS client variables here, as
* nfs_mountroot() will be called once in the boot before any other NFS
* client activity occurs.
*/
int
nfs_mountroot(struct mount *mp, struct thread *td)
{
+ INIT_VPROCG(TD_TO_VPROCG(td));
struct nfsv3_diskless *nd = &nfsv3_diskless;
struct socket *so;
struct vnode *vp;
struct ifreq ir;
int error, i;
u_long l;
char buf[128];
char *cp;
#if defined(BOOTP_NFSROOT) && defined(BOOTP)
bootpc_init(); /* use bootp to get nfs_diskless filled in */
#elif defined(NFS_ROOT)
nfs_setup_diskless();
#endif
if (nfs_diskless_valid == 0)
return (-1);
if (nfs_diskless_valid == 1)
nfs_convert_diskless();
/*
* XXX splnet, so networks will receive...
*/
splnet();
/*
* Do enough of ifconfig(8) so that the critical net interface can
* talk to the server.
*/
error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0,
td->td_ucred, td);
if (error)
panic("nfs_mountroot: socreate(%04x): %d",
nd->myif.ifra_addr.sa_family, error);
#if 0 /* XXX Bad idea */
/*
* We might not have been told the right interface, so we pass
* over the first ten interfaces of the same kind, until we get
* one of them configured.
*/
for (i = strlen(nd->myif.ifra_name) - 1;
nd->myif.ifra_name[i] >= '0' &&
nd->myif.ifra_name[i] <= '9';
nd->myif.ifra_name[i] ++) {
error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td);
if(!error)
break;
}
#endif
error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td);
if (error)
panic("nfs_mountroot: SIOCAIFADDR: %d", error);
if ((cp = getenv("boot.netif.mtu")) != NULL) {
ir.ifr_mtu = strtol(cp, NULL, 10);
bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ);
freeenv(cp);
error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td);
if (error)
printf("nfs_mountroot: SIOCSIFMTU: %d", error);
}
soclose(so);
/*
* If the gateway field is filled in, set it as the default route.
* Note that pxeboot will set a default route of 0 if the route
* is not set by the DHCP server. Check also for a value of 0
* to avoid panicking inappropriately in that situation.
*/
if (nd->mygateway.sin_len != 0 &&
nd->mygateway.sin_addr.s_addr != 0) {
struct sockaddr_in mask, sin;
bzero((caddr_t)&mask, sizeof(mask));
sin = mask;
sin.sin_family = AF_INET;
sin.sin_len = sizeof(sin);
/* XXX MRT use table 0 for this sort of thing */
error = rtrequest(RTM_ADD, (struct sockaddr *)&sin,
(struct sockaddr *)&nd->mygateway,
(struct sockaddr *)&mask,
RTF_UP | RTF_GATEWAY, NULL);
if (error)
panic("nfs_mountroot: RTM_ADD: %d", error);
}
/*
* Create the rootfs mount point.
*/
nd->root_args.fh = nd->root_fh;
nd->root_args.fhsize = nd->root_fhsize;
l = ntohl(nd->root_saddr.sin_addr.s_addr);
snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s",
(l >> 24) & 0xff, (l >> 16) & 0xff,
(l >> 8) & 0xff, (l >> 0) & 0xff, nd->root_hostnam);
printf("NFS ROOT: %s\n", buf);
nd->root_args.hostname = buf;
if ((error = nfs_mountdiskless(buf,
&nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) {
return (error);
}
/*
* This is not really an nfs issue, but it is much easier to
* set hostname here and then let the "/etc/rc.xxx" files
* mount the right /var based upon its preset value.
*/
mtx_lock(&hostname_mtx);
bcopy(nd->my_hostnam, V_hostname, MAXHOSTNAMELEN);
V_hostname[MAXHOSTNAMELEN - 1] = '\0';
for (i = 0; i < MAXHOSTNAMELEN; i++)
if (V_hostname[i] == '\0')
break;
mtx_unlock(&hostname_mtx);
inittodr(ntohl(nd->root_time));
return (0);
}
/*
* Internal version of mount system call for diskless setup.
*/
static int
nfs_mountdiskless(char *path,
struct sockaddr_in *sin, struct nfs_args *args, struct thread *td,
struct vnode **vpp, struct mount *mp)
{
struct sockaddr *nam;
int error;
nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK);
if ((error = mountnfs(args, mp, nam, path, vpp,
td->td_ucred)) != 0) {
printf("nfs_mountroot: mount %s on /: %d\n", path, error);
return (error);
}
return (0);
}
static void
nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp,
const char *hostname)
{
int s;
int adjsock;
int maxio;
char *p;
s = splnet();
/*
* Set read-only flag if requested; otherwise, clear it if this is
* an update. If this is not an update, then either the read-only
* flag is already clear, or this is a root mount and it was set
* intentionally at some previous point.
*/
if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) {
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_RDONLY;
MNT_IUNLOCK(mp);
} else if (mp->mnt_flag & MNT_UPDATE) {
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_RDONLY;
MNT_IUNLOCK(mp);
}
/*
* Silently clear NFSMNT_NOCONN if it's a TCP mount; it makes
* no sense in that context. Also, set up appropriate retransmit
* and soft timeout behavior.
*/
if (argp->sotype == SOCK_STREAM) {
nmp->nm_flag &= ~NFSMNT_NOCONN;
nmp->nm_flag |= NFSMNT_DUMBTIMR;
nmp->nm_timeo = NFS_MAXTIMEO;
nmp->nm_retry = NFS_RETRANS_TCP;
}
/* Also clear RDIRPLUS if not NFSv3, it crashes some servers */
if ((argp->flags & NFSMNT_NFSV3) == 0)
nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
/* Re-bind if rsrvd port requested and wasn't on one */
adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT)
&& (argp->flags & NFSMNT_RESVPORT);
/* Also re-bind if we're switching to/from a connected UDP socket */
adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) !=
(argp->flags & NFSMNT_NOCONN));
/* Update flags atomically. Don't change the lock bits. */
nmp->nm_flag = argp->flags | nmp->nm_flag;
splx(s);
if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) {
nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10;
if (nmp->nm_timeo < NFS_MINTIMEO)
nmp->nm_timeo = NFS_MINTIMEO;
else if (nmp->nm_timeo > NFS_MAXTIMEO)
nmp->nm_timeo = NFS_MAXTIMEO;
}
if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) {
nmp->nm_retry = argp->retrans;
if (nmp->nm_retry > NFS_MAXREXMIT)
nmp->nm_retry = NFS_MAXREXMIT;
}
if (argp->flags & NFSMNT_NFSV3) {
if (argp->sotype == SOCK_DGRAM)
maxio = NFS_MAXDGRAMDATA;
else
maxio = NFS_MAXDATA;
} else
maxio = NFS_V2MAXDATA;
if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) {
nmp->nm_wsize = argp->wsize;
/* Round down to multiple of blocksize */
nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1);
if (nmp->nm_wsize <= 0)
nmp->nm_wsize = NFS_FABLKSIZE;
}
if (nmp->nm_wsize > maxio)
nmp->nm_wsize = maxio;
if (nmp->nm_wsize > MAXBSIZE)
nmp->nm_wsize = MAXBSIZE;
if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) {
nmp->nm_rsize = argp->rsize;
/* Round down to multiple of blocksize */
nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1);
if (nmp->nm_rsize <= 0)
nmp->nm_rsize = NFS_FABLKSIZE;
}
if (nmp->nm_rsize > maxio)
nmp->nm_rsize = maxio;
if (nmp->nm_rsize > MAXBSIZE)
nmp->nm_rsize = MAXBSIZE;
if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) {
nmp->nm_readdirsize = argp->readdirsize;
}
if (nmp->nm_readdirsize > maxio)
nmp->nm_readdirsize = maxio;
if (nmp->nm_readdirsize > nmp->nm_rsize)
nmp->nm_readdirsize = nmp->nm_rsize;
if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0)
nmp->nm_acregmin = argp->acregmin;
else
nmp->nm_acregmin = NFS_MINATTRTIMO;
if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0)
nmp->nm_acregmax = argp->acregmax;
else
nmp->nm_acregmax = NFS_MAXATTRTIMO;
if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0)
nmp->nm_acdirmin = argp->acdirmin;
else
nmp->nm_acdirmin = NFS_MINDIRATTRTIMO;
if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0)
nmp->nm_acdirmax = argp->acdirmax;
else
nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO;
if (nmp->nm_acdirmin > nmp->nm_acdirmax)
nmp->nm_acdirmin = nmp->nm_acdirmax;
if (nmp->nm_acregmin > nmp->nm_acregmax)
nmp->nm_acregmin = nmp->nm_acregmax;
if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0) {
if (argp->maxgrouplist <= NFS_MAXGRPS)
nmp->nm_numgrps = argp->maxgrouplist;
else
nmp->nm_numgrps = NFS_MAXGRPS;
}
if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) {
if (argp->readahead <= NFS_MAXRAHEAD)
nmp->nm_readahead = argp->readahead;
else
nmp->nm_readahead = NFS_MAXRAHEAD;
}
if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
if (argp->wcommitsize < nmp->nm_wsize)
nmp->nm_wcommitsize = nmp->nm_wsize;
else
nmp->nm_wcommitsize = argp->wcommitsize;
}
if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
if (argp->deadthresh <= NFS_MAXDEADTHRESH)
nmp->nm_deadthresh = argp->deadthresh;
else
nmp->nm_deadthresh = NFS_MAXDEADTHRESH;
}
adjsock |= ((nmp->nm_sotype != argp->sotype) ||
(nmp->nm_soproto != argp->proto));
nmp->nm_sotype = argp->sotype;
nmp->nm_soproto = argp->proto;
if (nmp->nm_so && adjsock) {
nfs_safedisconnect(nmp);
if (nmp->nm_sotype == SOCK_DGRAM)
while (nfs_connect(nmp, NULL)) {
printf("nfs_args: retrying connect\n");
(void) tsleep(&fake_wchan, PSOCK, "nfscon", hz);
}
}
if (hostname) {
strlcpy(nmp->nm_hostname, hostname,
sizeof(nmp->nm_hostname));
p = strchr(nmp->nm_hostname, ':');
if (p)
*p = '\0';
}
}
static const char *nfs_opts[] = { "from", "nfs_args",
"noatime", "noexec", "suiddir", "nosuid", "nosymfollow", "union",
"noclusterr", "noclusterw", "multilabel", "acls", "force", "update",
"async", "dumbtimer", "noconn", "nolockd", "intr", "rdirplus", "resvport",
"readdirsize", "soft", "hard", "mntudp", "tcp", "udp", "wsize", "rsize",
"retrans", "acregmin", "acregmax", "acdirmin", "acdirmax",
"deadthresh", "hostname", "timeout", "addr", "fh", "nfsv3",
"maxgroups",
NULL };
/*
* VFS Operations.
*
* mount system call
* It seems a bit dumb to copyinstr() the host and path here and then
* bcopy() them in mountnfs(), but I wanted to detect errors before
* doing the sockargs() call because sockargs() allocates an mbuf and
* an error after that means that I have to release the mbuf.
*/
/* ARGSUSED */
static int
nfs_mount(struct mount *mp, struct thread *td)
{
struct nfs_args args = {
.version = NFS_ARGSVERSION,
.addr = NULL,
.addrlen = sizeof (struct sockaddr_in),
.sotype = SOCK_STREAM,
.proto = 0,
.fh = NULL,
.fhsize = 0,
.flags = NFSMNT_RESVPORT,
.wsize = NFS_WSIZE,
.rsize = NFS_RSIZE,
.readdirsize = NFS_READDIRSIZE,
.timeo = 10,
.retrans = NFS_RETRANS,
.maxgrouplist = NFS_MAXGRPS,
.readahead = NFS_DEFRAHEAD,
.wcommitsize = 0, /* was: NQ_DEFLEASE */
.deadthresh = NFS_MAXDEADTHRESH, /* was: NQ_DEADTHRESH */
.hostname = NULL,
/* args version 4 */
.acregmin = NFS_MINATTRTIMO,
.acregmax = NFS_MAXATTRTIMO,
.acdirmin = NFS_MINDIRATTRTIMO,
.acdirmax = NFS_MAXDIRATTRTIMO,
};
int error, ret, has_nfs_args_opt;
int has_addr_opt, has_fh_opt, has_hostname_opt;
struct sockaddr *nam;
struct vnode *vp;
char hst[MNAMELEN];
size_t len;
u_char nfh[NFSX_V3FHMAX];
char *opt;
has_nfs_args_opt = 0;
has_addr_opt = 0;
has_fh_opt = 0;
has_hostname_opt = 0;
if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) {
error = EINVAL;
goto out;
}
if ((mp->mnt_flag & (MNT_ROOTFS | MNT_UPDATE)) == MNT_ROOTFS) {
error = nfs_mountroot(mp, td);
goto out;
}
/*
* The old mount_nfs program passed the struct nfs_args
* from userspace to kernel. The new mount_nfs program
* passes string options via nmount() from userspace to kernel
* and we populate the struct nfs_args in the kernel.
*/
if (vfs_getopt(mp->mnt_optnew, "nfs_args", NULL, NULL) == 0) {
error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args,
sizeof args);
if (error)
goto out;
if (args.version != NFS_ARGSVERSION) {
error = EPROGMISMATCH;
goto out;
}
has_nfs_args_opt = 1;
}
if (vfs_getopt(mp->mnt_optnew, "dumbtimer", NULL, NULL) == 0)
args.flags |= NFSMNT_DUMBTIMR;
if (vfs_getopt(mp->mnt_optnew, "noconn", NULL, NULL) == 0)
args.flags |= NFSMNT_NOCONN;
if (vfs_getopt(mp->mnt_optnew, "conn", NULL, NULL) == 0)
args.flags &= ~NFSMNT_NOCONN;
if (vfs_getopt(mp->mnt_optnew, "nolockd", NULL, NULL) == 0)
args.flags |= NFSMNT_NOLOCKD;
if (vfs_getopt(mp->mnt_optnew, "lockd", NULL, NULL) == 0)
args.flags &= ~NFSMNT_NOLOCKD;
if (vfs_getopt(mp->mnt_optnew, "intr", NULL, NULL) == 0)
args.flags |= NFSMNT_INT;
if (vfs_getopt(mp->mnt_optnew, "rdirplus", NULL, NULL) == 0)
args.flags |= NFSMNT_RDIRPLUS;
if (vfs_getopt(mp->mnt_optnew, "resvport", NULL, NULL) == 0)
args.flags |= NFSMNT_RESVPORT;
if (vfs_getopt(mp->mnt_optnew, "noresvport", NULL, NULL) == 0)
args.flags &= ~NFSMNT_RESVPORT;
if (vfs_getopt(mp->mnt_optnew, "soft", NULL, NULL) == 0)
args.flags |= NFSMNT_SOFT;
if (vfs_getopt(mp->mnt_optnew, "hard", NULL, NULL) == 0)
args.flags &= ~NFSMNT_SOFT;
if (vfs_getopt(mp->mnt_optnew, "mntudp", NULL, NULL) == 0)
args.sotype = SOCK_DGRAM;
if (vfs_getopt(mp->mnt_optnew, "udp", NULL, NULL) == 0)
args.sotype = SOCK_DGRAM;
if (vfs_getopt(mp->mnt_optnew, "tcp", NULL, NULL) == 0)
args.sotype = SOCK_STREAM;
if (vfs_getopt(mp->mnt_optnew, "nfsv3", NULL, NULL) == 0)
args.flags |= NFSMNT_NFSV3;
if (vfs_getopt(mp->mnt_optnew, "readdirsize", (void **)&opt, NULL) == 0) {
if (opt == NULL) {
vfs_mount_error(mp, "illegal readdirsize");
error = EINVAL;
goto out;
}
ret = sscanf(opt, "%d", &args.readdirsize);
if (ret != 1 || args.readdirsize <= 0) {
vfs_mount_error(mp, "illegal readdirsize: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_READDIRSIZE;
}
if (vfs_getopt(mp->mnt_optnew, "readahead", (void **)&opt, NULL) == 0) {
if (opt == NULL) {
vfs_mount_error(mp, "illegal readahead");
error = EINVAL;
goto out;
}
ret = sscanf(opt, "%d", &args.readahead);
if (ret != 1 || args.readahead <= 0) {
vfs_mount_error(mp, "illegal readahead: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_READAHEAD;
}
if (vfs_getopt(mp->mnt_optnew, "wsize", (void **)&opt, NULL) == 0) {
if (opt == NULL) {
vfs_mount_error(mp, "illegal wsize");
error = EINVAL;
goto out;
}
ret = sscanf(opt, "%d", &args.wsize);
if (ret != 1 || args.wsize <= 0) {
vfs_mount_error(mp, "illegal wsize: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_WSIZE;
}
if (vfs_getopt(mp->mnt_optnew, "rsize", (void **)&opt, NULL) == 0) {
if (opt == NULL) {
vfs_mount_error(mp, "illegal rsize");
error = EINVAL;
goto out;
}
ret = sscanf(opt, "%d", &args.rsize);
if (ret != 1 || args.rsize <= 0) {
vfs_mount_error(mp, "illegal wsize: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_RSIZE;
}
if (vfs_getopt(mp->mnt_optnew, "retrans", (void **)&opt, NULL) == 0) {
if (opt == NULL) {
vfs_mount_error(mp, "illegal retrans");
error = EINVAL;
goto out;
}
ret = sscanf(opt, "%d", &args.retrans);
if (ret != 1 || args.retrans <= 0) {
vfs_mount_error(mp, "illegal retrans: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_RETRANS;
}
if (vfs_getopt(mp->mnt_optnew, "acregmin", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.acregmin);
if (ret != 1 || args.acregmin <= 0) {
vfs_mount_error(mp, "illegal acregmin: %s",
opt);
error = EINVAL;
goto out;
}
}
if (vfs_getopt(mp->mnt_optnew, "acregmax", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.acregmax);
if (ret != 1 || args.acregmax <= 0) {
vfs_mount_error(mp, "illegal acregmax: %s",
opt);
error = EINVAL;
goto out;
}
}
if (vfs_getopt(mp->mnt_optnew, "acdirmin", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.acdirmin);
if (ret != 1 || args.acdirmin <= 0) {
vfs_mount_error(mp, "illegal acdirmin: %s",
opt);
error = EINVAL;
goto out;
}
}
if (vfs_getopt(mp->mnt_optnew, "acdirmax", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.acdirmax);
if (ret != 1 || args.acdirmax <= 0) {
vfs_mount_error(mp, "illegal acdirmax: %s",
opt);
error = EINVAL;
goto out;
}
}
if (vfs_getopt(mp->mnt_optnew, "deadthresh", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.deadthresh);
if (ret != 1 || args.deadthresh <= 0) {
vfs_mount_error(mp, "illegal deadthresh: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_DEADTHRESH;
}
if (vfs_getopt(mp->mnt_optnew, "timeout", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.timeo);
if (ret != 1 || args.timeo <= 0) {
vfs_mount_error(mp, "illegal timeout: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_TIMEO;
}
if (vfs_getopt(mp->mnt_optnew, "maxgroups", (void **)&opt, NULL) == 0) {
ret = sscanf(opt, "%d", &args.maxgrouplist);
if (ret != 1 || args.maxgrouplist <= 0) {
vfs_mount_error(mp, "illegal maxgroups: %s",
opt);
error = EINVAL;
goto out;
}
args.flags |= NFSMNT_MAXGRPS;
}
if (vfs_getopt(mp->mnt_optnew, "addr", (void **)&args.addr,
&args.addrlen) == 0) {
has_addr_opt = 1;
if (args.addrlen > SOCK_MAXADDRLEN) {
error = ENAMETOOLONG;
goto out;
}
MALLOC(nam, struct sockaddr *, args.addrlen, M_SONAME,
M_WAITOK);
bcopy(args.addr, nam, args.addrlen);
nam->sa_len = args.addrlen;
}
if (vfs_getopt(mp->mnt_optnew, "fh", (void **)&args.fh,
&args.fhsize) == 0) {
has_fh_opt = 1;
}
if (vfs_getopt(mp->mnt_optnew, "hostname", (void **)&args.hostname,
NULL) == 0) {
has_hostname_opt = 1;
}
if (args.hostname == NULL) {
vfs_mount_error(mp, "Invalid hostname");
error = EINVAL;
goto out;
}
if (mp->mnt_flag & MNT_UPDATE) {
struct nfsmount *nmp = VFSTONFS(mp);
if (nmp == NULL) {
error = EIO;
goto out;
}
/*
* When doing an update, we can't change from or to
* v3, switch lockd strategies or change cookie translation
*/
args.flags = (args.flags &
~(NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) |
(nmp->nm_flag &
(NFSMNT_NFSV3 | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/));
nfs_decode_args(mp, nmp, &args, NULL);
goto out;
}
/*
* Make the nfs_ip_paranoia sysctl serve as the default connection
* or no-connection mode for those protocols that support
* no-connection mode (the flag will be cleared later for protocols
* that do not support no-connection mode). This will allow a client
* to receive replies from a different IP than the request was
* sent to. Note: default value for nfs_ip_paranoia is 1 (paranoid),
* not 0.
*/
if (nfs_ip_paranoia == 0)
args.flags |= NFSMNT_NOCONN;
if (has_nfs_args_opt) {
/*
* In the 'nfs_args' case, the pointers in the args
* structure are in userland - we copy them in here.
*/
if (!has_fh_opt) {
error = copyin((caddr_t)args.fh, (caddr_t)nfh,
args.fhsize);
if (error) {
goto out;
}
args.fh = nfh;
}
if (!has_hostname_opt) {
error = copyinstr(args.hostname, hst, MNAMELEN-1, &len);
if (error) {
goto out;
}
bzero(&hst[len], MNAMELEN - len);
args.hostname = hst;
}
if (!has_addr_opt) {
/* sockargs() call must be after above copyin() calls */
error = getsockaddr(&nam, (caddr_t)args.addr,
args.addrlen);
if (error) {
goto out;
}
}
}
error = mountnfs(&args, mp, nam, args.hostname, &vp, td->td_ucred);
out:
if (!error) {
MNT_ILOCK(mp);
mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED);
MNT_IUNLOCK(mp);
}
return (error);
}
/*
 * Compatibility entry point for the old mount(2) syscall path: copy the
 * userland struct nfs_args in and repackage it as an "nfs_args" string
 * option for kernel_mount(), which funnels back into nfs_mount() above.
 */
/* ARGSUSED */
static int
nfs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
{
int error;
struct nfs_args args;
error = copyin(data, &args, sizeof (struct nfs_args));
if (error)
return error;
ma = mount_arg(ma, "nfs_args", &args, sizeof args);
error = kernel_mount(ma, flags);
return (error);
}
/*
* Common code for mount and mountroot
*/
static int
mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
char *hst, struct vnode **vpp, struct ucred *cred)
{
struct nfsmount *nmp;
struct nfsnode *np;
int error;
struct vattr attrs;
if (mp->mnt_flag & MNT_UPDATE) {
nmp = VFSTONFS(mp);
printf("%s: MNT_UPDATE is no longer handled here\n", __func__);
FREE(nam, M_SONAME);
return (0);
} else {
nmp = uma_zalloc(nfsmount_zone, M_WAITOK);
bzero((caddr_t)nmp, sizeof (struct nfsmount));
TAILQ_INIT(&nmp->nm_bufq);
mp->mnt_data = nmp;
}
vfs_getnewfsid(mp);
nmp->nm_mountp = mp;
mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF);
/*
* V2 can only handle 32 bit filesizes. A 4GB-1 limit may be too
* high, depending on whether we end up with negative offsets in
* the client or server somewhere. 2GB-1 may be safer.
*
* For V3, nfs_fsinfo will adjust this as necessary. Assume maximum
* that we can handle until we find out otherwise.
* XXX Our "safe" limit on the client is what we can store in our
* buffer cache using signed(!) block numbers.
*/
if ((argp->flags & NFSMNT_NFSV3) == 0)
nmp->nm_maxfilesize = 0xffffffffLL;
else
nmp->nm_maxfilesize = (u_int64_t)0x80000000 * DEV_BSIZE - 1;
nmp->nm_timeo = NFS_TIMEO;
nmp->nm_retry = NFS_RETRANS;
if ((argp->flags & NFSMNT_NFSV3) && argp->sotype == SOCK_STREAM) {
nmp->nm_wsize = nmp->nm_rsize = NFS_MAXDATA;
} else {
nmp->nm_wsize = NFS_WSIZE;
nmp->nm_rsize = NFS_RSIZE;
}
nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
nmp->nm_readdirsize = NFS_READDIRSIZE;
nmp->nm_numgrps = NFS_MAXGRPS;
nmp->nm_readahead = NFS_DEFRAHEAD;
nmp->nm_deadthresh = NFS_MAXDEADTHRESH;
nmp->nm_tprintf_delay = nfs_tprintf_delay;
if (nmp->nm_tprintf_delay < 0)
nmp->nm_tprintf_delay = 0;
nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay;
if (nmp->nm_tprintf_initial_delay < 0)
nmp->nm_tprintf_initial_delay = 0;
nmp->nm_fhsize = argp->fhsize;
bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize);
bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN);
nmp->nm_nam = nam;
/* Set up the sockets and per-host congestion */
nmp->nm_sotype = argp->sotype;
nmp->nm_soproto = argp->proto;
nmp->nm_rpcops = &nfs_rpcops;
nfs_decode_args(mp, nmp, argp, hst);
/*
* For Connection based sockets (TCP,...) defer the connect until
* the first request, in case the server is not responding.
*/
if (nmp->nm_sotype == SOCK_DGRAM &&
(error = nfs_connect(nmp, NULL)))
goto bad;
/*
* This is silly, but it has to be set so that vinifod() works.
* We do not want to do an nfs_statfs() here since we can get
* stuck on a dead server and we are holding a lock on the mount
* point.
*/
mtx_lock(&nmp->nm_mtx);
mp->mnt_stat.f_iosize = nfs_iosize(nmp);
mtx_unlock(&nmp->nm_mtx);
/*
* A reference count is needed on the nfsnode representing the
* remote root. If this object is not persistent, then backward
* traversals of the mount point (i.e. "..") will not work if
* the nfsnode gets flushed out of the cache. Ufs does not have
* this problem, because one can identify root inodes by their
* number == ROOTINO (2).
*/
error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE);
if (error)
goto bad;
*vpp = NFSTOV(np);
/*
* Get file attributes and transfer parameters for the
* mountpoint. This has the side effect of filling in
* (*vpp)->v_type with the correct value.
*/
if (argp->flags & NFSMNT_NFSV3)
nfs_fsinfo(nmp, *vpp, curthread->td_ucred, curthread);
else
VOP_GETATTR(*vpp, &attrs, curthread->td_ucred);
/*
* Lose the lock but keep the ref.
*/
VOP_UNLOCK(*vpp, 0);
return (0);
bad:
nfs_disconnect(nmp);
mtx_destroy(&nmp->nm_mtx);
uma_zfree(nfsmount_zone, nmp);
FREE(nam, M_SONAME);
return (error);
}
/*
* unmount system call
*/
static int
nfs_unmount(struct mount *mp, int mntflags, struct thread *td)
{
struct nfsmount *nmp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
nmp = VFSTONFS(mp);
/*
* Goes something like this..
* - Call vflush() to clear out vnodes for this filesystem
* - Close the socket
* - Free up the data structures
*/
/* In the forced case, cancel any outstanding requests. */
if (flags & FORCECLOSE) {
error = nfs_nmcancelreqs(nmp);
if (error)
goto out;
}
/* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */
error = vflush(mp, 1, flags, td);
if (error)
goto out;
/*
* We are now committed to the unmount.
*/
nfs_disconnect(nmp);
FREE(nmp->nm_nam, M_SONAME);
mtx_destroy(&nmp->nm_mtx);
uma_zfree(nfsmount_zone, nmp);
out:
return (error);
}
/*
* Return root of a filesystem
*/
static int
nfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
{
struct vnode *vp;
struct nfsmount *nmp;
struct nfsnode *np;
int error;
nmp = VFSTONFS(mp);
error = nfs_nget(mp, (nfsfh_t *)nmp->nm_fh, nmp->nm_fhsize, &np, flags);
if (error)
return error;
vp = NFSTOV(np);
/*
* Get transfer parameters and attributes for root vnode once.
*/
mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_state & NFSSTA_GOTFSINFO) == 0 &&
(nmp->nm_flag & NFSMNT_NFSV3)) {
mtx_unlock(&nmp->nm_mtx);
nfs_fsinfo(nmp, vp, curthread->td_ucred, curthread);
} else
mtx_unlock(&nmp->nm_mtx);
if (vp->v_type == VNON)
vp->v_type = VDIR;
vp->v_vflag |= VV_ROOT;
*vpp = vp;
return (0);
}
/*
* Flush out the buffer cache
*/
/* ARGSUSED */
static int
nfs_sync(struct mount *mp, int waitfor, struct thread *td)
{
struct vnode *vp, *mvp;
int error, allerror = 0;
/*
* Force stale buffer cache information to be flushed.
*/
MNT_ILOCK(mp);
loop:
MNT_VNODE_FOREACH(vp, mp, mvp) {
VI_LOCK(vp);
MNT_IUNLOCK(mp);
/* XXX Racy bv_cnt check. */
if (VOP_ISLOCKED(vp) || vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
waitfor == MNT_LAZY) {
VI_UNLOCK(vp);
MNT_ILOCK(mp);
continue;
}
if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
MNT_ILOCK(mp);
MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
goto loop;
}
error = VOP_FSYNC(vp, waitfor, td);
if (error)
allerror = error;
VOP_UNLOCK(vp, 0);
vrele(vp);
MNT_ILOCK(mp);
}
MNT_IUNLOCK(mp);
return (allerror);
}
static int
nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req)
{
struct nfsmount *nmp = VFSTONFS(mp);
struct vfsquery vq;
int error;
bzero(&vq, sizeof(vq));
switch (op) {
#if 0
case VFS_CTL_NOLOCKS:
val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 1 : 0;
if (req->oldptr != NULL) {
error = SYSCTL_OUT(req, &val, sizeof(val));
if (error)
return (error);
}
if (req->newptr != NULL) {
error = SYSCTL_IN(req, &val, sizeof(val));
if (error)
return (error);
if (val)
nmp->nm_flag |= NFSMNT_NOLOCKS;
else
nmp->nm_flag &= ~NFSMNT_NOLOCKS;
}
break;
#endif
case VFS_CTL_QUERY:
mtx_lock(&nmp->nm_mtx);
if (nmp->nm_state & NFSSTA_TIMEO)
vq.vq_flags |= VQ_NOTRESP;
mtx_unlock(&nmp->nm_mtx);
#if 0
if (!(nmp->nm_flag & NFSMNT_NOLOCKS) &&
(nmp->nm_state & NFSSTA_LOCKTIMEO))
vq.vq_flags |= VQ_NOTRESPLOCK;
#endif
error = SYSCTL_OUT(req, &vq, sizeof(vq));
break;
case VFS_CTL_TIMEO:
if (req->oldptr != NULL) {
error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay,
sizeof(nmp->nm_tprintf_initial_delay));
if (error)
return (error);
}
if (req->newptr != NULL) {
error = vfs_suser(mp, req->td);
if (error)
return (error);
error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay,
sizeof(nmp->nm_tprintf_initial_delay));
if (error)
return (error);
if (nmp->nm_tprintf_initial_delay < 0)
nmp->nm_tprintf_initial_delay = 0;
}
break;
default:
return (ENOTSUP);
}
return (0);
}
Index: head/sys/nfsclient/nfs_vnops.c
===================================================================
--- head/sys/nfsclient/nfs_vnops.c (revision 183549)
+++ head/sys/nfsclient/nfs_vnops.c (revision 183550)
@@ -1,3323 +1,3326 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)nfs_vnops.c 8.16 (Berkeley) 5/27/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* vnode op calls for Sun NFS version 2 and 3
*/
#include "opt_inet.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/signalvar.h>
#include <sys/vimage.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <fs/fifofs/fifo.h>
#include <rpc/rpcclnt.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsnode.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfs_lock.h>
#include <nfs/xdr_subs.h>
#include <nfsclient/nfsm_subs.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
/* Defs */
#define TRUE 1
#define FALSE 0
/*
* Ifdef for FreeBSD-current merged buffer cache. It is unfortunate that these
* calls are not in getblk() and brelse() so that they would not be necessary
* here.
*/
#ifndef B_VMIO
#define vfs_busy_pages(bp, f)
#endif
static vop_read_t nfsfifo_read;
static vop_write_t nfsfifo_write;
static vop_close_t nfsfifo_close;
static int nfs_flush(struct vnode *, int, int);
static int nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *);
static vop_lookup_t nfs_lookup;
static vop_create_t nfs_create;
static vop_mknod_t nfs_mknod;
static vop_open_t nfs_open;
static vop_close_t nfs_close;
static vop_access_t nfs_access;
static vop_getattr_t nfs_getattr;
static vop_setattr_t nfs_setattr;
static vop_read_t nfs_read;
static vop_fsync_t nfs_fsync;
static vop_remove_t nfs_remove;
static vop_link_t nfs_link;
static vop_rename_t nfs_rename;
static vop_mkdir_t nfs_mkdir;
static vop_rmdir_t nfs_rmdir;
static vop_symlink_t nfs_symlink;
static vop_readdir_t nfs_readdir;
static vop_strategy_t nfs_strategy;
static int nfs_lookitup(struct vnode *, const char *, int,
struct ucred *, struct thread *, struct nfsnode **);
static int nfs_sillyrename(struct vnode *, struct vnode *,
struct componentname *);
static vop_access_t nfsspec_access;
static vop_readlink_t nfs_readlink;
static vop_print_t nfs_print;
static vop_advlock_t nfs_advlock;
static vop_advlockasync_t nfs_advlockasync;
/*
* Global vfs data structures for nfs
*/
struct vop_vector nfs_vnodeops = {
.vop_default = &default_vnodeops,
.vop_access = nfs_access,
.vop_advlock = nfs_advlock,
.vop_advlockasync = nfs_advlockasync,
.vop_close = nfs_close,
.vop_create = nfs_create,
.vop_fsync = nfs_fsync,
.vop_getattr = nfs_getattr,
.vop_getpages = nfs_getpages,
.vop_putpages = nfs_putpages,
.vop_inactive = nfs_inactive,
.vop_lease = VOP_NULL,
.vop_link = nfs_link,
.vop_lookup = nfs_lookup,
.vop_mkdir = nfs_mkdir,
.vop_mknod = nfs_mknod,
.vop_open = nfs_open,
.vop_print = nfs_print,
.vop_read = nfs_read,
.vop_readdir = nfs_readdir,
.vop_readlink = nfs_readlink,
.vop_reclaim = nfs_reclaim,
.vop_remove = nfs_remove,
.vop_rename = nfs_rename,
.vop_rmdir = nfs_rmdir,
.vop_setattr = nfs_setattr,
.vop_strategy = nfs_strategy,
.vop_symlink = nfs_symlink,
.vop_write = nfs_write,
};
struct vop_vector nfs_fifoops = {
.vop_default = &fifo_specops,
.vop_access = nfsspec_access,
.vop_close = nfsfifo_close,
.vop_fsync = nfs_fsync,
.vop_getattr = nfs_getattr,
.vop_inactive = nfs_inactive,
.vop_print = nfs_print,
.vop_read = nfsfifo_read,
.vop_reclaim = nfs_reclaim,
.vop_setattr = nfs_setattr,
.vop_write = nfsfifo_write,
};
static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
struct componentname *cnp, struct vattr *vap);
static int nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
struct ucred *cred, struct thread *td);
static int nfs_renamerpc(struct vnode *fdvp, const char *fnameptr,
int fnamelen, struct vnode *tdvp,
const char *tnameptr, int tnamelen,
struct ucred *cred, struct thread *td);
static int nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
struct sillyrename *sp);
/*
* Global variables
*/
struct mtx nfs_iod_mtx;
struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON];
int nfs_numasync = 0;
vop_advlock_t *nfs_advlock_p = nfs_dolock;
vop_reclaim_t *nfs_reclaim_p = NULL;
#define DIRHDSIZ (sizeof (struct dirent) - (MAXNAMLEN + 1))
SYSCTL_DECL(_vfs_nfs);
static int nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
&nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
static int nfsv3_commit_on_close = 0;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfsv3_commit_on_close, CTLFLAG_RW,
&nfsv3_commit_on_close, 0, "write+commit on close, else only write");
static int nfs_clean_pages_on_close = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
&nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
int nfs_directio_enable = 0;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
&nfs_directio_enable, 0, "Enable NFS directio");
/*
* This sysctl allows other processes to mmap a file that has been opened
* O_DIRECT by a process. In general, having processes mmap the file while
* Direct IO is in progress can lead to data inconsistencies. But we allow
* it by default to prevent denial of service: otherwise a malicious user
* could open files O_DIRECT and thereby prevent other users from mmap'ing
* them. "Protected" environments where stricter consistency guarantees are
* required can disable this knob. The process that opened the file O_DIRECT
* cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
* meaningful.
*/
int nfs_directio_allow_mmap = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
&nfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
#if 0
SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_hits, CTLFLAG_RD,
&nfsstats.accesscache_hits, 0, "NFS ACCESS cache hit count");
SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_misses, CTLFLAG_RD,
&nfsstats.accesscache_misses, 0, "NFS ACCESS cache miss count");
#endif
#define NFSV3ACCESS_ALL (NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY \
| NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE \
| NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP)
/*
* SMP Locking Note :
* The list of locks after the description of the lock is the ordering
* of other locks acquired with the lock held.
* np->n_mtx : Protects the fields in the nfsnode.
VM Object Lock
VI_MTX (acquired indirectly)
* nmp->nm_mtx : Protects the fields in the nfsmount.
rep->r_mtx
* nfs_iod_mtx : Global lock, protects shared nfsiod state.
* nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
nmp->nm_mtx
rep->r_mtx
* rep->r_mtx : Protects the fields in an nfsreq.
*/
static int
nfs3_access_otw(struct vnode *vp, int wmode, struct thread *td,
struct ucred *cred)
{
const int v3 = 1;
u_int32_t *tl;
int error = 0, attrflag;
struct mbuf *mreq, *mrep, *md, *mb;
caddr_t bpos, dpos;
u_int32_t rmode;
struct nfsnode *np = VTONFS(vp);
nfsstats.rpccnt[NFSPROC_ACCESS]++;
mreq = nfsm_reqhead(vp, NFSPROC_ACCESS, NFSX_FH(v3) + NFSX_UNSIGNED);
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(wmode);
nfsm_request(vp, NFSPROC_ACCESS, td, cred);
nfsm_postop_attr(vp, attrflag);
if (!error) {
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
rmode = fxdr_unsigned(u_int32_t, *tl);
mtx_lock(&np->n_mtx);
np->n_mode = rmode;
np->n_modeuid = cred->cr_uid;
np->n_modestamp = time_second;
mtx_unlock(&np->n_mtx);
}
m_freem(mrep);
nfsmout:
return (error);
}
/*
* nfs access vnode op.
* For nfs version 2, just return ok. File accesses may fail later.
* For nfs version 3, use the access rpc to check accessibility. If file modes
* are changed on the server, accesses might still fail later.
*/
static int
nfs_access(struct vop_access_args *ap)
{
struct vnode *vp = ap->a_vp;
int error = 0;
u_int32_t mode, wmode;
int v3 = NFS_ISV3(vp);
struct nfsnode *np = VTONFS(vp);
/*
* Disallow write attempts on filesystems mounted read-only;
* unless the file is a socket, fifo, or a block or character
* device resident on the filesystem.
*/
if ((ap->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
return (EROFS);
default:
break;
}
}
/*
* For nfs v3, check to see if we have done this recently, and if
* so return our cached result instead of making an ACCESS call.
* If not, do an access rpc, otherwise you are stuck emulating
* ufs_access() locally using the vattr. This may not be correct,
* since the server may apply other access criteria such as
* client uid-->server uid mapping that we do not know about.
*/
if (v3) {
if (ap->a_mode & VREAD)
mode = NFSV3ACCESS_READ;
else
mode = 0;
if (vp->v_type != VDIR) {
if (ap->a_mode & VWRITE)
mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND);
if (ap->a_mode & VEXEC)
mode |= NFSV3ACCESS_EXECUTE;
} else {
if (ap->a_mode & VWRITE)
mode |= (NFSV3ACCESS_MODIFY | NFSV3ACCESS_EXTEND |
NFSV3ACCESS_DELETE);
if (ap->a_mode & VEXEC)
mode |= NFSV3ACCESS_LOOKUP;
}
/* XXX safety belt, only make blanket request if caching */
if (nfsaccess_cache_timeout > 0) {
wmode = NFSV3ACCESS_READ | NFSV3ACCESS_MODIFY |
NFSV3ACCESS_EXTEND | NFSV3ACCESS_EXECUTE |
NFSV3ACCESS_DELETE | NFSV3ACCESS_LOOKUP;
} else {
wmode = mode;
}
/*
* Does our cached result allow us to give a definite yes to
* this request?
*/
mtx_lock(&np->n_mtx);
if ((time_second < (np->n_modestamp + nfsaccess_cache_timeout)) &&
(ap->a_cred->cr_uid == np->n_modeuid) &&
((np->n_mode & mode) == mode)) {
nfsstats.accesscache_hits++;
} else {
/*
* Either a no, or a don't know. Go to the wire.
*/
nfsstats.accesscache_misses++;
mtx_unlock(&np->n_mtx);
error = nfs3_access_otw(vp, wmode, ap->a_td,ap->a_cred);
mtx_lock(&np->n_mtx);
if (!error) {
if ((np->n_mode & mode) != mode) {
error = EACCES;
}
}
}
mtx_unlock(&np->n_mtx);
return (error);
} else {
if ((error = nfsspec_access(ap)) != 0) {
return (error);
}
/*
* Attempt to prevent a mapped root from accessing a file
* which it shouldn't. We try to read a byte from the file
* if the user is root and the file is not zero length.
* After calling nfsspec_access, we should have the correct
* file size cached.
*/
mtx_lock(&np->n_mtx);
if (ap->a_cred->cr_uid == 0 && (ap->a_mode & VREAD)
&& VTONFS(vp)->n_size > 0) {
struct iovec aiov;
struct uio auio;
char buf[1];
mtx_unlock(&np->n_mtx);
aiov.iov_base = buf;
aiov.iov_len = 1;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_resid = 1;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = ap->a_td;
if (vp->v_type == VREG)
error = nfs_readrpc(vp, &auio, ap->a_cred);
else if (vp->v_type == VDIR) {
char* bp;
bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
aiov.iov_base = bp;
aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
error = nfs_readdirrpc(vp, &auio, ap->a_cred);
free(bp, M_TEMP);
} else if (vp->v_type == VLNK)
error = nfs_readlinkrpc(vp, &auio, ap->a_cred);
else
error = EACCES;
} else
mtx_unlock(&np->n_mtx);
return (error);
}
}
int nfs_otw_getattr_avoid = 0;
/*
* nfs open vnode op
* Check to see if the type is ok
* and that deletion is not in progress.
* For paged in text files, you will need to flush the page cache
* if consistency is lost.
*/
/* ARGSUSED */
static int
nfs_open(struct vop_open_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
struct vattr vattr;
int error;
int fmode = ap->a_mode;
if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
return (EOPNOTSUPP);
/*
* Get a valid lease. If cached data is stale, flush it.
*/
mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
mtx_unlock(&np->n_mtx);
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
if (error == EINTR || error == EIO)
return (error);
np->n_attrstamp = 0;
if (vp->v_type == VDIR)
np->n_direofoffset = 0;
error = VOP_GETATTR(vp, &vattr, ap->a_cred);
if (error)
return (error);
mtx_lock(&np->n_mtx);
np->n_mtime = vattr.va_mtime;
mtx_unlock(&np->n_mtx);
} else {
struct thread *td = curthread;
if (np->n_ac_ts_syscalls != td->td_syscalls ||
np->n_ac_ts_tid != td->td_tid ||
td->td_proc == NULL ||
np->n_ac_ts_pid != td->td_proc->p_pid) {
np->n_attrstamp = 0;
}
mtx_unlock(&np->n_mtx);
error = VOP_GETATTR(vp, &vattr, ap->a_cred);
if (error)
return (error);
mtx_lock(&np->n_mtx);
if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
if (vp->v_type == VDIR)
np->n_direofoffset = 0;
mtx_unlock(&np->n_mtx);
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
if (error == EINTR || error == EIO) {
return (error);
}
mtx_lock(&np->n_mtx);
np->n_mtime = vattr.va_mtime;
}
mtx_unlock(&np->n_mtx);
}
/*
* If the object has >= 1 O_DIRECT active opens, we disable caching.
*/
if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
if (np->n_directio_opens == 0) {
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
if (error)
return (error);
mtx_lock(&np->n_mtx);
np->n_flag |= NNONCACHE;
mtx_unlock(&np->n_mtx);
}
np->n_directio_opens++;
}
vnode_create_vobject(vp, vattr.va_size, ap->a_td);
return (0);
}
/*
* nfs close vnode op
* What an NFS client should do upon close after writing is a debatable issue.
* Most NFS clients push delayed writes to the server upon close, basically for
* two reasons:
* 1 - So that any write errors may be reported back to the client process
* doing the close system call. By far the two most likely errors are
* NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
* 2 - To put a worst case upper bound on cache inconsistency between
* multiple clients for the file.
* There is also a consistency problem for Version 2 of the protocol w.r.t.
* not being able to tell if other clients are writing a file concurrently,
* since there is no way of knowing if the changed modify time in the reply
* is only due to the write for this client.
* (NFS Version 3 provides weak cache consistency data in the reply that
* should be sufficient to detect and handle this case.)
*
* The current code does the following:
* for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
* for NFS Version 3 - flush dirty buffers to the server but don't invalidate
* or commit them (this satisfies 1 and 2 except for the
* case where the server crashes after this close but
* before the commit RPC, which is felt to be "good
* enough". Changing the last argument to nfs_flush() to
* a 1 would force a commit operation, if it is felt a
* commit is necessary now.
*/
/* ARGSUSED */
static int
nfs_close(struct vop_close_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
int error = 0;
int fmode = ap->a_fflag;
if (vp->v_type == VREG) {
/*
* Examine and clean dirty pages, regardless of NMODIFIED.
* This closes a major hole in close-to-open consistency.
* We want to push out all dirty pages (and buffers) on
* close, regardless of whether they were dirtied by
* mmap'ed writes or via write().
*/
if (nfs_clean_pages_on_close && vp->v_object) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp->v_object);
}
mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
mtx_unlock(&np->n_mtx);
if (NFS_ISV3(vp)) {
/*
* Under NFSv3 we have dirty buffers to dispose of. We
* must flush them to the NFS server. We have the option
* of waiting all the way through the commit rpc or just
* waiting for the initial write. The default is to only
* wait through the initial write so the data is in the
* server's cache, which is roughly similar to the state
* a standard disk subsystem leaves the file in on close().
*
* We cannot clear the NMODIFIED bit in np->n_flag due to
* potential races with other processes, and certainly
* cannot clear it if we don't commit.
*/
int cm = nfsv3_commit_on_close ? 1 : 0;
error = nfs_flush(vp, MNT_WAIT, cm);
/* np->n_flag &= ~NMODIFIED; */
} else
error = nfs_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
mtx_lock(&np->n_mtx);
}
/*
* Invalidate the attribute cache in all cases.
* An open is going to fetch fresh attrs anyway; other procs
* on this node that have file open will be forced to do an
* otw attr fetch, but this is safe.
*/
np->n_attrstamp = 0;
if (np->n_flag & NWRITEERR) {
np->n_flag &= ~NWRITEERR;
error = np->n_error;
}
mtx_unlock(&np->n_mtx);
}
if (nfs_directio_enable)
KASSERT((np->n_directio_asyncwr == 0),
("nfs_close: dirty unflushed (%d) directio buffers\n",
np->n_directio_asyncwr));
if (nfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
mtx_lock(&np->n_mtx);
KASSERT((np->n_directio_opens > 0),
("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
np->n_directio_opens--;
if (np->n_directio_opens == 0)
np->n_flag &= ~NNONCACHE;
mtx_unlock(&np->n_mtx);
}
return (error);
}
/*
* nfs getattr call from vfs.
*/
static int
nfs_getattr(struct vop_getattr_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
struct thread *td = curthread;
struct vattr *vap = ap->a_vap;
struct vattr vattr;
caddr_t bpos, dpos;
int error = 0;
struct mbuf *mreq, *mrep, *md, *mb;
int v3 = NFS_ISV3(vp);
/*
* Update local times for special files.
*/
mtx_lock(&np->n_mtx);
if (np->n_flag & (NACC | NUPD))
np->n_flag |= NCHG;
mtx_unlock(&np->n_mtx);
/*
* First look in the cache.
*/
if (nfs_getattrcache(vp, &vattr) == 0)
goto nfsmout;
if (v3 && nfsaccess_cache_timeout > 0) {
nfsstats.accesscache_misses++;
nfs3_access_otw(vp, NFSV3ACCESS_ALL, td, ap->a_cred);
if (nfs_getattrcache(vp, &vattr) == 0)
goto nfsmout;
}
nfsstats.rpccnt[NFSPROC_GETATTR]++;
mreq = nfsm_reqhead(vp, NFSPROC_GETATTR, NFSX_FH(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
nfsm_request(vp, NFSPROC_GETATTR, td, ap->a_cred);
if (!error) {
nfsm_loadattr(vp, &vattr);
}
m_freem(mrep);
nfsmout:
vap->va_type = vattr.va_type;
vap->va_mode = vattr.va_mode;
vap->va_nlink = vattr.va_nlink;
vap->va_uid = vattr.va_uid;
vap->va_gid = vattr.va_gid;
vap->va_fsid = vattr.va_fsid;
vap->va_fileid = vattr.va_fileid;
vap->va_size = vattr.va_size;
vap->va_blocksize = vattr.va_blocksize;
vap->va_atime = vattr.va_atime;
vap->va_mtime = vattr.va_mtime;
vap->va_ctime = vattr.va_ctime;
vap->va_gen = vattr.va_gen;
vap->va_flags = vattr.va_flags;
vap->va_rdev = vattr.va_rdev;
vap->va_bytes = vattr.va_bytes;
vap->va_filerev = vattr.va_filerev;
return (error);
}
/*
* nfs setattr call.
*/
static int
nfs_setattr(struct vop_setattr_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
struct vattr *vap = ap->a_vap;
struct thread *td = curthread;
int error = 0;
u_quad_t tsize;
#ifndef nolint
tsize = (u_quad_t)0;
#endif
/*
* Setting of flags and marking of atimes are not supported.
*/
if (vap->va_flags != VNOVAL || (vap->va_vaflags & VA_MARK_ATIME))
return (EOPNOTSUPP);
/*
* Disallow write attempts if the filesystem is mounted read-only.
*/
if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
(vp->v_mount->mnt_flag & MNT_RDONLY)) {
error = EROFS;
goto out;
}
if (vap->va_size != VNOVAL) {
switch (vp->v_type) {
case VDIR:
return (EISDIR);
case VCHR:
case VBLK:
case VSOCK:
case VFIFO:
if (vap->va_mtime.tv_sec == VNOVAL &&
vap->va_atime.tv_sec == VNOVAL &&
vap->va_mode == (mode_t)VNOVAL &&
vap->va_uid == (uid_t)VNOVAL &&
vap->va_gid == (gid_t)VNOVAL)
return (0);
vap->va_size = VNOVAL;
break;
default:
/*
* Disallow write attempts if the filesystem is
* mounted read-only.
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (EROFS);
/*
* We run vnode_pager_setsize() early (why?), so
* we must set np->n_size now to avoid vinvalbuf
* V_SAVE races that might setsize a lower
* value.
*/
mtx_lock(&np->n_mtx);
tsize = np->n_size;
mtx_unlock(&np->n_mtx);
error = nfs_meta_setsize(vp, ap->a_cred, td,
vap->va_size);
mtx_lock(&np->n_mtx);
if (np->n_flag & NMODIFIED) {
tsize = np->n_size;
mtx_unlock(&np->n_mtx);
if (vap->va_size == 0)
error = nfs_vinvalbuf(vp, 0, td, 1);
else
error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
if (error) {
vnode_pager_setsize(vp, tsize);
goto out;
}
} else
mtx_unlock(&np->n_mtx);
/*
* np->n_size has already been set to vap->va_size
* in nfs_meta_setsize(). We must set it again since
* nfs_loadattrcache() could be called through
* nfs_meta_setsize() and could modify np->n_size.
*/
mtx_lock(&np->n_mtx);
np->n_vattr.va_size = np->n_size = vap->va_size;
mtx_unlock(&np->n_mtx);
}
} else {
mtx_lock(&np->n_mtx);
if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) &&
(np->n_flag & NMODIFIED) && vp->v_type == VREG) {
mtx_unlock(&np->n_mtx);
if ((error = nfs_vinvalbuf(vp, V_SAVE, td, 1)) != 0 &&
(error == EINTR || error == EIO))
return error;
} else
mtx_unlock(&np->n_mtx);
}
error = nfs_setattrrpc(vp, vap, ap->a_cred);
if (error && vap->va_size != VNOVAL) {
mtx_lock(&np->n_mtx);
np->n_size = np->n_vattr.va_size = tsize;
vnode_pager_setsize(vp, tsize);
mtx_unlock(&np->n_mtx);
}
out:
return (error);
}
/*
* Do an nfs setattr rpc.
*/
static int
nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred)
{
struct nfsv2_sattr *sp;
struct nfsnode *np = VTONFS(vp);
caddr_t bpos, dpos;
u_int32_t *tl;
int error = 0, wccflag = NFSV3_WCCRATTR;
struct mbuf *mreq, *mrep, *md, *mb;
int v3 = NFS_ISV3(vp);
nfsstats.rpccnt[NFSPROC_SETATTR]++;
mreq = nfsm_reqhead(vp, NFSPROC_SETATTR, NFSX_FH(v3) + NFSX_SATTR(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
if (v3) {
nfsm_v3attrbuild(vap, TRUE);
tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
*tl = nfs_false;
} else {
sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
if (vap->va_mode == (mode_t)VNOVAL)
sp->sa_mode = nfs_xdrneg1;
else
sp->sa_mode = vtonfsv2_mode(vp->v_type, vap->va_mode);
if (vap->va_uid == (uid_t)VNOVAL)
sp->sa_uid = nfs_xdrneg1;
else
sp->sa_uid = txdr_unsigned(vap->va_uid);
if (vap->va_gid == (gid_t)VNOVAL)
sp->sa_gid = nfs_xdrneg1;
else
sp->sa_gid = txdr_unsigned(vap->va_gid);
sp->sa_size = txdr_unsigned(vap->va_size);
txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
}
nfsm_request(vp, NFSPROC_SETATTR, curthread, cred);
if (v3) {
np->n_modestamp = 0;
nfsm_wcc_data(vp, wccflag);
} else
nfsm_loadattr(vp, NULL);
m_freem(mrep);
nfsmout:
return (error);
}
/*
* nfs lookup call, one step at a time...
* First look in cache
* If not found, unlock the directory nfsnode and do the rpc
*/
static int
nfs_lookup(struct vop_lookup_args *ap)
{
struct componentname *cnp = ap->a_cnp;
struct vnode *dvp = ap->a_dvp;
struct vnode **vpp = ap->a_vpp;
int flags = cnp->cn_flags;
struct vnode *newvp;
struct nfsmount *nmp;
caddr_t bpos, dpos;
struct mbuf *mreq, *mrep, *md, *mb;
long len;
nfsfh_t *fhp;
struct nfsnode *np;
int error = 0, attrflag, fhsize;
int v3 = NFS_ISV3(dvp);
struct thread *td = cnp->cn_thread;
*vpp = NULLVP;
if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
if (dvp->v_type != VDIR)
return (ENOTDIR);
nmp = VFSTONFS(dvp->v_mount);
np = VTONFS(dvp);
if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0) {
*vpp = NULLVP;
return (error);
}
error = cache_lookup(dvp, vpp, cnp);
if (error > 0 && error != ENOENT)
return (error);
if (error == -1) {
struct vattr vattr;
newvp = *vpp;
if (!VOP_GETATTR(newvp, &vattr, cnp->cn_cred)
&& vattr.va_ctime.tv_sec == VTONFS(newvp)->n_ctime) {
nfsstats.lookupcache_hits++;
if (cnp->cn_nameiop != LOOKUP &&
(flags & ISLASTCN))
cnp->cn_flags |= SAVENAME;
return (0);
}
cache_purge(newvp);
if (dvp != newvp)
vput(newvp);
else
vrele(newvp);
*vpp = NULLVP;
}
error = 0;
newvp = NULLVP;
nfsstats.lookupcache_misses++;
nfsstats.rpccnt[NFSPROC_LOOKUP]++;
len = cnp->cn_namelen;
mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
nfsm_request(dvp, NFSPROC_LOOKUP, cnp->cn_thread, cnp->cn_cred);
if (error) {
if (v3) {
nfsm_postop_attr(dvp, attrflag);
m_freem(mrep);
}
goto nfsmout;
}
nfsm_getfh(fhp, fhsize, v3);
/*
* Handle RENAME case...
*/
if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
if (NFS_CMPFH(np, fhp, fhsize)) {
m_freem(mrep);
return (EISDIR);
}
error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, LK_EXCLUSIVE);
if (error) {
m_freem(mrep);
return (error);
}
newvp = NFSTOV(np);
if (v3) {
nfsm_postop_attr(newvp, attrflag);
nfsm_postop_attr(dvp, attrflag);
} else
nfsm_loadattr(newvp, NULL);
*vpp = newvp;
m_freem(mrep);
cnp->cn_flags |= SAVENAME;
return (0);
}
if (flags & ISDOTDOT) {
VOP_UNLOCK(dvp, 0);
error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
if (error)
return (error);
newvp = NFSTOV(np);
} else if (NFS_CMPFH(np, fhp, fhsize)) {
VREF(dvp);
newvp = dvp;
} else {
error = nfs_nget(dvp->v_mount, fhp, fhsize, &np, cnp->cn_lkflags);
if (error) {
m_freem(mrep);
return (error);
}
newvp = NFSTOV(np);
}
if (v3) {
nfsm_postop_attr(newvp, attrflag);
nfsm_postop_attr(dvp, attrflag);
} else
nfsm_loadattr(newvp, NULL);
if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
cnp->cn_flags |= SAVENAME;
if ((cnp->cn_flags & MAKEENTRY) &&
(cnp->cn_nameiop != DELETE || !(flags & ISLASTCN))) {
np->n_ctime = np->n_vattr.va_ctime.tv_sec;
cache_enter(dvp, newvp, cnp);
}
*vpp = newvp;
m_freem(mrep);
nfsmout:
if (error) {
if (newvp != NULLVP) {
vput(newvp);
*vpp = NULLVP;
}
if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
(flags & ISLASTCN) && error == ENOENT) {
if (dvp->v_mount->mnt_flag & MNT_RDONLY)
error = EROFS;
else
error = EJUSTRETURN;
}
if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
cnp->cn_flags |= SAVENAME;
}
return (error);
}
/*
* nfs read call.
* Just call nfs_bioread() to do the work.
*/
static int
nfs_read(struct vop_read_args *ap)
{
struct vnode *vp = ap->a_vp;
switch (vp->v_type) {
case VREG:
return (nfs_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
case VDIR:
return (EISDIR);
default:
return (EOPNOTSUPP);
}
}
/*
* nfs readlink call
*/
static int
nfs_readlink(struct vop_readlink_args *ap)
{
struct vnode *vp = ap->a_vp;
if (vp->v_type != VLNK)
return (EINVAL);
return (nfs_bioread(vp, ap->a_uio, 0, ap->a_cred));
}
/*
* Do a readlink rpc.
* Called by nfs_doio() from below the buffer cache.
*/
int
nfs_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
caddr_t bpos, dpos;
int error = 0, len, attrflag;
struct mbuf *mreq, *mrep, *md, *mb;
int v3 = NFS_ISV3(vp);
nfsstats.rpccnt[NFSPROC_READLINK]++;
mreq = nfsm_reqhead(vp, NFSPROC_READLINK, NFSX_FH(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
nfsm_request(vp, NFSPROC_READLINK, uiop->uio_td, cred);
if (v3)
nfsm_postop_attr(vp, attrflag);
if (!error) {
nfsm_strsiz(len, NFS_MAXPATHLEN);
if (len == NFS_MAXPATHLEN) {
struct nfsnode *np = VTONFS(vp);
mtx_lock(&np->n_mtx);
if (np->n_size && np->n_size < NFS_MAXPATHLEN)
len = np->n_size;
mtx_unlock(&np->n_mtx);
}
nfsm_mtouio(uiop, len);
}
m_freem(mrep);
nfsmout:
return (error);
}
/*
* nfs read rpc call
* Ditto above
*/
int
nfs_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
u_int32_t *tl;
caddr_t bpos, dpos;
struct mbuf *mreq, *mrep, *md, *mb;
struct nfsmount *nmp;
int error = 0, len, retlen, tsiz, eof, attrflag;
int v3 = NFS_ISV3(vp);
int rsize;
#ifndef nolint
eof = 0;
#endif
nmp = VFSTONFS(vp->v_mount);
tsiz = uiop->uio_resid;
mtx_lock(&nmp->nm_mtx);
if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
mtx_unlock(&nmp->nm_mtx);
return (EFBIG);
}
rsize = nmp->nm_rsize;
mtx_unlock(&nmp->nm_mtx);
while (tsiz > 0) {
nfsstats.rpccnt[NFSPROC_READ]++;
len = (tsiz > rsize) ? rsize : tsiz;
mreq = nfsm_reqhead(vp, NFSPROC_READ, NFSX_FH(v3) + NFSX_UNSIGNED * 3);
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED * 3);
if (v3) {
txdr_hyper(uiop->uio_offset, tl);
*(tl + 2) = txdr_unsigned(len);
} else {
*tl++ = txdr_unsigned(uiop->uio_offset);
*tl++ = txdr_unsigned(len);
*tl = 0;
}
nfsm_request(vp, NFSPROC_READ, uiop->uio_td, cred);
if (v3) {
nfsm_postop_attr(vp, attrflag);
if (error) {
m_freem(mrep);
goto nfsmout;
}
tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED);
eof = fxdr_unsigned(int, *(tl + 1));
} else {
nfsm_loadattr(vp, NULL);
}
nfsm_strsiz(retlen, rsize);
nfsm_mtouio(uiop, retlen);
m_freem(mrep);
tsiz -= retlen;
if (v3) {
if (eof || retlen == 0) {
tsiz = 0;
}
} else if (retlen < len) {
tsiz = 0;
}
}
nfsmout:
return (error);
}
/*
* nfs write call
*/
int
nfs_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
int *iomode, int *must_commit)
{
u_int32_t *tl;
int32_t backup;
caddr_t bpos, dpos;
struct mbuf *mreq, *mrep, *md, *mb;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
int error = 0, len, tsiz, wccflag = NFSV3_WCCRATTR, rlen, commit;
int v3 = NFS_ISV3(vp), committed = NFSV3WRITE_FILESYNC;
int wsize;
#ifdef DIAGNOSTIC
if (uiop->uio_iovcnt != 1)
panic("nfs: writerpc iovcnt > 1");
#endif
*must_commit = 0;
tsiz = uiop->uio_resid;
mtx_lock(&nmp->nm_mtx);
if (uiop->uio_offset + tsiz > nmp->nm_maxfilesize) {
mtx_unlock(&nmp->nm_mtx);
return (EFBIG);
}
wsize = nmp->nm_wsize;
mtx_unlock(&nmp->nm_mtx);
while (tsiz > 0) {
nfsstats.rpccnt[NFSPROC_WRITE]++;
len = (tsiz > wsize) ? wsize : tsiz;
mreq = nfsm_reqhead(vp, NFSPROC_WRITE,
NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
if (v3) {
tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
txdr_hyper(uiop->uio_offset, tl);
tl += 2;
*tl++ = txdr_unsigned(len);
*tl++ = txdr_unsigned(*iomode);
*tl = txdr_unsigned(len);
} else {
u_int32_t x;
tl = nfsm_build(u_int32_t *, 4 * NFSX_UNSIGNED);
/* Set both "begin" and "current" to non-garbage. */
x = txdr_unsigned((u_int32_t)uiop->uio_offset);
*tl++ = x; /* "begin offset" */
*tl++ = x; /* "current offset" */
x = txdr_unsigned(len);
*tl++ = x; /* total to this offset */
*tl = x; /* size of this write */
}
nfsm_uiotom(uiop, len);
nfsm_request(vp, NFSPROC_WRITE, uiop->uio_td, cred);
if (v3) {
wccflag = NFSV3_WCCCHK;
nfsm_wcc_data(vp, wccflag);
if (!error) {
tl = nfsm_dissect(u_int32_t *, 2 * NFSX_UNSIGNED
+ NFSX_V3WRITEVERF);
rlen = fxdr_unsigned(int, *tl++);
if (rlen == 0) {
error = NFSERR_IO;
m_freem(mrep);
break;
} else if (rlen < len) {
backup = len - rlen;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base -
backup;
uiop->uio_iov->iov_len += backup;
uiop->uio_offset -= backup;
uiop->uio_resid += backup;
len = rlen;
}
commit = fxdr_unsigned(int, *tl++);
/*
* Return the lowest commitment level
* obtained by any of the RPCs.
*/
if (committed == NFSV3WRITE_FILESYNC)
committed = commit;
else if (committed == NFSV3WRITE_DATASYNC &&
commit == NFSV3WRITE_UNSTABLE)
committed = commit;
mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
NFSX_V3WRITEVERF);
nmp->nm_state |= NFSSTA_HASWRITEVERF;
} else if (bcmp((caddr_t)tl,
(caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF)) {
*must_commit = 1;
bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
NFSX_V3WRITEVERF);
}
mtx_unlock(&nmp->nm_mtx);
}
} else {
nfsm_loadattr(vp, NULL);
}
if (wccflag) {
mtx_lock(&(VTONFS(vp))->n_mtx);
VTONFS(vp)->n_mtime = VTONFS(vp)->n_vattr.va_mtime;
mtx_unlock(&(VTONFS(vp))->n_mtx);
}
m_freem(mrep);
if (error)
break;
tsiz -= len;
}
nfsmout:
if (vp->v_mount->mnt_kern_flag & MNTK_ASYNC)
committed = NFSV3WRITE_FILESYNC;
*iomode = committed;
if (error)
uiop->uio_resid = tsiz;
return (error);
}
/*
* nfs mknod rpc
* For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
* mode set to specify the file type and the size field for rdev.
*/
static int
nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
struct vattr *vap)
{
struct nfsv2_sattr *sp;
u_int32_t *tl;
struct vnode *newvp = NULL;
struct nfsnode *np = NULL;
struct vattr vattr;
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0;
struct mbuf *mreq, *mrep, *md, *mb;
u_int32_t rdev;
int v3 = NFS_ISV3(dvp);
if (vap->va_type == VCHR || vap->va_type == VBLK)
rdev = txdr_unsigned(vap->va_rdev);
else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
rdev = nfs_xdrneg1;
else {
return (EOPNOTSUPP);
}
if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
return (error);
nfsstats.rpccnt[NFSPROC_MKNOD]++;
mreq = nfsm_reqhead(dvp, NFSPROC_MKNOD, NFSX_FH(v3) + 4 * NFSX_UNSIGNED +
nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
if (v3) {
tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
*tl++ = vtonfsv3_type(vap->va_type);
nfsm_v3attrbuild(vap, FALSE);
if (vap->va_type == VCHR || vap->va_type == VBLK) {
tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = txdr_unsigned(umajor(vap->va_rdev));
*tl = txdr_unsigned(uminor(vap->va_rdev));
}
} else {
sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
sp->sa_uid = nfs_xdrneg1;
sp->sa_gid = nfs_xdrneg1;
sp->sa_size = rdev;
txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
}
nfsm_request(dvp, NFSPROC_MKNOD, cnp->cn_thread, cnp->cn_cred);
if (!error) {
nfsm_mtofh(dvp, newvp, v3, gotvp);
if (!gotvp) {
if (newvp) {
vput(newvp);
newvp = NULL;
}
error = nfs_lookitup(dvp, cnp->cn_nameptr,
cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
if (!error)
newvp = NFSTOV(np);
}
}
if (v3)
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
if (error) {
if (newvp)
vput(newvp);
} else {
if (cnp->cn_flags & MAKEENTRY)
cache_enter(dvp, newvp, cnp);
*vpp = newvp;
}
mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
mtx_unlock(&(VTONFS(dvp))->n_mtx);
return (error);
}
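/*
 * Editorial sketch, not part of the change: the NFSv2 "create as mknod"
 * packing described above, isolated.  The struct and octal constants are
 * local stand-ins (0020000/0060000 are the classic IFCHR/IFBLK values);
 * only the packing rule is the point.
 */
struct sketch_v2sattr {
	unsigned int mode;	/* IFMT bits select the file type */
	unsigned int size;	/* overloaded to carry rdev for devices */
};

static void
sketch_v2_mknod_pack(struct sketch_v2sattr *sp, int ischr,
    unsigned int perm, unsigned int rdev)
{
	sp->mode = (ischr ? 0020000 : 0060000) | (perm & 07777);
	sp->size = rdev;	/* the v2 sattr has no rdev field */
}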
/*
* nfs mknod vop
* just call nfs_mknodrpc() to do the work.
*/
/* ARGSUSED */
static int
nfs_mknod(struct vop_mknod_args *ap)
{
return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
}
static u_long create_verf;
/*
* nfs file create call
*/
static int
nfs_create(struct vop_create_args *ap)
{
struct vnode *dvp = ap->a_dvp;
struct vattr *vap = ap->a_vap;
struct componentname *cnp = ap->a_cnp;
struct nfsv2_sattr *sp;
u_int32_t *tl;
struct nfsnode *np = NULL;
struct vnode *newvp = NULL;
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR, gotvp = 0, fmode = 0;
struct mbuf *mreq, *mrep, *md, *mb;
struct vattr vattr;
int v3 = NFS_ISV3(dvp);
/*
* Oops, not for me..
*/
if (vap->va_type == VSOCK)
return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));
if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
return (error);
if (vap->va_vaflags & VA_EXCLUSIVE)
fmode |= O_EXCL;
again:
nfsstats.rpccnt[NFSPROC_CREATE]++;
mreq = nfsm_reqhead(dvp, NFSPROC_CREATE, NFSX_FH(v3) + 2 * NFSX_UNSIGNED +
nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
if (v3) {
tl = nfsm_build(u_int32_t *, NFSX_UNSIGNED);
if (fmode & O_EXCL) {
+ CURVNET_SET(VFSTONFS(dvp->v_mount)->nm_so->so_vnet);
*tl = txdr_unsigned(NFSV3CREATE_EXCLUSIVE);
tl = nfsm_build(u_int32_t *, NFSX_V3CREATEVERF);
#ifdef INET
+ INIT_VNET_INET(curvnet);
if (!TAILQ_EMPTY(&V_in_ifaddrhead))
*tl++ = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr.s_addr;
else
#endif
*tl++ = create_verf;
*tl = ++create_verf;
+ CURVNET_RESTORE();
} else {
*tl = txdr_unsigned(NFSV3CREATE_UNCHECKED);
nfsm_v3attrbuild(vap, FALSE);
}
} else {
sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
sp->sa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode);
sp->sa_uid = nfs_xdrneg1;
sp->sa_gid = nfs_xdrneg1;
sp->sa_size = 0;
txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
}
nfsm_request(dvp, NFSPROC_CREATE, cnp->cn_thread, cnp->cn_cred);
if (!error) {
nfsm_mtofh(dvp, newvp, v3, gotvp);
if (!gotvp) {
if (newvp) {
vput(newvp);
newvp = NULL;
}
error = nfs_lookitup(dvp, cnp->cn_nameptr,
cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread, &np);
if (!error)
newvp = NFSTOV(np);
}
}
if (v3)
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
if (error) {
if (v3 && (fmode & O_EXCL) && error == NFSERR_NOTSUPP) {
fmode &= ~O_EXCL;
goto again;
}
if (newvp)
vput(newvp);
} else if (v3 && (fmode & O_EXCL)) {
/*
* We are normally called with only a partially initialized
* VAP. Since the NFSv3 spec says that the server may use the
* file attributes to store the verifier, the spec requires
* us to do a SETATTR RPC. FreeBSD servers store the verifier
* in atime, but we can't really assume that all servers will,
* so we ensure that our SETATTR sets both atime and mtime.
*/
if (vap->va_mtime.tv_sec == VNOVAL)
vfs_timestamp(&vap->va_mtime);
if (vap->va_atime.tv_sec == VNOVAL)
vap->va_atime = vap->va_mtime;
error = nfs_setattrrpc(newvp, vap, cnp->cn_cred);
if (error)
vput(newvp);
}
if (!error) {
if (cnp->cn_flags & MAKEENTRY)
cache_enter(dvp, newvp, cnp);
*ap->a_vpp = newvp;
}
mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
mtx_unlock(&(VTONFS(dvp))->n_mtx);
return (error);
}
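/*
 * Editorial sketch, not part of the commit: the exclusive-create branch
 * above must send an 8-byte verifier that is unique per client and per
 * attempt.  A standalone model of how it is seeded (primary IP address
 * when INET is configured, otherwise just the counter); the function name
 * and parameters are hypothetical.
 */
static void
sketch_makeverf(unsigned int ipaddr, unsigned long *counter,
    unsigned int verf[2])
{
	/*
	 * Word 0 identifies the client; word 1 makes each retry distinct,
	 * since the server remembers the verifier of a successful
	 * exclusive create and fails a mismatched retry with EEXIST.
	 */
	verf[0] = (ipaddr != 0) ? ipaddr : (unsigned int)*counter;
	verf[1] = (unsigned int)++(*counter);
}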
/*
* nfs file remove call
* To try and make nfs semantics closer to ufs semantics, a file that has
* other processes using the vnode is renamed instead of removed and then
* removed later on the last close.
* - If v_usecount > 1
* If a rename is not already in the works
* call nfs_sillyrename() to set it up
* else
* do the remove rpc
*/
static int
nfs_remove(struct vop_remove_args *ap)
{
struct vnode *vp = ap->a_vp;
struct vnode *dvp = ap->a_dvp;
struct componentname *cnp = ap->a_cnp;
struct nfsnode *np = VTONFS(vp);
int error = 0;
struct vattr vattr;
#ifdef DIAGNOSTIC
if ((cnp->cn_flags & HASBUF) == 0)
panic("nfs_remove: no name");
if (vrefcnt(vp) < 1)
panic("nfs_remove: bad v_usecount");
#endif
if (vp->v_type == VDIR)
error = EPERM;
else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
!VOP_GETATTR(vp, &vattr, cnp->cn_cred) && vattr.va_nlink > 1)) {
/*
* Purge the name cache so that the chance of a lookup for
* the name succeeding while the remove is in progress is
* minimized. Without node locking it can still happen, such
* that an I/O op returns ESTALE, but that is no worse than
* what happens when another host removes the file.
*/
cache_purge(vp);
/*
* throw away biocache buffers, mainly to avoid
* unnecessary delayed writes later.
*/
error = nfs_vinvalbuf(vp, 0, cnp->cn_thread, 1);
/* Do the rpc */
if (error != EINTR && error != EIO)
error = nfs_removerpc(dvp, cnp->cn_nameptr,
cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
/*
* Kludge City: If the first reply to the remove rpc is lost,
* the reply to the retransmitted request will be ENOENT
* since the file was in fact removed.
* Therefore, we cheat and return success.
*/
if (error == ENOENT)
error = 0;
} else if (!np->n_sillyrename)
error = nfs_sillyrename(dvp, vp, cnp);
np->n_attrstamp = 0;
return (error);
}
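/*
 * Editorial sketch, not part of the change: the remove-vs-sillyrename
 * decision documented above nfs_remove(), reduced to a predicate over
 * simplified stand-ins for the vnode state the real code inspects.
 */
static int
sketch_can_remove_now(int refcnt, int has_sillyrename, int nlink)
{
	/*
	 * Remove immediately if we hold the only reference, or if the
	 * file was already sillyrenamed and other links keep its data
	 * reachable; otherwise the caller must sillyrename instead.
	 */
	return (refcnt == 1 || (has_sillyrename && nlink > 1));
}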
/*
* nfs file remove rpc called from nfs_inactive
*/
int
nfs_removeit(struct sillyrename *sp)
{
/*
* Make sure that the directory vnode is still valid.
* XXX we should lock sp->s_dvp here.
*/
if (sp->s_dvp->v_type == VBAD)
return (0);
return (nfs_removerpc(sp->s_dvp, sp->s_name, sp->s_namlen, sp->s_cred,
NULL));
}
/*
* Nfs remove rpc, called from nfs_remove() and nfs_removeit().
*/
static int
nfs_removerpc(struct vnode *dvp, const char *name, int namelen,
struct ucred *cred, struct thread *td)
{
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR;
struct mbuf *mreq, *mrep, *md, *mb;
int v3 = NFS_ISV3(dvp);
nfsstats.rpccnt[NFSPROC_REMOVE]++;
mreq = nfsm_reqhead(dvp, NFSPROC_REMOVE,
NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(namelen));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(name, namelen, NFS_MAXNAMLEN);
nfsm_request(dvp, NFSPROC_REMOVE, td, cred);
if (v3)
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
mtx_unlock(&(VTONFS(dvp))->n_mtx);
return (error);
}
/*
* nfs file rename call
*/
static int
nfs_rename(struct vop_rename_args *ap)
{
struct vnode *fvp = ap->a_fvp;
struct vnode *tvp = ap->a_tvp;
struct vnode *fdvp = ap->a_fdvp;
struct vnode *tdvp = ap->a_tdvp;
struct componentname *tcnp = ap->a_tcnp;
struct componentname *fcnp = ap->a_fcnp;
int error;
#ifdef DIAGNOSTIC
if ((tcnp->cn_flags & HASBUF) == 0 ||
(fcnp->cn_flags & HASBUF) == 0)
panic("nfs_rename: no name");
#endif
/* Check for cross-device rename */
if ((fvp->v_mount != tdvp->v_mount) ||
(tvp && (fvp->v_mount != tvp->v_mount))) {
error = EXDEV;
goto out;
}
if (fvp == tvp) {
nfs_printf("nfs_rename: fvp == tvp (can't happen)\n");
error = 0;
goto out;
}
if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
goto out;
/*
* We have to flush B_DELWRI data prior to renaming
* the file. If we don't, the delayed-write buffers
* can be flushed out later after the file has gone stale
* under NFSV3. NFSV2 does not have this problem because
* (as far as I can tell) it flushes dirty buffers more
* often.
*
* Skip the rename operation if the fsync fails; this can happen
* due to the server's volume being full when we push out data
* that was written back to our cache earlier. Not checking for
* this condition can result in potential (silent) data loss.
*/
error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
VOP_UNLOCK(fvp, 0);
if (!error && tvp)
error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
if (error)
goto out;
/*
* If the tvp exists and is in use, sillyrename it before doing the
* rename of the new file over it.
* XXX Can't sillyrename a directory.
*/
if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
vput(tvp);
tvp = NULL;
}
error = nfs_renamerpc(fdvp, fcnp->cn_nameptr, fcnp->cn_namelen,
tdvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
tcnp->cn_thread);
if (fvp->v_type == VDIR) {
if (tvp != NULL && tvp->v_type == VDIR)
cache_purge(tdvp);
cache_purge(fdvp);
}
out:
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
vrele(fdvp);
vrele(fvp);
/*
* Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
*/
if (error == ENOENT)
error = 0;
return (error);
}
/*
* nfs file rename rpc called from nfs_remove() above
*/
static int
nfs_renameit(struct vnode *sdvp, struct componentname *scnp,
struct sillyrename *sp)
{
return (nfs_renamerpc(sdvp, scnp->cn_nameptr, scnp->cn_namelen, sdvp,
sp->s_name, sp->s_namlen, scnp->cn_cred, scnp->cn_thread));
}
/*
* Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
*/
static int
nfs_renamerpc(struct vnode *fdvp, const char *fnameptr, int fnamelen,
struct vnode *tdvp, const char *tnameptr, int tnamelen, struct ucred *cred,
struct thread *td)
{
caddr_t bpos, dpos;
int error = 0, fwccflag = NFSV3_WCCRATTR, twccflag = NFSV3_WCCRATTR;
struct mbuf *mreq, *mrep, *md, *mb;
int v3 = NFS_ISV3(fdvp);
nfsstats.rpccnt[NFSPROC_RENAME]++;
mreq = nfsm_reqhead(fdvp, NFSPROC_RENAME,
(NFSX_FH(v3) + NFSX_UNSIGNED)*2 + nfsm_rndup(fnamelen) +
nfsm_rndup(tnamelen));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(fdvp, v3);
nfsm_strtom(fnameptr, fnamelen, NFS_MAXNAMLEN);
nfsm_fhtom(tdvp, v3);
nfsm_strtom(tnameptr, tnamelen, NFS_MAXNAMLEN);
nfsm_request(fdvp, NFSPROC_RENAME, td, cred);
if (v3) {
nfsm_wcc_data(fdvp, fwccflag);
nfsm_wcc_data(tdvp, twccflag);
}
m_freem(mrep);
nfsmout:
mtx_lock(&(VTONFS(fdvp))->n_mtx);
VTONFS(fdvp)->n_flag |= NMODIFIED;
mtx_unlock(&(VTONFS(fdvp))->n_mtx);
mtx_lock(&(VTONFS(tdvp))->n_mtx);
VTONFS(tdvp)->n_flag |= NMODIFIED;
mtx_unlock(&(VTONFS(tdvp))->n_mtx);
if (!fwccflag)
VTONFS(fdvp)->n_attrstamp = 0;
if (!twccflag)
VTONFS(tdvp)->n_attrstamp = 0;
return (error);
}
/*
* nfs hard link create call
*/
static int
nfs_link(struct vop_link_args *ap)
{
struct vnode *vp = ap->a_vp;
struct vnode *tdvp = ap->a_tdvp;
struct componentname *cnp = ap->a_cnp;
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR, attrflag = 0;
struct mbuf *mreq, *mrep, *md, *mb;
int v3;
if (vp->v_mount != tdvp->v_mount) {
return (EXDEV);
}
/*
* Push all writes to the server, so that the attribute cache
* doesn't get "out of sync" with the server.
* XXX There should be a better way!
*/
VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
v3 = NFS_ISV3(vp);
nfsstats.rpccnt[NFSPROC_LINK]++;
mreq = nfsm_reqhead(vp, NFSPROC_LINK,
NFSX_FH(v3)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
nfsm_fhtom(tdvp, v3);
nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
nfsm_request(vp, NFSPROC_LINK, cnp->cn_thread, cnp->cn_cred);
if (v3) {
nfsm_postop_attr(vp, attrflag);
nfsm_wcc_data(tdvp, wccflag);
}
m_freem(mrep);
nfsmout:
mtx_lock(&(VTONFS(tdvp))->n_mtx);
VTONFS(tdvp)->n_flag |= NMODIFIED;
mtx_unlock(&(VTONFS(tdvp))->n_mtx);
if (!attrflag)
VTONFS(vp)->n_attrstamp = 0;
if (!wccflag)
VTONFS(tdvp)->n_attrstamp = 0;
return (error);
}
/*
* nfs symbolic link create call
*/
static int
nfs_symlink(struct vop_symlink_args *ap)
{
struct vnode *dvp = ap->a_dvp;
struct vattr *vap = ap->a_vap;
struct componentname *cnp = ap->a_cnp;
struct nfsv2_sattr *sp;
caddr_t bpos, dpos;
int slen, error = 0, wccflag = NFSV3_WCCRATTR, gotvp;
struct mbuf *mreq, *mrep, *md, *mb;
struct vnode *newvp = NULL;
int v3 = NFS_ISV3(dvp);
nfsstats.rpccnt[NFSPROC_SYMLINK]++;
slen = strlen(ap->a_target);
mreq = nfsm_reqhead(dvp, NFSPROC_SYMLINK, NFSX_FH(v3) + 2*NFSX_UNSIGNED +
nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
if (v3) {
nfsm_v3attrbuild(vap, FALSE);
}
nfsm_strtom(ap->a_target, slen, NFS_MAXPATHLEN);
if (!v3) {
sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
sp->sa_mode = vtonfsv2_mode(VLNK, vap->va_mode);
sp->sa_uid = nfs_xdrneg1;
sp->sa_gid = nfs_xdrneg1;
sp->sa_size = nfs_xdrneg1;
txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
}
/*
* Issue the NFS request and get the rpc response.
*
* Only NFSv3 responses returning an error of 0 actually return
* a file handle that can be converted into newvp without having
* to do an extra lookup rpc.
*/
nfsm_request(dvp, NFSPROC_SYMLINK, cnp->cn_thread, cnp->cn_cred);
if (v3) {
if (error == 0)
nfsm_mtofh(dvp, newvp, v3, gotvp);
nfsm_wcc_data(dvp, wccflag);
}
/*
* out code jumps -> here, mrep is also freed.
*/
m_freem(mrep);
nfsmout:
/*
* If we do not have an error and we could not extract the newvp from
* the response due to the request being NFSv2, we have to do a
* lookup in order to obtain a newvp to return.
*/
if (error == 0 && newvp == NULL) {
struct nfsnode *np = NULL;
error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_cred, cnp->cn_thread, &np);
if (!error)
newvp = NFSTOV(np);
}
if (error) {
if (newvp)
vput(newvp);
} else {
*ap->a_vpp = newvp;
}
mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
mtx_unlock(&(VTONFS(dvp))->n_mtx);
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
return (error);
}
/*
* nfs make dir call
*/
static int
nfs_mkdir(struct vop_mkdir_args *ap)
{
struct vnode *dvp = ap->a_dvp;
struct vattr *vap = ap->a_vap;
struct componentname *cnp = ap->a_cnp;
struct nfsv2_sattr *sp;
int len;
struct nfsnode *np = NULL;
struct vnode *newvp = NULL;
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR;
int gotvp = 0;
struct mbuf *mreq, *mrep, *md, *mb;
struct vattr vattr;
int v3 = NFS_ISV3(dvp);
if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
return (error);
len = cnp->cn_namelen;
nfsstats.rpccnt[NFSPROC_MKDIR]++;
mreq = nfsm_reqhead(dvp, NFSPROC_MKDIR,
NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len) + NFSX_SATTR(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(cnp->cn_nameptr, len, NFS_MAXNAMLEN);
if (v3) {
nfsm_v3attrbuild(vap, FALSE);
} else {
sp = nfsm_build(struct nfsv2_sattr *, NFSX_V2SATTR);
sp->sa_mode = vtonfsv2_mode(VDIR, vap->va_mode);
sp->sa_uid = nfs_xdrneg1;
sp->sa_gid = nfs_xdrneg1;
sp->sa_size = nfs_xdrneg1;
txdr_nfsv2time(&vap->va_atime, &sp->sa_atime);
txdr_nfsv2time(&vap->va_mtime, &sp->sa_mtime);
}
nfsm_request(dvp, NFSPROC_MKDIR, cnp->cn_thread, cnp->cn_cred);
if (!error)
nfsm_mtofh(dvp, newvp, v3, gotvp);
if (v3)
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
mtx_unlock(&(VTONFS(dvp))->n_mtx);
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
if (error == 0 && newvp == NULL) {
error = nfs_lookitup(dvp, cnp->cn_nameptr, len, cnp->cn_cred,
cnp->cn_thread, &np);
if (!error) {
newvp = NFSTOV(np);
if (newvp->v_type != VDIR)
error = EEXIST;
}
}
if (error) {
if (newvp)
vput(newvp);
} else
*ap->a_vpp = newvp;
return (error);
}
/*
* nfs remove directory call
*/
static int
nfs_rmdir(struct vop_rmdir_args *ap)
{
struct vnode *vp = ap->a_vp;
struct vnode *dvp = ap->a_dvp;
struct componentname *cnp = ap->a_cnp;
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR;
struct mbuf *mreq, *mrep, *md, *mb;
int v3 = NFS_ISV3(dvp);
if (dvp == vp)
return (EINVAL);
nfsstats.rpccnt[NFSPROC_RMDIR]++;
mreq = nfsm_reqhead(dvp, NFSPROC_RMDIR,
NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(cnp->cn_nameptr, cnp->cn_namelen, NFS_MAXNAMLEN);
nfsm_request(dvp, NFSPROC_RMDIR, cnp->cn_thread, cnp->cn_cred);
if (v3)
nfsm_wcc_data(dvp, wccflag);
m_freem(mrep);
nfsmout:
mtx_lock(&(VTONFS(dvp))->n_mtx);
VTONFS(dvp)->n_flag |= NMODIFIED;
mtx_unlock(&(VTONFS(dvp))->n_mtx);
if (!wccflag)
VTONFS(dvp)->n_attrstamp = 0;
cache_purge(dvp);
cache_purge(vp);
/*
* Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
*/
if (error == ENOENT)
error = 0;
return (error);
}
/*
* nfs readdir call
*/
static int
nfs_readdir(struct vop_readdir_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
struct uio *uio = ap->a_uio;
int tresid, error = 0;
struct vattr vattr;
if (vp->v_type != VDIR)
return(EPERM);
/*
* First, check for hit on the EOF offset cache
*/
if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
(np->n_flag & NMODIFIED) == 0) {
if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
mtx_lock(&np->n_mtx);
if (!NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
mtx_unlock(&np->n_mtx);
nfsstats.direofcache_hits++;
goto out;
} else
mtx_unlock(&np->n_mtx);
}
}
/*
* Call nfs_bioread() to do the real work.
*/
tresid = uio->uio_resid;
error = nfs_bioread(vp, uio, 0, ap->a_cred);
if (!error && uio->uio_resid == tresid) {
nfsstats.direofcache_misses++;
}
out:
return (error);
}
/*
* Readdir rpc call.
* Called from below the buffer cache by nfs_doio().
*/
int
nfs_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
int len, left;
struct dirent *dp = NULL;
u_int32_t *tl;
caddr_t cp;
nfsuint64 *cookiep;
caddr_t bpos, dpos;
struct mbuf *mreq, *mrep, *md, *mb;
nfsuint64 cookie;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
struct nfsnode *dnp = VTONFS(vp);
u_quad_t fileno;
int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1;
int attrflag;
int v3 = NFS_ISV3(vp);
#ifdef DIAGNOSTIC
if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
(uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirrpc bad uio");
#endif
/*
* If there is no cookie, assume directory was stale.
*/
nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
if (cookiep) {
cookie = *cookiep;
nfs_dircookie_unlock(dnp);
} else {
nfs_dircookie_unlock(dnp);
return (NFSERR_BAD_COOKIE);
}
/*
* Loop around doing readdir rpc's of size nm_readdirsize
* truncated to a multiple of DIRBLKSIZ.
* The stopping criterion is EOF or a full buffer.
*/
while (more_dirs && bigenough) {
nfsstats.rpccnt[NFSPROC_READDIR]++;
mreq = nfsm_reqhead(vp, NFSPROC_READDIR, NFSX_FH(v3) +
NFSX_READDIR(v3));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, v3);
if (v3) {
tl = nfsm_build(u_int32_t *, 5 * NFSX_UNSIGNED);
*tl++ = cookie.nfsuquad[0];
*tl++ = cookie.nfsuquad[1];
mtx_lock(&dnp->n_mtx);
*tl++ = dnp->n_cookieverf.nfsuquad[0];
*tl++ = dnp->n_cookieverf.nfsuquad[1];
mtx_unlock(&dnp->n_mtx);
} else {
tl = nfsm_build(u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = cookie.nfsuquad[0];
}
*tl = txdr_unsigned(nmp->nm_readdirsize);
nfsm_request(vp, NFSPROC_READDIR, uiop->uio_td, cred);
if (v3) {
nfsm_postop_attr(vp, attrflag);
if (!error) {
tl = nfsm_dissect(u_int32_t *,
2 * NFSX_UNSIGNED);
mtx_lock(&dnp->n_mtx);
dnp->n_cookieverf.nfsuquad[0] = *tl++;
dnp->n_cookieverf.nfsuquad[1] = *tl;
mtx_unlock(&dnp->n_mtx);
} else {
m_freem(mrep);
goto nfsmout;
}
}
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
more_dirs = fxdr_unsigned(int, *tl);
/* loop thru the dir entries, doctoring them to 4bsd form */
while (more_dirs && bigenough) {
if (v3) {
tl = nfsm_dissect(u_int32_t *,
3 * NFSX_UNSIGNED);
fileno = fxdr_hyper(tl);
len = fxdr_unsigned(int, *(tl + 2));
} else {
tl = nfsm_dissect(u_int32_t *,
2 * NFSX_UNSIGNED);
fileno = fxdr_unsigned(u_quad_t, *tl++);
len = fxdr_unsigned(int, *tl);
}
if (len <= 0 || len > NFS_MAXNAMLEN) {
error = EBADRPC;
m_freem(mrep);
goto nfsmout;
}
tlen = nfsm_rndup(len);
if (tlen == len)
tlen += 4; /* To ensure null termination */
left = DIRBLKSIZ - blksiz;
if ((tlen + DIRHDSIZ) > left) {
dp->d_reclen += left;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + left;
uiop->uio_iov->iov_len -= left;
uiop->uio_offset += left;
uiop->uio_resid -= left;
blksiz = 0;
}
if ((tlen + DIRHDSIZ) > uiop->uio_resid)
bigenough = 0;
if (bigenough) {
dp = (struct dirent *)uiop->uio_iov->iov_base;
dp->d_fileno = (int)fileno;
dp->d_namlen = len;
dp->d_reclen = tlen + DIRHDSIZ;
dp->d_type = DT_UNKNOWN;
blksiz += dp->d_reclen;
if (blksiz == DIRBLKSIZ)
blksiz = 0;
uiop->uio_offset += DIRHDSIZ;
uiop->uio_resid -= DIRHDSIZ;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + DIRHDSIZ;
uiop->uio_iov->iov_len -= DIRHDSIZ;
nfsm_mtouio(uiop, len);
cp = uiop->uio_iov->iov_base;
tlen -= len;
*cp = '\0'; /* null terminate */
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + tlen;
uiop->uio_iov->iov_len -= tlen;
uiop->uio_offset += tlen;
uiop->uio_resid -= tlen;
} else
nfsm_adv(nfsm_rndup(len));
if (v3) {
tl = nfsm_dissect(u_int32_t *,
3 * NFSX_UNSIGNED);
} else {
tl = nfsm_dissect(u_int32_t *,
2 * NFSX_UNSIGNED);
}
if (bigenough) {
cookie.nfsuquad[0] = *tl++;
if (v3)
cookie.nfsuquad[1] = *tl++;
} else if (v3)
tl += 2;
else
tl++;
more_dirs = fxdr_unsigned(int, *tl);
}
/*
* If at end of rpc data, get the eof boolean
*/
if (!more_dirs) {
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
more_dirs = (fxdr_unsigned(int, *tl) == 0);
}
m_freem(mrep);
}
/*
* Fill the last record, if any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
left = DIRBLKSIZ - blksiz;
dp->d_reclen += left;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + left;
uiop->uio_iov->iov_len -= left;
uiop->uio_offset += left;
uiop->uio_resid -= left;
}
/*
* We are now either at the end of the directory or have filled the
* block.
*/
if (bigenough)
dnp->n_direofoffset = uiop->uio_offset;
else {
if (uiop->uio_resid > 0)
nfs_printf("EEK! readdirrpc resid > 0\n");
nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
*cookiep = cookie;
nfs_dircookie_unlock(dnp);
}
nfsmout:
return (error);
}
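/*
 * Editorial sketch, not part of the change: the record-size arithmetic
 * used by the readdir loops above, isolated.  nfsm_rndup() rounds to a
 * 4-byte boundary; when the name length is already a multiple of 4, four
 * extra bytes are taken so the name can always be NUL-terminated.  The
 * last record of each block is then padded further so every DIRBLKSIZ
 * block is fully consumed.  DIRHDSIZ is passed in rather than assumed.
 */
static int
sketch_dirent_reclen(int namelen, int dirhdsiz)
{
	int tlen = (namelen + 3) & ~3;	/* nfsm_rndup() equivalent */

	if (tlen == namelen)
		tlen += 4;		/* room for the terminating NUL */
	return (tlen + dirhdsiz);
}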
/*
* NFS V3 readdir plus RPC. Used in place of nfs_readdirrpc().
*/
int
nfs_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
{
int len, left;
struct dirent *dp;
u_int32_t *tl;
caddr_t cp;
struct vnode *newvp;
nfsuint64 *cookiep;
caddr_t bpos, dpos, dpossav1, dpossav2;
struct mbuf *mreq, *mrep, *md, *mb, *mdsav1, *mdsav2;
struct nameidata nami, *ndp = &nami;
struct componentname *cnp = &ndp->ni_cnd;
nfsuint64 cookie;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
struct nfsnode *dnp = VTONFS(vp), *np;
nfsfh_t *fhp;
u_quad_t fileno;
int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
int attrflag, fhsize;
#ifndef nolint
dp = NULL;
#endif
#ifdef DIAGNOSTIC
if (uiop->uio_iovcnt != 1 || (uiop->uio_offset & (DIRBLKSIZ - 1)) ||
(uiop->uio_resid & (DIRBLKSIZ - 1)))
panic("nfs readdirplusrpc bad uio");
#endif
ndp->ni_dvp = vp;
newvp = NULLVP;
/*
* If there is no cookie, assume directory was stale.
*/
nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 0);
if (cookiep) {
cookie = *cookiep;
nfs_dircookie_unlock(dnp);
} else {
nfs_dircookie_unlock(dnp);
return (NFSERR_BAD_COOKIE);
}
/*
* Loop around doing readdir rpc's of size nm_readdirsize
* truncated to a multiple of DIRBLKSIZ.
* The stopping criterion is EOF or a full buffer.
*/
while (more_dirs && bigenough) {
nfsstats.rpccnt[NFSPROC_READDIRPLUS]++;
mreq = nfsm_reqhead(vp, NFSPROC_READDIRPLUS,
NFSX_FH(1) + 6 * NFSX_UNSIGNED);
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, 1);
tl = nfsm_build(u_int32_t *, 6 * NFSX_UNSIGNED);
*tl++ = cookie.nfsuquad[0];
*tl++ = cookie.nfsuquad[1];
mtx_lock(&dnp->n_mtx);
*tl++ = dnp->n_cookieverf.nfsuquad[0];
*tl++ = dnp->n_cookieverf.nfsuquad[1];
mtx_unlock(&dnp->n_mtx);
*tl++ = txdr_unsigned(nmp->nm_readdirsize);
*tl = txdr_unsigned(nmp->nm_rsize);
nfsm_request(vp, NFSPROC_READDIRPLUS, uiop->uio_td, cred);
nfsm_postop_attr(vp, attrflag);
if (error) {
m_freem(mrep);
goto nfsmout;
}
tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
mtx_lock(&dnp->n_mtx);
dnp->n_cookieverf.nfsuquad[0] = *tl++;
dnp->n_cookieverf.nfsuquad[1] = *tl++;
mtx_unlock(&dnp->n_mtx);
more_dirs = fxdr_unsigned(int, *tl);
/* loop thru the dir entries, doctoring them to 4bsd form */
while (more_dirs && bigenough) {
tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
fileno = fxdr_hyper(tl);
len = fxdr_unsigned(int, *(tl + 2));
if (len <= 0 || len > NFS_MAXNAMLEN) {
error = EBADRPC;
m_freem(mrep);
goto nfsmout;
}
tlen = nfsm_rndup(len);
if (tlen == len)
tlen += 4; /* To ensure null termination*/
left = DIRBLKSIZ - blksiz;
if ((tlen + DIRHDSIZ) > left) {
dp->d_reclen += left;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + left;
uiop->uio_iov->iov_len -= left;
uiop->uio_offset += left;
uiop->uio_resid -= left;
blksiz = 0;
}
if ((tlen + DIRHDSIZ) > uiop->uio_resid)
bigenough = 0;
if (bigenough) {
dp = (struct dirent *)uiop->uio_iov->iov_base;
dp->d_fileno = (int)fileno;
dp->d_namlen = len;
dp->d_reclen = tlen + DIRHDSIZ;
dp->d_type = DT_UNKNOWN;
blksiz += dp->d_reclen;
if (blksiz == DIRBLKSIZ)
blksiz = 0;
uiop->uio_offset += DIRHDSIZ;
uiop->uio_resid -= DIRHDSIZ;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + DIRHDSIZ;
uiop->uio_iov->iov_len -= DIRHDSIZ;
cnp->cn_nameptr = uiop->uio_iov->iov_base;
cnp->cn_namelen = len;
nfsm_mtouio(uiop, len);
cp = uiop->uio_iov->iov_base;
tlen -= len;
*cp = '\0';
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + tlen;
uiop->uio_iov->iov_len -= tlen;
uiop->uio_offset += tlen;
uiop->uio_resid -= tlen;
} else
nfsm_adv(nfsm_rndup(len));
tl = nfsm_dissect(u_int32_t *, 3 * NFSX_UNSIGNED);
if (bigenough) {
cookie.nfsuquad[0] = *tl++;
cookie.nfsuquad[1] = *tl++;
} else
tl += 2;
/*
* Since the attributes are before the file handle
* (sigh), we must skip over the attributes and then
* come back and get them.
*/
attrflag = fxdr_unsigned(int, *tl);
if (attrflag) {
dpossav1 = dpos;
mdsav1 = md;
nfsm_adv(NFSX_V3FATTR);
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
doit = fxdr_unsigned(int, *tl);
/*
* Skip loading the attrs for "..". There's a
* race between loading the attrs here and
* lookups that look for the directory currently
* being read (in the parent). We try to acquire
* the exclusive lock on ".." here, owning the
* lock on the directory being read. Lookup will
* hold the lock on ".." and try to acquire the
* lock on the directory being read.
*
* There are other ways of fixing this, one would
* be to do a trylock on the ".." vnode and skip
* loading the attrs on ".." if it happens to be
* locked by another process. But skipping the
* attrload on ".." seems the easiest option.
*/
if (strcmp(dp->d_name, "..") == 0) {
doit = 0;
/*
* We've already skipped over the attrs,
* skip over the filehandle. And store d_type
* as VDIR.
*/
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
nfsm_adv(nfsm_rndup(i));
dp->d_type = IFTODT(VTTOIF(VDIR));
}
if (doit) {
nfsm_getfh(fhp, fhsize, 1);
if (NFS_CMPFH(dnp, fhp, fhsize)) {
VREF(vp);
newvp = vp;
np = dnp;
} else {
error = nfs_nget(vp->v_mount, fhp,
fhsize, &np, LK_EXCLUSIVE);
if (error)
doit = 0;
else
newvp = NFSTOV(np);
}
}
if (doit && bigenough) {
dpossav2 = dpos;
dpos = dpossav1;
mdsav2 = md;
md = mdsav1;
nfsm_loadattr(newvp, NULL);
dpos = dpossav2;
md = mdsav2;
dp->d_type =
IFTODT(VTTOIF(np->n_vattr.va_type));
ndp->ni_vp = newvp;
/* Update n_ctime, so subsequent lookup doesn't purge entry */
np->n_ctime = np->n_vattr.va_ctime.tv_sec;
cache_enter(ndp->ni_dvp, ndp->ni_vp, cnp);
}
} else {
/* Just skip over the file handle */
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
if (i) {
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
fhsize = fxdr_unsigned(int, *tl);
nfsm_adv(nfsm_rndup(fhsize));
}
}
if (newvp != NULLVP) {
if (newvp == vp)
vrele(newvp);
else
vput(newvp);
newvp = NULLVP;
}
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
more_dirs = fxdr_unsigned(int, *tl);
}
/*
* If at end of rpc data, get the eof boolean
*/
if (!more_dirs) {
tl = nfsm_dissect(u_int32_t *, NFSX_UNSIGNED);
more_dirs = (fxdr_unsigned(int, *tl) == 0);
}
m_freem(mrep);
}
/*
* Fill the last record, if any, out to a multiple of DIRBLKSIZ
* by increasing d_reclen for the last record.
*/
if (blksiz > 0) {
left = DIRBLKSIZ - blksiz;
dp->d_reclen += left;
uiop->uio_iov->iov_base =
(char *)uiop->uio_iov->iov_base + left;
uiop->uio_iov->iov_len -= left;
uiop->uio_offset += left;
uiop->uio_resid -= left;
}
/*
* We are now either at the end of the directory or have filled the
* block.
*/
if (bigenough)
dnp->n_direofoffset = uiop->uio_offset;
else {
if (uiop->uio_resid > 0)
nfs_printf("EEK! readdirplusrpc resid > 0\n");
nfs_dircookie_lock(dnp);
cookiep = nfs_getcookie(dnp, uiop->uio_offset, 1);
*cookiep = cookie;
nfs_dircookie_unlock(dnp);
}
nfsmout:
if (newvp != NULLVP) {
if (newvp == vp)
vrele(newvp);
else
vput(newvp);
newvp = NULLVP;
}
return (error);
}
/*
* Silly rename. To make the stateless NFS filesystem look a little
* more like "ufs", a remove of an active vnode is translated to a rename
* to a funny looking filename that is removed by nfs_inactive on the
* nfsnode. There is the potential for another process on a different client
* to create the same funny name between when the nfs_lookitup() check
* fails and the nfs_rename() completes, but...
*/
static int
nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
struct sillyrename *sp;
struct nfsnode *np;
int error;
short pid;
unsigned int lticks;
cache_purge(dvp);
np = VTONFS(vp);
#ifdef DIAGNOSTIC
if (vp->v_type == VDIR)
panic("nfs: sillyrename dir");
#endif
MALLOC(sp, struct sillyrename *, sizeof (struct sillyrename),
M_NFSREQ, M_WAITOK);
sp->s_cred = crhold(cnp->cn_cred);
sp->s_dvp = dvp;
sp->s_removeit = nfs_removeit;
VREF(dvp);
/*
* Fudge together a funny name.
* The format of the funny name was changed to accommodate more
* sillynames per directory.
* The name is now .nfs.<ticks>.<pid>.4, where ticks is the
* CPU tick count since boot.
*/
pid = cnp->cn_thread->td_proc->p_pid;
lticks = (unsigned int)ticks;
for ( ; ; ) {
sp->s_namlen = sprintf(sp->s_name,
".nfs.%08x.%04x.4", lticks,
pid);
if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
cnp->cn_thread, NULL))
break;
lticks++;
}
error = nfs_renameit(dvp, cnp, sp);
if (error)
goto bad;
error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
cnp->cn_thread, &np);
np->n_sillyrename = sp;
return (0);
bad:
vrele(sp->s_dvp);
crfree(sp->s_cred);
free((caddr_t)sp, M_NFSREQ);
return (error);
}
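/*
 * Editorial sketch, not part of the change: the silly-name generator
 * above as a standalone helper, assuming snprintf() and the
 * ".nfs.<ticks>.<pid>.4" scheme documented in the comment.  On a name
 * collision the caller bumps the tick value and retries, exactly as the
 * loop above does.
 */
static int
sketch_sillyname(char *buf, size_t bufsz, unsigned int lticks,
    unsigned short pid)
{
	/* Returns the generated name length, as sprintf() does above. */
	return (snprintf(buf, bufsz, ".nfs.%08x.%04x.4", lticks, pid));
}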
/*
* Look up a file name and optionally either update the file handle or
* allocate an nfsnode, depending on the value of npp.
* npp == NULL --> just do the lookup
* *npp == NULL --> allocate a new nfsnode and make sure attributes are
* handled too
* *npp != NULL --> update the file handle in the vnode
*/
static int
nfs_lookitup(struct vnode *dvp, const char *name, int len, struct ucred *cred,
struct thread *td, struct nfsnode **npp)
{
struct vnode *newvp = NULL;
struct nfsnode *np, *dnp = VTONFS(dvp);
caddr_t bpos, dpos;
int error = 0, fhlen, attrflag;
struct mbuf *mreq, *mrep, *md, *mb;
nfsfh_t *nfhp;
int v3 = NFS_ISV3(dvp);
nfsstats.rpccnt[NFSPROC_LOOKUP]++;
mreq = nfsm_reqhead(dvp, NFSPROC_LOOKUP,
NFSX_FH(v3) + NFSX_UNSIGNED + nfsm_rndup(len));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(dvp, v3);
nfsm_strtom(name, len, NFS_MAXNAMLEN);
nfsm_request(dvp, NFSPROC_LOOKUP, td, cred);
if (npp && !error) {
nfsm_getfh(nfhp, fhlen, v3);
if (*npp) {
np = *npp;
if (np->n_fhsize > NFS_SMALLFH && fhlen <= NFS_SMALLFH) {
free((caddr_t)np->n_fhp, M_NFSBIGFH);
np->n_fhp = &np->n_fh;
} else if (np->n_fhsize <= NFS_SMALLFH && fhlen > NFS_SMALLFH)
np->n_fhp = (nfsfh_t *)malloc(fhlen, M_NFSBIGFH, M_WAITOK);
bcopy((caddr_t)nfhp, (caddr_t)np->n_fhp, fhlen);
np->n_fhsize = fhlen;
newvp = NFSTOV(np);
} else if (NFS_CMPFH(dnp, nfhp, fhlen)) {
VREF(dvp);
newvp = dvp;
} else {
error = nfs_nget(dvp->v_mount, nfhp, fhlen, &np, LK_EXCLUSIVE);
if (error) {
m_freem(mrep);
return (error);
}
newvp = NFSTOV(np);
}
if (v3) {
nfsm_postop_attr(newvp, attrflag);
if (!attrflag && *npp == NULL) {
m_freem(mrep);
if (newvp == dvp)
vrele(newvp);
else
vput(newvp);
return (ENOENT);
}
} else
nfsm_loadattr(newvp, NULL);
}
m_freem(mrep);
nfsmout:
if (npp && *npp == NULL) {
if (error) {
if (newvp) {
if (newvp == dvp)
vrele(newvp);
else
vput(newvp);
}
} else
*npp = np;
}
return (error);
}
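/*
 * Editorial usage sketch for nfs_lookitup(), matching the three npp modes
 * documented above; callers and variable names here are hypothetical:
 *
 *	(void)nfs_lookitup(dvp, name, len, cred, td, NULL);
 *						probe: does the name exist?
 *	struct nfsnode *np = NULL;
 *	error = nfs_lookitup(dvp, name, len, cred, td, &np);
 *						allocate a fresh nfsnode
 *	np = VTONFS(vp);
 *	error = nfs_lookitup(dvp, name, len, cred, td, &np);
 *						refresh the file handle
 */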
/*
* Nfs Version 3 commit rpc
*/
int
nfs_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
struct thread *td)
{
u_int32_t *tl;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
caddr_t bpos, dpos;
int error = 0, wccflag = NFSV3_WCCRATTR;
struct mbuf *mreq, *mrep, *md, *mb;
mtx_lock(&nmp->nm_mtx);
if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
mtx_unlock(&nmp->nm_mtx);
return (0);
}
mtx_unlock(&nmp->nm_mtx);
nfsstats.rpccnt[NFSPROC_COMMIT]++;
mreq = nfsm_reqhead(vp, NFSPROC_COMMIT, NFSX_FH(1));
mb = mreq;
bpos = mtod(mb, caddr_t);
nfsm_fhtom(vp, 1);
tl = nfsm_build(u_int32_t *, 3 * NFSX_UNSIGNED);
txdr_hyper(offset, tl);
tl += 2;
*tl = txdr_unsigned(cnt);
nfsm_request(vp, NFSPROC_COMMIT, td, cred);
nfsm_wcc_data(vp, wccflag);
if (!error) {
tl = nfsm_dissect(u_int32_t *, NFSX_V3WRITEVERF);
if (bcmp((caddr_t)nmp->nm_verf, (caddr_t)tl,
NFSX_V3WRITEVERF)) {
bcopy((caddr_t)tl, (caddr_t)nmp->nm_verf,
NFSX_V3WRITEVERF);
error = NFSERR_STALEWRITEVERF;
}
}
m_freem(mrep);
nfsmout:
return (error);
}
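/*
 * Editorial sketch, not part of the change: the write-verifier check in
 * nfs_commit() above as a tiny predicate.  A changed verifier means the
 * server rebooted after our unstable writes, so they must be re-sent;
 * the caller maps this to NFSERR_STALEWRITEVERF and nfs_clearcommit().
 */
static int
sketch_verf_stale(const char *cached, const char *reply, int verflen)
{
	return (bcmp(cached, reply, verflen) != 0);
}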
/*
* Strategy routine.
* For async requests when nfsiod(s) are running, queue the request by
* calling nfs_asyncio(); otherwise just call nfs_doio() to do the
* request.
*/
static int
nfs_strategy(struct vop_strategy_args *ap)
{
struct buf *bp = ap->a_bp;
struct ucred *cr;
KASSERT(!(bp->b_flags & B_DONE),
("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
BUF_ASSERT_HELD(bp);
if (bp->b_iocmd == BIO_READ)
cr = bp->b_rcred;
else
cr = bp->b_wcred;
/*
* If the op is asynchronous and an i/o daemon is waiting,
* queue the request, wake the daemon up and wait for completion;
* otherwise just do it ourselves.
*/
if ((bp->b_flags & B_ASYNC) == 0 ||
nfs_asyncio(VFSTONFS(ap->a_vp->v_mount), bp, NOCRED, curthread))
(void)nfs_doio(ap->a_vp, bp, cr, curthread);
return (0);
}
/*
* fsync vnode op. Just call nfs_flush() with commit == 1.
*/
/* ARGSUSED */
static int
nfs_fsync(struct vop_fsync_args *ap)
{
return (nfs_flush(ap->a_vp, ap->a_waitfor, 1));
}
/*
* Flush all the blocks associated with a vnode.
* Walk through the buffer pool and push any dirty pages
* associated with the vnode.
*/
static int
nfs_flush(struct vnode *vp, int waitfor, int commit)
{
struct nfsnode *np = VTONFS(vp);
struct buf *bp;
int i;
struct buf *nbp;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
int passone = 1;
u_quad_t off, endoff, toff;
struct ucred* wcred = NULL;
struct buf **bvec = NULL;
struct bufobj *bo;
struct thread *td = curthread;
#ifndef NFS_COMMITBVECSIZ
#define NFS_COMMITBVECSIZ 20
#endif
struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
int bvecsize = 0, bveccount;
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
if (!commit)
passone = 0;
bo = &vp->v_bufobj;
/*
* A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
* server, but has not been committed to stable storage on the server
* yet. On the first pass, the byte range is worked out and the commit
* rpc is done. On the second pass, nfs_writebp() is called to do the
* job.
*/
again:
off = (u_quad_t)-1;
endoff = 0;
bvecpos = 0;
if (NFS_ISV3(vp) && commit) {
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
/*
* Count up how many buffers are waiting for a commit.
*/
bveccount = 0;
BO_LOCK(bo);
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (!BUF_ISLOCKED(bp) &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bveccount++;
}
/*
* Allocate space to remember the list of bufs to commit. It is
* important to use M_NOWAIT here to avoid a race with nfs_write.
* If we can't get memory (for whatever reason), we will end up
* committing the buffers one-by-one in the loop below.
*/
if (bveccount > NFS_COMMITBVECSIZ) {
/*
* Release the vnode interlock to avoid a lock
* order reversal.
*/
BO_UNLOCK(bo);
bvec = (struct buf **)
malloc(bveccount * sizeof(struct buf *),
M_TEMP, M_NOWAIT);
BO_LOCK(bo);
if (bvec == NULL) {
bvec = bvec_on_stack;
bvecsize = NFS_COMMITBVECSIZ;
} else
bvecsize = bveccount;
} else {
bvec = bvec_on_stack;
bvecsize = NFS_COMMITBVECSIZ;
}
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bvecpos >= bvecsize)
break;
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
nbp = TAILQ_NEXT(bp, b_bobufs);
continue;
}
if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
(B_DELWRI | B_NEEDCOMMIT)) {
BUF_UNLOCK(bp);
nbp = TAILQ_NEXT(bp, b_bobufs);
continue;
}
BO_UNLOCK(bo);
bremfree(bp);
/*
* Work out if all buffers are using the same cred
* so we can deal with them all with one commit.
*
* NOTE: we are not clearing B_DONE here, so we have
* to do it later on in this routine if we intend to
* initiate I/O on the bp.
*
* Note: to avoid loopback deadlocks, we do not
* assign b_runningbufspace.
*/
if (wcred == NULL)
wcred = bp->b_wcred;
else if (wcred != bp->b_wcred)
wcred = NOCRED;
vfs_busy_pages(bp, 1);
BO_LOCK(bo);
/*
* bp is protected by being locked, but nbp is not
* and vfs_busy_pages() may sleep. We have to
* recalculate nbp.
*/
nbp = TAILQ_NEXT(bp, b_bobufs);
/*
* A list of these buffers is kept so that the
* second loop knows which buffers have actually
* been committed. This is necessary, since there
* may be a race between the commit rpc and new
* uncommitted writes on the file.
*/
bvec[bvecpos++] = bp;
toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
bp->b_dirtyoff;
if (toff < off)
off = toff;
toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
if (toff > endoff)
endoff = toff;
}
BO_UNLOCK(bo);
}
if (bvecpos > 0) {
/*
* Commit data on the server, as required.
* If all bufs are using the same wcred, then use that with
* one call for all of them, otherwise commit each one
* separately.
*/
if (wcred != NOCRED)
retv = nfs_commit(vp, off, (int)(endoff - off),
wcred, td);
else {
retv = 0;
for (i = 0; i < bvecpos; i++) {
off_t off, size;
bp = bvec[i];
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
bp->b_dirtyoff;
size = (u_quad_t)(bp->b_dirtyend
- bp->b_dirtyoff);
retv = nfs_commit(vp, off, (int)size,
bp->b_wcred, td);
if (retv)
break;
}
}
if (retv == NFSERR_STALEWRITEVERF)
nfs_clearcommit(vp->v_mount);
/*
* Now, either mark the blocks I/O done or mark the
* blocks dirty, depending on whether the commit
* succeeded.
*/
for (i = 0; i < bvecpos; i++) {
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
if (retv) {
/*
* Error, leave B_DELWRI intact
*/
vfs_unbusy_pages(bp);
brelse(bp);
} else {
/*
* Success, remove B_DELWRI ( bundirty() ).
*
* b_dirtyoff/b_dirtyend seem to be NFS
* specific. We should probably move that
* into bundirty(). XXX
*/
bufobj_wref(bo);
bp->b_flags |= B_ASYNC;
bundirty(bp);
bp->b_flags &= ~B_DONE;
bp->b_ioflags &= ~BIO_ERROR;
bp->b_dirtyoff = bp->b_dirtyend = 0;
bufdone(bp);
}
}
}
/*
* Start/do any write(s) that are required.
*/
loop:
BO_LOCK(bo);
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
if (waitfor != MNT_WAIT || passone)
continue;
error = BUF_TIMELOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_MTX(bo), "nfsfsync", slpflag, slptimeo);
if (error == 0) {
BUF_UNLOCK(bp);
goto loop;
}
if (error == ENOLCK) {
error = 0;
goto loop;
}
if (nfs_sigintr(nmp, NULL, td)) {
error = EINTR;
goto done;
}
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
goto loop;
}
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfs_fsync: not dirty");
if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
BUF_UNLOCK(bp);
continue;
}
BO_UNLOCK(bo);
bremfree(bp);
bp->b_flags |= B_ASYNC;
bwrite(bp);
if (nfs_sigintr(nmp, NULL, td)) {
error = EINTR;
goto done;
}
goto loop;
}
if (passone) {
passone = 0;
BO_UNLOCK(bo);
goto again;
}
if (waitfor == MNT_WAIT) {
while (bo->bo_numoutput) {
error = bufobj_wwait(bo, slpflag, slptimeo);
if (error) {
BO_UNLOCK(bo);
error = nfs_sigintr(nmp, NULL, td);
if (error)
goto done;
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
BO_LOCK(bo);
}
}
if (bo->bo_dirty.bv_cnt != 0 && commit) {
BO_UNLOCK(bo);
goto loop;
}
/*
* Wait for all the async IO requests to drain
*/
BO_UNLOCK(bo);
mtx_lock(&np->n_mtx);
while (np->n_directio_asyncwr > 0) {
np->n_flag |= NFSYNCWAIT;
error = nfs_msleep(td, (caddr_t)&np->n_directio_asyncwr,
&np->n_mtx, slpflag | (PRIBIO + 1),
"nfsfsync", 0);
if (error) {
if (nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
mtx_unlock(&np->n_mtx);
error = EINTR;
goto done;
}
}
}
mtx_unlock(&np->n_mtx);
} else
BO_UNLOCK(bo);
mtx_lock(&np->n_mtx);
if (np->n_flag & NWRITEERR) {
error = np->n_error;
np->n_flag &= ~NWRITEERR;
}
if (commit && bo->bo_dirty.bv_cnt == 0 &&
bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
np->n_flag &= ~NMODIFIED;
mtx_unlock(&np->n_mtx);
done:
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
return (error);
}
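/*
 * Editorial sketch, not part of the change: how the first pass of
 * nfs_flush() folds the dirty windows of the gathered buffers into the
 * single [off, endoff) byte range handed to nfs_commit() (see the
 * toff/off/endoff arithmetic above).  512 stands in for DEV_BSIZE, and
 * the caller is assumed to pass at least one buffer.
 */
static void
sketch_commit_range(const long *blkno, const int *dirtyoff,
    const int *dirtyend, int n, unsigned long long *offp,
    unsigned long long *endp)
{
	unsigned long long off = ~0ULL, end = 0, toff;
	int i;

	for (i = 0; i < n; i++) {
		toff = (unsigned long long)blkno[i] * 512 + dirtyoff[i];
		if (toff < off)
			off = toff;	/* lowest dirty byte */
		toff += (unsigned long long)(dirtyend[i] - dirtyoff[i]);
		if (toff > end)
			end = toff;	/* one past the highest dirty byte */
	}
	*offp = off;
	*endp = end;
}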
/*
* NFS advisory byte-level locks.
*/
static int
nfs_advlock(struct vop_advlock_args *ap)
{
struct vnode *vp = ap->a_vp;
u_quad_t size;
int error;
error = vn_lock(vp, LK_SHARED);
if (error)
return (error);
if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
size = VTONFS(vp)->n_size;
VOP_UNLOCK(vp, 0);
error = lf_advlock(ap, &(vp->v_lockf), size);
} else {
if (nfs_advlock_p)
error = nfs_advlock_p(ap);
else
error = ENOLCK;
}
return (error);
}
/*
* NFS advisory byte-level locks.
*/
static int
nfs_advlockasync(struct vop_advlockasync_args *ap)
{
struct vnode *vp = ap->a_vp;
u_quad_t size;
int error;
error = vn_lock(vp, LK_SHARED);
if (error)
return (error);
if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
size = VTONFS(vp)->n_size;
VOP_UNLOCK(vp, 0);
error = lf_advlockasync(ap, &(vp->v_lockf), size);
} else {
VOP_UNLOCK(vp, 0);
error = EOPNOTSUPP;
}
return (error);
}
/*
* Print out the contents of an nfsnode.
*/
static int
nfs_print(struct vop_print_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
nfs_printf("\tfileid %ld fsid 0x%x",
np->n_vattr.va_fileid, np->n_vattr.va_fsid);
if (vp->v_type == VFIFO)
fifo_printinfo(vp);
printf("\n");
return (0);
}
/*
* This is the "real" nfs::bwrite(struct buf*).
* We set B_CACHE if this is a VMIO buffer.
*/
int
nfs_writebp(struct buf *bp, int force __unused, struct thread *td)
{
int s;
int oldflags = bp->b_flags;
#if 0
int retv = 1;
off_t off;
#endif
BUF_ASSERT_HELD(bp);
if (bp->b_flags & B_INVAL) {
brelse(bp);
return(0);
}
bp->b_flags |= B_CACHE;
/*
* Undirty the bp. We will redirty it later if the I/O fails.
*/
s = splbio();
bundirty(bp);
bp->b_flags &= ~B_DONE;
bp->b_ioflags &= ~BIO_ERROR;
bp->b_iocmd = BIO_WRITE;
bufobj_wref(bp->b_bufobj);
curthread->td_ru.ru_oublock++;
splx(s);
/*
* Note: to avoid loopback deadlocks, we do not
* assign b_runningbufspace.
*/
vfs_busy_pages(bp, 1);
BUF_KERNPROC(bp);
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
if ((oldflags & B_ASYNC) == 0) {
int rtval = bufwait(bp);
if (oldflags & B_DELWRI) {
s = splbio();
reassignbuf(bp);
splx(s);
}
brelse(bp);
return (rtval);
}
return (0);
}
/*
* nfs special file access vnode op.
* Essentially just get vattr and then imitate iaccess() since the device is
* local to the client.
*/
static int
nfsspec_access(struct vop_access_args *ap)
{
struct vattr *vap;
struct ucred *cred = ap->a_cred;
struct vnode *vp = ap->a_vp;
mode_t mode = ap->a_mode;
struct vattr vattr;
int error;
/*
* Disallow write attempts on filesystems mounted read-only;
* unless the file is a socket, fifo, or a block or character
* device resident on the filesystem.
*/
if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
return (EROFS);
default:
break;
}
}
vap = &vattr;
error = VOP_GETATTR(vp, vap, cred);
if (error)
goto out;
error = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
mode, cred, NULL);
out:
return (error);
}
/*
* Read wrapper for fifos.
*/
static int
nfsfifo_read(struct vop_read_args *ap)
{
struct nfsnode *np = VTONFS(ap->a_vp);
int error;
/*
* Set access flag.
*/
mtx_lock(&np->n_mtx);
np->n_flag |= NACC;
getnanotime(&np->n_atim);
mtx_unlock(&np->n_mtx);
error = fifo_specops.vop_read(ap);
return (error);
}
/*
* Write wrapper for fifos.
*/
static int
nfsfifo_write(struct vop_write_args *ap)
{
struct nfsnode *np = VTONFS(ap->a_vp);
/*
* Set update flag.
*/
mtx_lock(&np->n_mtx);
np->n_flag |= NUPD;
getnanotime(&np->n_mtim);
mtx_unlock(&np->n_mtx);
return(fifo_specops.vop_write(ap));
}
/*
* Close wrapper for fifos.
*
* Update the times on the nfsnode then do fifo close.
*/
static int
nfsfifo_close(struct vop_close_args *ap)
{
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
struct vattr vattr;
struct timespec ts;
mtx_lock(&np->n_mtx);
if (np->n_flag & (NACC | NUPD)) {
getnanotime(&ts);
if (np->n_flag & NACC)
np->n_atim = ts;
if (np->n_flag & NUPD)
np->n_mtim = ts;
np->n_flag |= NCHG;
if (vrefcnt(vp) == 1 &&
(vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
VATTR_NULL(&vattr);
if (np->n_flag & NACC)
vattr.va_atime = np->n_atim;
if (np->n_flag & NUPD)
vattr.va_mtime = np->n_mtim;
mtx_unlock(&np->n_mtx);
(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
goto out;
}
}
mtx_unlock(&np->n_mtx);
out:
return (fifo_specops.vop_close(ap));
}
/*
* Just call nfs_writebp() with the force argument set to 1.
*
* NOTE: B_DONE may or may not be set in a_bp on call.
*/
static int
nfs_bwrite(struct buf *bp)
{
return (nfs_writebp(bp, 1, curthread));
}
struct buf_ops buf_ops_nfs = {
.bop_name = "buf_ops_nfs",
.bop_write = nfs_bwrite,
.bop_strategy = bufstrategy,
.bop_sync = bufsync,
.bop_bdflush = bufbdflush,
};
Index: head/sys/rpc/authunix_prot.c
===================================================================
--- head/sys/rpc/authunix_prot.c (revision 183549)
+++ head/sys/rpc/authunix_prot.c (revision 183550)
@@ -1,132 +1,133 @@
/* $NetBSD: authunix_prot.c,v 1.12 2000/01/22 22:19:17 mycroft Exp $ */
/*
* Sun RPC is a product of Sun Microsystems, Inc. and is provided for
* unrestricted use provided that this legend is included on all tape
* media and as a part of the software program in whole or part. Users
* may copy or modify Sun RPC without charge, but are not authorized
* to license or distribute it to anyone else except as part of a product or
* program developed by the user.
*
* SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE
* WARRANTIES OF DESIGN, MERCHANTIBILITY AND FITNESS FOR A PARTICULAR
* PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE.
*
* Sun RPC is provided with no support and without any obligation on the
* part of Sun Microsystems, Inc. to assist in its use, correction,
* modification or enhancement.
*
* SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE
* INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC
* OR ANY PART THEREOF.
*
* In no event will Sun Microsystems, Inc. be liable for any lost revenue
* or profits or other special, indirect and consequential damages, even if
* Sun has been advised of the possibility of such damages.
*
* Sun Microsystems, Inc.
* 2550 Garcia Avenue
* Mountain View, California 94043
*/
#if defined(LIBC_SCCS) && !defined(lint)
static char *sccsid2 = "@(#)authunix_prot.c 1.15 87/08/11 Copyr 1984 Sun Micro";
static char *sccsid = "@(#)authunix_prot.c 2.1 88/07/29 4.0 RPCSRC";
#endif
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* authunix_prot.c
* XDR for UNIX style authentication parameters for RPC
*
* Copyright (C) 1984, Sun Microsystems, Inc.
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <sys/vimage.h>
#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/rpc_com.h>
/* gids compose part of a credential; there may not be more than 16 of them */
#define NGRPS 16
/*
* XDR for unix authentication parameters.
*/
bool_t
xdr_authunix_parms(XDR *xdrs, uint32_t *time, struct xucred *cred)
{
uint32_t namelen;
uint32_t ngroups, i;
uint32_t junk;
+ INIT_VPROCG(TD_TO_VPROCG(&thread0)); /* XXX revisit - fixme! */
mtx_lock(&hostname_mtx);
if (xdrs->x_op == XDR_ENCODE) {
/*
* Restrict name length to 255 according to RFC 1057.
*/
namelen = strlen(V_hostname);
if (namelen > 255)
namelen = 255;
} else {
namelen = 0;
}
junk = 0;
if (!xdr_uint32_t(xdrs, time)
|| !xdr_uint32_t(xdrs, &namelen))
return (FALSE);
/*
* Ignore the hostname on decode.
*/
if (xdrs->x_op == XDR_ENCODE) {
if (!xdr_opaque(xdrs, V_hostname, namelen))
return (FALSE);
} else {
xdr_setpos(xdrs, xdr_getpos(xdrs) + RNDUP(namelen));
}
mtx_unlock(&hostname_mtx);
if (!xdr_uint32_t(xdrs, &cred->cr_uid))
return (FALSE);
if (!xdr_uint32_t(xdrs, &cred->cr_groups[0]))
return (FALSE);
if (xdrs->x_op == XDR_ENCODE) {
ngroups = cred->cr_ngroups - 1;
if (ngroups > NGRPS)
ngroups = NGRPS;
}
if (!xdr_uint32_t(xdrs, &ngroups))
return (FALSE);
for (i = 0; i < ngroups; i++) {
if (i + 1 < NGROUPS) {
if (!xdr_uint32_t(xdrs, &cred->cr_groups[i + 1]))
return (FALSE);
} else {
if (!xdr_uint32_t(xdrs, &junk))
return (FALSE);
}
}
if (xdrs->x_op == XDR_DECODE) {
if (ngroups + 1 > NGROUPS)
cred->cr_ngroups = NGROUPS;
else
cred->cr_ngroups = ngroups + 1;
}
return (TRUE);
}
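/*
 * Editorial sketch, not part of the change: the encoded size of the
 * AUTH_UNIX body produced by xdr_authunix_parms() above — stamp, counted
 * machinename padded to a 4-byte boundary (the RNDUP rule), uid, gid, and
 * a counted array of supplementary gids.
 */
static unsigned int
sketch_authunix_len(unsigned int namelen, unsigned int ngroups)
{
	unsigned int padded = (namelen + 3) & ~3u;

	return (4		/* stamp */
	    + 4 + padded	/* machinename length + bytes */
	    + 4			/* uid */
	    + 4			/* gid */
	    + 4 + 4 * ngroups);	/* gid count + gids */
}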
Index: head/sys/sys/sysctl.h
===================================================================
--- head/sys/sys/sysctl.h (revision 183549)
+++ head/sys/sys/sysctl.h (revision 183550)
@@ -1,708 +1,777 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sysctl.h 8.1 (Berkeley) 6/2/93
* $FreeBSD$
*/
#ifndef _SYS_SYSCTL_H_
#define _SYS_SYSCTL_H_
#include <sys/queue.h>
struct thread;
/*
* Definitions for sysctl call. The sysctl call uses a hierarchical name
* for objects that can be examined or modified. The name is expressed as
* a sequence of integers. Like a file path name, the meaning of each
* component depends on its place in the hierarchy. The top-level and kern
* identifiers are defined here, and other identifiers are defined in the
* respective subsystem header files.
*/
#define CTL_MAXNAME 24 /* largest number of components supported */
/*
* Each subsystem defined by sysctl defines a list of variables
* for that subsystem. Each name is either a node with further
* levels defined below it, or it is a leaf of some particular
* type given below. Each sysctl level defines a set of name/type
* pairs to be used by sysctl(8) in manipulating the subsystem.
*/
struct ctlname {
char *ctl_name; /* subsystem name */
int ctl_type; /* type of name */
};
#define CTLTYPE 0xf /* Mask for the type */
#define CTLTYPE_NODE 1 /* name is a node */
#define CTLTYPE_INT 2 /* name describes an integer */
#define CTLTYPE_STRING 3 /* name describes a string */
#define CTLTYPE_QUAD 4 /* name describes a 64-bit number */
#define CTLTYPE_OPAQUE 5 /* name describes a structure */
#define CTLTYPE_STRUCT CTLTYPE_OPAQUE /* name describes a structure */
#define CTLTYPE_UINT 6 /* name describes an unsigned integer */
#define CTLTYPE_LONG 7 /* name describes a long */
#define CTLTYPE_ULONG 8 /* name describes an unsigned long */
#define CTLFLAG_RD 0x80000000 /* Allow reads of variable */
#define CTLFLAG_WR 0x40000000 /* Allow writes to the variable */
#define CTLFLAG_RW (CTLFLAG_RD|CTLFLAG_WR)
#define CTLFLAG_NOLOCK 0x20000000 /* XXX Don't Lock */
#define CTLFLAG_ANYBODY 0x10000000 /* All users can set this var */
#define CTLFLAG_SECURE 0x08000000 /* Permit set only if securelevel<=0 */
#define CTLFLAG_PRISON 0x04000000 /* Prisoned roots can fiddle */
#define CTLFLAG_DYN 0x02000000 /* Dynamic oid - can be freed */
#define CTLFLAG_SKIP 0x01000000 /* Skip this sysctl when listing */
#define CTLMASK_SECURE 0x00F00000 /* Secure level */
#define CTLFLAG_TUN 0x00080000 /* Tunable variable */
#define CTLFLAG_RDTUN (CTLFLAG_RD|CTLFLAG_TUN)
/*
* Secure level. Note that CTLFLAG_SECURE == CTLFLAG_SECURE1.
*
* Secure when the securelevel is raised to at least N.
*/
#define CTLSHIFT_SECURE 20
#define CTLFLAG_SECURE1 (CTLFLAG_SECURE | (0 << CTLSHIFT_SECURE))
#define CTLFLAG_SECURE2 (CTLFLAG_SECURE | (1 << CTLSHIFT_SECURE))
#define CTLFLAG_SECURE3 (CTLFLAG_SECURE | (2 << CTLSHIFT_SECURE))
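/*
 * e.g. a knob that may be changed only while securelevel is below 2
 * (variable and description here are hypothetical):
 * SYSCTL_INT(_kern, OID_AUTO, example, CTLFLAG_RW | CTLFLAG_SECURE2,
 *	&example_var, 0, "securelevel-gated example");
 */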
/*
* USE THIS instead of a hardwired number from the categories below
* to get dynamically assigned sysctl entries using the linker-set
* technology. This is the way nearly all new sysctl variables should
* be implemented.
* e.g. SYSCTL_INT(_parent, OID_AUTO, name, CTLFLAG_RW, &variable, 0, "");
*/
#define OID_AUTO (-1)
/*
* The starting number for dynamically-assigned entries. WARNING!
* ALL static sysctl entries should have numbers LESS than this!
*/
#define CTL_AUTO_START 0x100
#ifdef _KERNEL
#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, int arg2, \
struct sysctl_req *req
/* definitions for sysctl_req 'lock' member */
#define REQ_UNLOCKED 0 /* not locked and not wired */
#define REQ_LOCKED 1 /* locked and not wired */
#define REQ_WIRED 2 /* locked and wired */
/* definitions for sysctl_req 'flags' member */
#if defined(__amd64__) || defined(__ia64__)
#define SCTL_MASK32 1 /* 32 bit emulation */
#endif
/*
* This describes the access space for a sysctl request. This is needed
* so that we can use the interface from the kernel or from user-space.
*/
struct sysctl_req {
struct thread *td; /* used for access checking */
int lock; /* locking/wiring state */
void *oldptr;
size_t oldlen;
size_t oldidx;
int (*oldfunc)(struct sysctl_req *, const void *, size_t);
void *newptr;
size_t newlen;
size_t newidx;
int (*newfunc)(struct sysctl_req *, void *, size_t);
size_t validlen;
int flags;
};
SLIST_HEAD(sysctl_oid_list, sysctl_oid);
/*
* This describes one "oid" in the MIB tree. Potentially more nodes can
* be hidden behind it, expanded by the handler.
*/
struct sysctl_oid {
struct sysctl_oid_list *oid_parent;
SLIST_ENTRY(sysctl_oid) oid_link;
int oid_number;
u_int oid_kind;
void *oid_arg1;
int oid_arg2;
const char *oid_name;
int (*oid_handler)(SYSCTL_HANDLER_ARGS);
const char *oid_fmt;
int oid_refcnt;
const char *oid_descr;
};
#define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l)
#define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l)
int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS);
int sysctl_handle_long(SYSCTL_HANDLER_ARGS);
int sysctl_handle_quad(SYSCTL_HANDLER_ARGS);
int sysctl_handle_intptr(SYSCTL_HANDLER_ARGS);
int sysctl_handle_string(SYSCTL_HANDLER_ARGS);
int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS);
+#ifdef VIMAGE
+int sysctl_handle_v_int(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_v_string(SYSCTL_HANDLER_ARGS);
+int sysctl_handle_v_opaque(SYSCTL_HANDLER_ARGS);
+#endif
+
/*
* These functions are used to add/remove an oid from the mib.
*/
void sysctl_register_oid(struct sysctl_oid *oidp);
void sysctl_unregister_oid(struct sysctl_oid *oidp);
/* Declare a static oid to allow child oids to be added to it. */
#define SYSCTL_DECL(name) \
extern struct sysctl_oid_list sysctl_##name##_children
/* Hide these in macros */
#define SYSCTL_CHILDREN(oid_ptr) (struct sysctl_oid_list *) \
(oid_ptr)->oid_arg1
#define SYSCTL_CHILDREN_SET(oid_ptr, val) \
(oid_ptr)->oid_arg1 = (val);
#define SYSCTL_STATIC_CHILDREN(oid_name) \
(&sysctl_##oid_name##_children)
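/*
 * Example (illustrative): a subsystem declares its node in a header
 * with SYSCTL_DECL() so other files can attach children to it; all
 * "foo" names are placeholders.
 *
 *	(in foo.h)	SYSCTL_DECL(_kern_foo);
 *	(in foo.c)	SYSCTL_NODE(_kern, OID_AUTO, foo, CTLFLAG_RW, 0,
 *			    "Foo subsystem");
 *	(elsewhere)	SYSCTL_INT(_kern_foo, OID_AUTO, debug, CTLFLAG_RW,
 *			    &foo_debug, 0, "Foo debug level");
 */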
/* === Structs and macros related to context handling === */
/* All dynamically created sysctls can be tracked in a context list. */
struct sysctl_ctx_entry {
struct sysctl_oid *entry;
TAILQ_ENTRY(sysctl_ctx_entry) link;
};
TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry);
#define SYSCTL_NODE_CHILDREN(parent, name) \
sysctl_##parent##_##name##_children
#ifndef NO_SYSCTL_DESCR
#define __DESCR(d) d
#else
#define __DESCR(d) ""
#endif
/* This constructs a "raw" MIB oid. */
#define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
static struct sysctl_oid sysctl__##parent##_##name = { \
&sysctl_##parent##_children, { 0 }, nbr, kind, \
a1, a2, #name, handler, fmt, 0, __DESCR(descr) }; \
DATA_SET(sysctl_set, sysctl__##parent##_##name)
+#ifdef VIMAGE
+#define SYSCTL_V_OID(subs, mod, parent, nbr, name, kind, a1, a2, \
+ handler, fmt, descr) \
+ static struct sysctl_oid sysctl__##parent##_##name = { \
+ &sysctl_##parent##_children, { 0 }, nbr, kind, \
+ (void *) offsetof(struct mod, _##a1), a2, #name, \
+ handler, fmt, 0, __DESCR(descr), subs, V_MOD_##mod }; \
+ DATA_SET(sysctl_set, sysctl__##parent##_##name)
+#else
+#define SYSCTL_V_OID(subs, mod, parent, nbr, name, kind, a1, a2, \
+ handler, fmt, descr) \
+ SYSCTL_OID(parent, nbr, name, kind, &a1, a2, handler, fmt, descr)
+#endif
+
#define SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, __DESCR(descr))
/* This constructs a node from which other oids can hang. */
#define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \
struct sysctl_oid_list SYSCTL_NODE_CHILDREN(parent, name); \
SYSCTL_OID(parent, nbr, name, CTLTYPE_NODE|(access), \
(void*)&SYSCTL_NODE_CHILDREN(parent, name), 0, handler, "N", descr)
#define SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_NODE|(access), \
0, 0, handler, "N", __DESCR(descr))
/* Oid for a string. len can be 0 to indicate '\0' termination. */
#define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \
arg, len, sysctl_handle_string, "A", descr)
+#ifdef VIMAGE
+#define SYSCTL_V_STRING(subs, mod, parent, nbr, name, access, sym, len, descr) \
+ SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_STRING|(access), \
+ sym, len, sysctl_handle_v_string, "A", descr)
+#else
+#define SYSCTL_V_STRING(subs, mod, parent, nbr, name, access, sym, len, descr) \
+ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING|(access), \
+ &sym, len, sysctl_handle_string, "A", descr)
+#endif
+
#define SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING|(access), \
arg, len, sysctl_handle_string, "A", __DESCR(descr))
/* Oid for an int. If ptr is NULL, val is returned. */
#define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|(access), \
ptr, val, sysctl_handle_int, "I", descr)
+#ifdef VIMAGE
+#define SYSCTL_V_INT(subs, mod, parent, nbr, name, access, sym, val, descr) \
+ SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_INT|(access), \
+ sym, val, sysctl_handle_v_int, "I", descr)
+#else
+#define SYSCTL_V_INT(subs, mod, parent, nbr, name, access, sym, val, descr) \
+ SYSCTL_OID(parent, nbr, name, CTLTYPE_INT|(access), \
+ &sym, val, sysctl_handle_int, "I", descr)
+#endif
+
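/*
 * Sketch (illustrative, with hypothetical names): under VIMAGE the
 * virtualized variable lives in a per-instance module struct and the
 * macro records its offset, which sysctl_handle_v_int() resolves
 * against the current vnet at request time; without VIMAGE this
 * degenerates into a plain SYSCTL_INT on the global.  struct
 * vnet_foo, V_NET and the implied V_MOD_vnet_foo index are
 * assumptions following the vnet naming convention.
 *
 *	struct vnet_foo {
 *		int	_foo_enable;
 *	};
 *	SYSCTL_V_INT(V_NET, vnet_foo, _net, OID_AUTO, foo_enable,
 *	    CTLFLAG_RW, foo_enable, 0, "Enable foo in this vnet");
 */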
#define SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_INT|(access), \
ptr, val, sysctl_handle_int, "I", __DESCR(descr))
/* Oid for an unsigned int. If ptr is NULL, val is returned. */
#define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_UINT|(access), \
ptr, val, sysctl_handle_int, "IU", descr)
+#ifdef VIMAGE
+#define SYSCTL_V_UINT(subs, mod, parent, nbr, name, access, sym, val, descr) \
+ SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_UINT|(access), \
+ sym, val, sysctl_handle_v_int, "IU", descr)
+#else
+#define SYSCTL_V_UINT(subs, mod, parent, nbr, name, access, sym, val, descr) \
+ SYSCTL_OID(parent, nbr, name, CTLTYPE_UINT|(access), \
+ &sym, val, sysctl_handle_int, "IU", descr)
+#endif
+
#define SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_UINT|(access), \
ptr, val, sysctl_handle_int, "IU", __DESCR(descr))
#define SYSCTL_XINT(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_UINT|(access), \
ptr, val, sysctl_handle_int, "IX", descr)
#define SYSCTL_ADD_XINT(ctx, parent, nbr, name, access, ptr, val, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_UINT|(access), \
ptr, val, sysctl_handle_int, "IX", __DESCR(descr))
/* Oid for a long. The pointer must be non-NULL. */
#define SYSCTL_LONG(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_LONG|(access), \
ptr, val, sysctl_handle_long, "L", descr)
#define SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_LONG|(access), \
ptr, 0, sysctl_handle_long, "L", __DESCR(descr))
/* Oid for an unsigned long. The pointer must be non-NULL. */
#define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_ULONG|(access), \
ptr, val, sysctl_handle_long, "LU", descr)
#define SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_ULONG|(access), \
ptr, 0, sysctl_handle_long, "LU", __DESCR(descr))
#define SYSCTL_XLONG(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_ULONG|(access), \
ptr, val, sysctl_handle_long, "LX", descr)
#define SYSCTL_ADD_XLONG(ctx, parent, nbr, name, access, ptr, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_ULONG|(access), \
ptr, 0, sysctl_handle_long, "LX", __DESCR(descr))
/* Oid for a quad. The pointer must be non-NULL. */
#define SYSCTL_QUAD(parent, nbr, name, access, ptr, val, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_QUAD|(access), \
ptr, val, sysctl_handle_quad, "Q", descr)
#define SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_QUAD|(access), \
ptr, 0, sysctl_handle_quad, "Q", __DESCR(descr))
/* Oid for an opaque object. Specified by a pointer and a length. */
#define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \
ptr, len, sysctl_handle_opaque, fmt, descr)
#define SYSCTL_ADD_OPAQUE(ctx, parent, nbr, name, access, ptr, len, fmt, descr)\
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access), \
ptr, len, sysctl_handle_opaque, fmt, __DESCR(descr))
/* Oid for a struct. Specified by a pointer and a type. */
#define SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) \
SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \
ptr, sizeof(struct type), sysctl_handle_opaque, \
"S," #type, descr)
+#ifdef VIMAGE
+#define SYSCTL_V_STRUCT(subs, mod, parent, nbr, name, access, sym, \
+ type, descr) \
+ SYSCTL_V_OID(subs, mod, parent, nbr, name, CTLTYPE_OPAQUE|(access), \
+ sym, sizeof(struct type), sysctl_handle_v_opaque, \
+ "S," #type, descr)
+#else
+#define SYSCTL_V_STRUCT(subs, mod, parent, nbr, name, access, sym, \
+ type, descr) \
+ SYSCTL_OID(parent, nbr, name, CTLTYPE_OPAQUE|(access), \
+ &sym, sizeof(struct type), sysctl_handle_opaque, \
+ "S," #type, descr)
+#endif
+
#define SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \
sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_OPAQUE|(access), \
ptr, sizeof(struct type), sysctl_handle_opaque, "S," #type, __DESCR(descr))
/* Oid for a procedure. Specified by a pointer and an arg. */
#define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
SYSCTL_OID(parent, nbr, name, (access), \
ptr, arg, handler, fmt, descr)
+
+#define SYSCTL_V_PROC(subs, mod, parent, nbr, name, access, sym, arg, \
+ handler, fmt, descr) \
+ SYSCTL_V_OID(subs, mod, parent, nbr, name, (access), \
+ sym, arg, handler, fmt, descr)
#define SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
sysctl_add_oid(ctx, parent, nbr, name, (access), \
ptr, arg, handler, fmt, __DESCR(descr))
/*
* A macro to generate a read-only sysctl to indicate the presence of optional
* kernel features.
*/
#define FEATURE(name, desc) \
SYSCTL_INT(_kern_features, OID_AUTO, name, CTLFLAG_RD, 0, 1, desc)
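/*
 * Example (illustrative): compiling this in creates a read-only
 * kern.features.frobnication oid that reads as 1, so userland can
 * probe for the (hypothetical) feature with sysctlbyname(3).
 *
 *	FEATURE(frobnication, "Kernel supports frobnication");
 */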
#endif /* _KERNEL */
/*
* Top-level identifiers
*/
#define CTL_UNSPEC 0 /* unused */
#define CTL_KERN 1 /* "high kernel": proc, limits */
#define CTL_VM 2 /* virtual memory */
#define CTL_VFS 3 /* filesystem, mount type is next */
#define CTL_NET 4 /* network, see socket.h */
#define CTL_DEBUG 5 /* debugging parameters */
#define CTL_HW 6 /* generic cpu/io */
#define CTL_MACHDEP 7 /* machine dependent */
#define CTL_USER 8 /* user-level */
#define CTL_P1003_1B 9 /* POSIX 1003.1B */
#define CTL_MAXID 10 /* number of valid top-level ids */
#define CTL_NAMES { \
{ 0, 0 }, \
{ "kern", CTLTYPE_NODE }, \
{ "vm", CTLTYPE_NODE }, \
{ "vfs", CTLTYPE_NODE }, \
{ "net", CTLTYPE_NODE }, \
{ "debug", CTLTYPE_NODE }, \
{ "hw", CTLTYPE_NODE }, \
{ "machdep", CTLTYPE_NODE }, \
{ "user", CTLTYPE_NODE }, \
{ "p1003_1b", CTLTYPE_NODE }, \
}
/*
* CTL_KERN identifiers
*/
#define KERN_OSTYPE 1 /* string: system version */
#define KERN_OSRELEASE 2 /* string: system release */
#define KERN_OSREV 3 /* int: system revision */
#define KERN_VERSION 4 /* string: compile time info */
#define KERN_MAXVNODES 5 /* int: max vnodes */
#define KERN_MAXPROC 6 /* int: max processes */
#define KERN_MAXFILES 7 /* int: max open files */
#define KERN_ARGMAX 8 /* int: max arguments to exec */
#define KERN_SECURELVL 9 /* int: system security level */
#define KERN_HOSTNAME 10 /* string: hostname */
#define KERN_HOSTID 11 /* int: host identifier */
#define KERN_CLOCKRATE 12 /* struct: struct clockrate */
#define KERN_VNODE 13 /* struct: vnode structures */
#define KERN_PROC 14 /* struct: process entries */
#define KERN_FILE 15 /* struct: file entries */
#define KERN_PROF 16 /* node: kernel profiling info */
#define KERN_POSIX1 17 /* int: POSIX.1 version */
#define KERN_NGROUPS 18 /* int: # of supplemental group ids */
#define KERN_JOB_CONTROL 19 /* int: is job control available */
#define KERN_SAVED_IDS 20 /* int: saved set-user/group-ID */
#define KERN_BOOTTIME 21 /* struct: time kernel was booted */
#define KERN_NISDOMAINNAME 22 /* string: YP domain name */
#define KERN_UPDATEINTERVAL 23 /* int: update process sleep time */
#define KERN_OSRELDATE 24 /* int: kernel release date */
#define KERN_NTP_PLL 25 /* node: NTP PLL control */
#define KERN_BOOTFILE 26 /* string: name of booted kernel */
#define KERN_MAXFILESPERPROC 27 /* int: max open files per proc */
#define KERN_MAXPROCPERUID 28 /* int: max processes per uid */
#define KERN_DUMPDEV 29 /* struct cdev *: device to dump on */
#define KERN_IPC 30 /* node: anything related to IPC */
#define KERN_DUMMY 31 /* unused */
#define KERN_PS_STRINGS 32 /* int: address of PS_STRINGS */
#define KERN_USRSTACK 33 /* int: address of USRSTACK */
#define KERN_LOGSIGEXIT 34 /* int: do we log sigexit procs? */
#define KERN_IOV_MAX 35 /* int: value of UIO_MAXIOV */
#define KERN_HOSTUUID 36 /* string: host UUID identifier */
#define KERN_ARND 37 /* int: from arc4rand() */
#define KERN_MAXID 38 /* number of valid kern ids */
#define CTL_KERN_NAMES { \
{ 0, 0 }, \
{ "ostype", CTLTYPE_STRING }, \
{ "osrelease", CTLTYPE_STRING }, \
{ "osrevision", CTLTYPE_INT }, \
{ "version", CTLTYPE_STRING }, \
{ "maxvnodes", CTLTYPE_INT }, \
{ "maxproc", CTLTYPE_INT }, \
{ "maxfiles", CTLTYPE_INT }, \
{ "argmax", CTLTYPE_INT }, \
{ "securelevel", CTLTYPE_INT }, \
{ "hostname", CTLTYPE_STRING }, \
{ "hostid", CTLTYPE_UINT }, \
{ "clockrate", CTLTYPE_STRUCT }, \
{ "vnode", CTLTYPE_STRUCT }, \
{ "proc", CTLTYPE_STRUCT }, \
{ "file", CTLTYPE_STRUCT }, \
{ "profiling", CTLTYPE_NODE }, \
{ "posix1version", CTLTYPE_INT }, \
{ "ngroups", CTLTYPE_INT }, \
{ "job_control", CTLTYPE_INT }, \
{ "saved_ids", CTLTYPE_INT }, \
{ "boottime", CTLTYPE_STRUCT }, \
{ "nisdomainname", CTLTYPE_STRING }, \
{ "update", CTLTYPE_INT }, \
{ "osreldate", CTLTYPE_INT }, \
{ "ntp_pll", CTLTYPE_NODE }, \
{ "bootfile", CTLTYPE_STRING }, \
{ "maxfilesperproc", CTLTYPE_INT }, \
{ "maxprocperuid", CTLTYPE_INT }, \
{ "ipc", CTLTYPE_NODE }, \
{ "dummy", CTLTYPE_INT }, \
{ "ps_strings", CTLTYPE_INT }, \
{ "usrstack", CTLTYPE_INT }, \
{ "logsigexit", CTLTYPE_INT }, \
{ "iov_max", CTLTYPE_INT }, \
{ "hostuuid", CTLTYPE_STRING }, \
}
/*
* CTL_VFS identifiers
*/
#define CTL_VFS_NAMES { \
{ "vfsconf", CTLTYPE_STRUCT }, \
}
/*
* KERN_PROC subtypes
*/
#define KERN_PROC_ALL 0 /* everything */
#define KERN_PROC_PID 1 /* by process id */
#define KERN_PROC_PGRP 2 /* by process group id */
#define KERN_PROC_SESSION 3 /* by session of pid */
#define KERN_PROC_TTY 4 /* by controlling tty */
#define KERN_PROC_UID 5 /* by effective uid */
#define KERN_PROC_RUID 6 /* by real uid */
#define KERN_PROC_ARGS 7 /* get/set arguments/proctitle */
#define KERN_PROC_PROC 8 /* only return procs */
#define KERN_PROC_SV_NAME 9 /* get syscall vector name */
#define KERN_PROC_RGID 10 /* by real group id */
#define KERN_PROC_GID 11 /* by effective group id */
#define KERN_PROC_PATHNAME 12 /* path to executable */
#define KERN_PROC_VMMAP 13 /* VM map entries for process */
#define KERN_PROC_FILEDESC 14 /* File descriptors for process */
#define KERN_PROC_KSTACK 15 /* Kernel stacks for process */
#define KERN_PROC_INC_THREAD 0x10 /*
* modifier for pid, pgrp, tty,
* uid, ruid, gid, rgid and proc
*/
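/*
 * Example (illustrative): the subtype is the third MIB component,
 * KERN_PROC_INC_THREAD may be or'ed in to get one entry per thread,
 * and the final component selects the pid/pgrp/uid/etc.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC,
 *	    KERN_PROC_PID | KERN_PROC_INC_THREAD, pid };
 */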
/*
* KERN_IPC identifiers
*/
#define KIPC_MAXSOCKBUF 1 /* int: max size of a socket buffer */
#define KIPC_SOCKBUF_WASTE 2 /* int: wastage factor in sockbuf */
#define KIPC_SOMAXCONN 3 /* int: max length of connection q */
#define KIPC_MAX_LINKHDR 4 /* int: max length of link header */
#define KIPC_MAX_PROTOHDR 5 /* int: max length of network header */
#define KIPC_MAX_HDR 6 /* int: max total length of headers */
#define KIPC_MAX_DATALEN 7 /* int: max length of data? */
/*
* CTL_HW identifiers
*/
#define HW_MACHINE 1 /* string: machine class */
#define HW_MODEL 2 /* string: specific machine model */
#define HW_NCPU 3 /* int: number of cpus */
#define HW_BYTEORDER 4 /* int: machine byte order */
#define HW_PHYSMEM 5 /* int: total memory */
#define HW_USERMEM 6 /* int: non-kernel memory */
#define HW_PAGESIZE 7 /* int: software page size */
#define HW_DISKNAMES 8 /* strings: disk drive names */
#define HW_DISKSTATS 9 /* struct: diskstats[] */
#define HW_FLOATINGPT 10 /* int: has HW floating point? */
#define HW_MACHINE_ARCH 11 /* string: machine architecture */
#define HW_REALMEM 12 /* int: 'real' memory */
#define HW_MAXID 13 /* number of valid hw ids */
#define CTL_HW_NAMES { \
{ 0, 0 }, \
{ "machine", CTLTYPE_STRING }, \
{ "model", CTLTYPE_STRING }, \
{ "ncpu", CTLTYPE_INT }, \
{ "byteorder", CTLTYPE_INT }, \
{ "physmem", CTLTYPE_ULONG }, \
{ "usermem", CTLTYPE_ULONG }, \
{ "pagesize", CTLTYPE_INT }, \
{ "disknames", CTLTYPE_STRUCT }, \
{ "diskstats", CTLTYPE_STRUCT }, \
{ "floatingpoint", CTLTYPE_INT }, \
{ "machine_arch", CTLTYPE_STRING }, \
{ "realmem", CTLTYPE_ULONG }, \
}
/*
* CTL_USER definitions
*/
#define USER_CS_PATH 1 /* string: _CS_PATH */
#define USER_BC_BASE_MAX 2 /* int: BC_BASE_MAX */
#define USER_BC_DIM_MAX 3 /* int: BC_DIM_MAX */
#define USER_BC_SCALE_MAX 4 /* int: BC_SCALE_MAX */
#define USER_BC_STRING_MAX 5 /* int: BC_STRING_MAX */
#define USER_COLL_WEIGHTS_MAX 6 /* int: COLL_WEIGHTS_MAX */
#define USER_EXPR_NEST_MAX 7 /* int: EXPR_NEST_MAX */
#define USER_LINE_MAX 8 /* int: LINE_MAX */
#define USER_RE_DUP_MAX 9 /* int: RE_DUP_MAX */
#define USER_POSIX2_VERSION 10 /* int: POSIX2_VERSION */
#define USER_POSIX2_C_BIND 11 /* int: POSIX2_C_BIND */
#define USER_POSIX2_C_DEV 12 /* int: POSIX2_C_DEV */
#define USER_POSIX2_CHAR_TERM 13 /* int: POSIX2_CHAR_TERM */
#define USER_POSIX2_FORT_DEV 14 /* int: POSIX2_FORT_DEV */
#define USER_POSIX2_FORT_RUN 15 /* int: POSIX2_FORT_RUN */
#define USER_POSIX2_LOCALEDEF 16 /* int: POSIX2_LOCALEDEF */
#define USER_POSIX2_SW_DEV 17 /* int: POSIX2_SW_DEV */
#define USER_POSIX2_UPE 18 /* int: POSIX2_UPE */
#define USER_STREAM_MAX 19 /* int: POSIX2_STREAM_MAX */
#define USER_TZNAME_MAX 20 /* int: POSIX2_TZNAME_MAX */
#define USER_MAXID 21 /* number of valid user ids */
#define CTL_USER_NAMES { \
{ 0, 0 }, \
{ "cs_path", CTLTYPE_STRING }, \
{ "bc_base_max", CTLTYPE_INT }, \
{ "bc_dim_max", CTLTYPE_INT }, \
{ "bc_scale_max", CTLTYPE_INT }, \
{ "bc_string_max", CTLTYPE_INT }, \
{ "coll_weights_max", CTLTYPE_INT }, \
{ "expr_nest_max", CTLTYPE_INT }, \
{ "line_max", CTLTYPE_INT }, \
{ "re_dup_max", CTLTYPE_INT }, \
{ "posix2_version", CTLTYPE_INT }, \
{ "posix2_c_bind", CTLTYPE_INT }, \
{ "posix2_c_dev", CTLTYPE_INT }, \
{ "posix2_char_term", CTLTYPE_INT }, \
{ "posix2_fort_dev", CTLTYPE_INT }, \
{ "posix2_fort_run", CTLTYPE_INT }, \
{ "posix2_localedef", CTLTYPE_INT }, \
{ "posix2_sw_dev", CTLTYPE_INT }, \
{ "posix2_upe", CTLTYPE_INT }, \
{ "stream_max", CTLTYPE_INT }, \
{ "tzname_max", CTLTYPE_INT }, \
}
#define CTL_P1003_1B_ASYNCHRONOUS_IO 1 /* boolean */
#define CTL_P1003_1B_MAPPED_FILES 2 /* boolean */
#define CTL_P1003_1B_MEMLOCK 3 /* boolean */
#define CTL_P1003_1B_MEMLOCK_RANGE 4 /* boolean */
#define CTL_P1003_1B_MEMORY_PROTECTION 5 /* boolean */
#define CTL_P1003_1B_MESSAGE_PASSING 6 /* boolean */
#define CTL_P1003_1B_PRIORITIZED_IO 7 /* boolean */
#define CTL_P1003_1B_PRIORITY_SCHEDULING 8 /* boolean */
#define CTL_P1003_1B_REALTIME_SIGNALS 9 /* boolean */
#define CTL_P1003_1B_SEMAPHORES 10 /* boolean */
#define CTL_P1003_1B_FSYNC 11 /* boolean */
#define CTL_P1003_1B_SHARED_MEMORY_OBJECTS 12 /* boolean */
#define CTL_P1003_1B_SYNCHRONIZED_IO 13 /* boolean */
#define CTL_P1003_1B_TIMERS 14 /* boolean */
#define CTL_P1003_1B_AIO_LISTIO_MAX 15 /* int */
#define CTL_P1003_1B_AIO_MAX 16 /* int */
#define CTL_P1003_1B_AIO_PRIO_DELTA_MAX 17 /* int */
#define CTL_P1003_1B_DELAYTIMER_MAX 18 /* int */
#define CTL_P1003_1B_MQ_OPEN_MAX 19 /* int */
#define CTL_P1003_1B_PAGESIZE 20 /* int */
#define CTL_P1003_1B_RTSIG_MAX 21 /* int */
#define CTL_P1003_1B_SEM_NSEMS_MAX 22 /* int */
#define CTL_P1003_1B_SEM_VALUE_MAX 23 /* int */
#define CTL_P1003_1B_SIGQUEUE_MAX 24 /* int */
#define CTL_P1003_1B_TIMER_MAX 25 /* int */
#define CTL_P1003_1B_MAXID 26
#define CTL_P1003_1B_NAMES { \
{ 0, 0 }, \
{ "asynchronous_io", CTLTYPE_INT }, \
{ "mapped_files", CTLTYPE_INT }, \
{ "memlock", CTLTYPE_INT }, \
{ "memlock_range", CTLTYPE_INT }, \
{ "memory_protection", CTLTYPE_INT }, \
{ "message_passing", CTLTYPE_INT }, \
{ "prioritized_io", CTLTYPE_INT }, \
{ "priority_scheduling", CTLTYPE_INT }, \
{ "realtime_signals", CTLTYPE_INT }, \
{ "semaphores", CTLTYPE_INT }, \
{ "fsync", CTLTYPE_INT }, \
{ "shared_memory_objects", CTLTYPE_INT }, \
{ "synchronized_io", CTLTYPE_INT }, \
{ "timers", CTLTYPE_INT }, \
{ "aio_listio_max", CTLTYPE_INT }, \
{ "aio_max", CTLTYPE_INT }, \
{ "aio_prio_delta_max", CTLTYPE_INT }, \
{ "delaytimer_max", CTLTYPE_INT }, \
{ "mq_open_max", CTLTYPE_INT }, \
{ "pagesize", CTLTYPE_INT }, \
{ "rtsig_max", CTLTYPE_INT }, \
{ "nsems_max", CTLTYPE_INT }, \
{ "sem_value_max", CTLTYPE_INT }, \
{ "sigqueue_max", CTLTYPE_INT }, \
{ "timer_max", CTLTYPE_INT }, \
}
#ifdef _KERNEL
/*
* Declare some common oids.
*/
extern struct sysctl_oid_list sysctl__children;
SYSCTL_DECL(_kern);
SYSCTL_DECL(_kern_features);
SYSCTL_DECL(_kern_ipc);
SYSCTL_DECL(_kern_proc);
SYSCTL_DECL(_kern_sched);
SYSCTL_DECL(_kern_sched_stats);
SYSCTL_DECL(_sysctl);
SYSCTL_DECL(_vm);
SYSCTL_DECL(_vm_stats);
SYSCTL_DECL(_vm_stats_misc);
SYSCTL_DECL(_vfs);
SYSCTL_DECL(_net);
SYSCTL_DECL(_debug);
SYSCTL_DECL(_debug_sizeof);
SYSCTL_DECL(_hw);
SYSCTL_DECL(_hw_bus);
SYSCTL_DECL(_machdep);
SYSCTL_DECL(_user);
SYSCTL_DECL(_compat);
SYSCTL_DECL(_regression);
SYSCTL_DECL(_security);
SYSCTL_DECL(_security_bsd);
extern char machine[];
extern char osrelease[];
extern char ostype[];
extern char kern_ident[];
/* Dynamic oid handling */
struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist,
struct sysctl_oid_list *parent, int nbr, const char *name,
int kind, void *arg1, int arg2,
int (*handler) (SYSCTL_HANDLER_ARGS),
const char *fmt, const char *descr);
void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name);
int sysctl_move_oid(struct sysctl_oid *oidp,
struct sysctl_oid_list *parent);
int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse);
int sysctl_ctx_init(struct sysctl_ctx_list *clist);
int sysctl_ctx_free(struct sysctl_ctx_list *clist);
struct sysctl_ctx_entry *sysctl_ctx_entry_add(struct sysctl_ctx_list *clist,
struct sysctl_oid *oidp);
struct sysctl_ctx_entry *sysctl_ctx_entry_find(struct sysctl_ctx_list *clist,
struct sysctl_oid *oidp);
int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist,
struct sysctl_oid *oidp);
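/*
 * Sketch (illustrative): the usual driver pattern is to initialize a
 * context at attach, hang dynamic oids off it, and tear everything
 * down with one call at detach.  sc is a hypothetical softc.
 *
 *	sysctl_ctx_init(&sc->ctx);
 *	oid = SYSCTL_ADD_NODE(&sc->ctx, SYSCTL_STATIC_CHILDREN(_hw),
 *	    OID_AUTO, "foo0", CTLFLAG_RD, 0, "foo device");
 *	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 *	    "debug", CTLFLAG_RW, &sc->debug, 0, "debug level");
 *	...
 *	sysctl_ctx_free(&sc->ctx);	(at detach)
 */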
int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
size_t *oldlenp, void *new, size_t newlen,
size_t *retval, int flags);
int kernel_sysctlbyname(struct thread *td, char *name,
void *old, size_t *oldlenp, void *new, size_t newlen,
size_t *retval, int flags);
int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
size_t *oldlenp, int inkernel, void *new, size_t newlen,
size_t *retval, int flags);
int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
int *nindx, struct sysctl_req *req);
int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len);
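/*
 * Example (illustrative): reading an integer oid by name from inside
 * the kernel.
 *
 *	int error, val;
 *	size_t len = sizeof(val);
 *	char name[] = "kern.maxproc";
 *
 *	error = kernel_sysctlbyname(curthread, name, &val, &len,
 *	    NULL, 0, NULL, 0);
 */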
#else /* !_KERNEL */
#include <sys/cdefs.h>
__BEGIN_DECLS
int sysctl(int *, u_int, void *, size_t *, void *, size_t);
int sysctlbyname(const char *, void *, size_t *, void *, size_t);
int sysctlnametomib(const char *, int *, size_t *);
__END_DECLS
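/*
 * Example (illustrative): the classic userland idioms, by MIB vector
 * and by name (len must be re-initialized before each call).
 *
 *	int mib[2] = { CTL_KERN, KERN_OSTYPE };
 *	char buf[64];
 *	size_t len = sizeof(buf);
 *
 *	sysctl(mib, 2, buf, &len, NULL, 0);
 *	len = sizeof(buf);
 *	sysctlbyname("kern.ostype", buf, &len, NULL, 0);
 */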
#endif /* _KERNEL */
#endif /* !_SYS_SYSCTL_H_ */
Index: head/sys/sys/vimage.h
===================================================================
--- head/sys/sys/vimage.h (revision 183549)
+++ head/sys/sys/vimage.h (revision 183550)
@@ -1,349 +1,66 @@
/*-
* Copyright (c) 2006-2008 University of Zagreb
* Copyright (c) 2006-2008 FreeBSD Foundation
*
* This software was developed by the University of Zagreb and the
* FreeBSD Foundation under sponsorship by the Stichting NLnet and the
* FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SYS_VIMAGE_H_
#define _SYS_VIMAGE_H_
-#define V_hostname hostname
-#define G_hostname hostname
-#define V_domainname domainname
-#define V_acq_seq acq_seq
-#define V_acqtree acqtree
-#define V_addrsel_policytab addrsel_policytab
-#define V_ah_cleartos ah_cleartos
-#define V_ah_enable ah_enable
-#define V_ahstat ahstat
-#define V_arp_maxtries arp_maxtries
-#define V_arp_proxyall arp_proxyall
-#define V_arpt_keep arpt_keep
-#define V_autoinc_step autoinc_step
-#define V_blackhole blackhole
-#define V_crypto_support crypto_support
-#define V_curr_dyn_buckets curr_dyn_buckets
-#define V_dad_ignore_ns dad_ignore_ns
-#define V_dad_init dad_init
-#define V_dad_maxtry dad_maxtry
-#define V_dadq dadq
-#define V_defaultaddrpolicy defaultaddrpolicy
-#define V_divcb divcb
-#define V_divcbinfo divcbinfo
-#define V_drop_synfin drop_synfin
-#define V_dyn_ack_lifetime dyn_ack_lifetime
-#define V_dyn_buckets dyn_buckets
-#define V_dyn_count dyn_count
-#define V_dyn_fin_lifetime dyn_fin_lifetime
-#define V_dyn_keepalive dyn_keepalive
-#define V_dyn_keepalive_interval dyn_keepalive_interval
-#define V_dyn_keepalive_period dyn_keepalive_period
-#define V_dyn_max dyn_max
-#define V_dyn_rst_lifetime dyn_rst_lifetime
-#define V_dyn_short_lifetime dyn_short_lifetime
-#define V_dyn_syn_lifetime dyn_syn_lifetime
-#define V_dyn_udp_lifetime dyn_udp_lifetime
-#define V_esp_enable esp_enable
-#define V_esp_max_ivlen esp_max_ivlen
-#define V_espstat espstat
-#define V_ether_ipfw ether_ipfw
-#define V_frag6_nfragpackets frag6_nfragpackets
-#define V_frag6_nfrags frag6_nfrags
-#define V_fw6_enable fw6_enable
-#define V_fw_debug fw_debug
-#define V_fw_deny_unknown_exthdrs fw_deny_unknown_exthdrs
-#define V_fw_enable fw_enable
-#define V_fw_one_pass fw_one_pass
-#define V_fw_verbose fw_verbose
-#define V_gif_softc_list gif_softc_list
-#define V_icmp6_nodeinfo icmp6_nodeinfo
-#define V_icmp6_rediraccept icmp6_rediraccept
-#define V_icmp6_redirtimeout icmp6_redirtimeout
-#define V_icmp6errpps_count icmp6errpps_count
-#define V_icmp6errppslim icmp6errppslim
-#define V_icmp6errppslim_last icmp6errppslim_last
-#define V_icmp6stat icmp6stat
-#define V_icmp_may_rst icmp_may_rst
-#define V_icmpstat icmpstat
-#define V_if_index if_index
-#define V_if_indexlim if_indexlim
-#define V_ifaddr_event_tag ifaddr_event_tag
-#define V_ifg_head ifg_head
-#define V_ifindex_table ifindex_table
-#define V_ifklist ifklist
-#define V_ifnet ifnet
-#define V_igmpstat igmpstat
-#define V_in6_ifaddr in6_ifaddr
-#define V_in6_maxmtu in6_maxmtu
-#define V_in6_tmpaddrtimer_ch in6_tmpaddrtimer_ch
-#define V_in_ifaddrhashtbl in_ifaddrhashtbl
-#define V_in_ifaddrhead in_ifaddrhead
-#define V_in_ifaddrhmask in_ifaddrhmask
-#define V_in_multihead in_multihead
-#define V_ip4_ah_net_deflev ip4_ah_net_deflev
-#define V_ip4_ah_offsetmask ip4_ah_offsetmask
-#define V_ip4_ah_trans_deflev ip4_ah_trans_deflev
-#define V_ip4_def_policy ip4_def_policy
-#define V_ip4_esp_net_deflev ip4_esp_net_deflev
-#define V_ip4_esp_randpad ip4_esp_randpad
-#define V_ip4_esp_trans_deflev ip4_esp_trans_deflev
-#define V_ip4_ipsec_dfbit ip4_ipsec_dfbit
-#define V_ip4_ipsec_ecn ip4_ipsec_ecn
-#define V_ip6_accept_rtadv ip6_accept_rtadv
-#define V_ip6_ah_net_deflev ip6_ah_net_deflev
-#define V_ip6_ah_trans_deflev ip6_ah_trans_deflev
-#define V_ip6_auto_flowlabel ip6_auto_flowlabel
-#define V_ip6_auto_linklocal ip6_auto_linklocal
-#define V_ip6_dad_count ip6_dad_count
-#define V_ip6_defhlim ip6_defhlim
-#define V_ip6_defmcasthlim ip6_defmcasthlim
-#define V_ip6_desync_factor ip6_desync_factor
-#define V_ip6_esp_net_deflev ip6_esp_net_deflev
-#define V_ip6_esp_trans_deflev ip6_esp_trans_deflev
-#define V_ip6_forward_rt ip6_forward_rt
-#define V_ip6_forward_srcrt ip6_forward_srcrt
-#define V_ip6_forwarding ip6_forwarding
-#define V_ip6_gif_hlim ip6_gif_hlim
-#define V_ip6_hdrnestlimit ip6_hdrnestlimit
-#define V_ip6_ipsec_ecn ip6_ipsec_ecn
-#define V_ip6_keepfaith ip6_keepfaith
-#define V_ip6_log_interval ip6_log_interval
-#define V_ip6_log_time ip6_log_time
-#define V_ip6_maxfragpackets ip6_maxfragpackets
-#define V_ip6_maxfrags ip6_maxfrags
-#define V_ip6_mcast_pmtu ip6_mcast_pmtu
-#define V_ip6_mrouter_ver ip6_mrouter_ver
-#define V_ip6_opts ip6_opts
-#define V_ip6_ours_check_algorithm ip6_ours_check_algorithm
-#define V_ip6_prefer_tempaddr ip6_prefer_tempaddr
-#define V_ip6_rr_prune ip6_rr_prune
-#define V_ip6_sendredirects ip6_sendredirects
-#define V_ip6_sourcecheck ip6_sourcecheck
-#define V_ip6_sourcecheck_interval ip6_sourcecheck_interval
-#define V_ip6_temp_preferred_lifetime ip6_temp_preferred_lifetime
-#define V_ip6_temp_regen_advance ip6_temp_regen_advance
-#define V_ip6_temp_valid_lifetime ip6_temp_valid_lifetime
-#define V_ip6_use_defzone ip6_use_defzone
-#define V_ip6_use_deprecated ip6_use_deprecated
-#define V_ip6_use_tempaddr ip6_use_tempaddr
-#define V_ip6_v6only ip6_v6only
-#define V_ip6q ip6q
-#define V_ip6qmaxlen ip6qmaxlen
-#define V_ip6stat ip6stat
-#define V_ip6stealth ip6stealth
-#define V_ip_checkinterface ip_checkinterface
-#define V_ip_defttl ip_defttl
-#define V_ip_do_randomid ip_do_randomid
-#define V_ip_gif_ttl ip_gif_ttl
-#define V_ip_id ip_id
-#define V_ip_keepfaith ip_keepfaith
-#define V_ip_mrouter ip_mrouter
-#define V_ip_rsvp_on ip_rsvp_on
-#define V_ip_rsvpd ip_rsvpd
-#define V_ip_sendsourcequench ip_sendsourcequench
-#define V_ipcomp_enable ipcomp_enable
-#define V_ipcompstat ipcompstat
-#define V_ipfastforward_active ipfastforward_active
-#define V_ipforwarding ipforwarding
-#define V_ipfw_dyn_v ipfw_dyn_v
-#define V_ipfw_timeout ipfw_timeout
-#define V_ipip_allow ipip_allow
-#define V_ipipstat ipipstat
-#define V_ipport_firstauto ipport_firstauto
-#define V_ipport_hifirstauto ipport_hifirstauto
-#define V_ipport_hilastauto ipport_hilastauto
-#define V_ipport_lastauto ipport_lastauto
-#define V_ipport_lowfirstauto ipport_lowfirstauto
-#define V_ipport_lowlastauto ipport_lowlastauto
-#define V_ipport_randomcps ipport_randomcps
-#define V_ipport_randomized ipport_randomized
-#define V_ipport_randomtime ipport_randomtime
-#define V_ipport_reservedhigh ipport_reservedhigh
-#define V_ipport_reservedlow ipport_reservedlow
-#define V_ipport_stoprandom ipport_stoprandom
-#define V_ipport_tcpallocs ipport_tcpallocs
-#define V_ipport_tcplastcount ipport_tcplastcount
-#define V_ipq ipq
-#define V_ipq_zone ipq_zone
-#define V_ipsec4stat ipsec4stat
-#define V_ipsec6stat ipsec6stat
-#define V_ipsec_ah_keymin ipsec_ah_keymin
-#define V_ipsec_debug ipsec_debug
-#define V_ipsec_esp_auth ipsec_esp_auth
-#define V_ipsec_esp_keymin ipsec_esp_keymin
-#define V_ipsec_integrity ipsec_integrity
-#define V_ipsec_replay ipsec_replay
-#define V_ipsendredirects ipsendredirects
-#define V_ipstat ipstat
-#define V_ipstealth ipstealth
-#define V_isn_ctx isn_ctx
-#define V_isn_last_reseed isn_last_reseed
-#define V_isn_offset isn_offset
-#define V_isn_offset_old isn_offset_old
-#define V_isn_secret isn_secret
-#define V_key_blockacq_count key_blockacq_count
-#define V_key_blockacq_lifetime key_blockacq_lifetime
-#define V_key_cb key_cb
-#define V_key_debug_level key_debug_level
-#define V_key_int_random key_int_random
-#define V_key_larval_lifetime key_larval_lifetime
-#define V_key_preferred_oldsa key_preferred_oldsa
-#define V_key_spi_maxval key_spi_maxval
-#define V_key_spi_minval key_spi_minval
-#define V_key_spi_trycnt key_spi_trycnt
-#define V_key_src key_src
-#define V_layer3_chain layer3_chain
-#define V_llinfo_arp llinfo_arp
-#define V_llinfo_nd6 llinfo_nd6
-#define V_lo_list lo_list
-#define V_loif loif
-#define V_max_gif_nesting max_gif_nesting
-#define V_maxfragsperpacket maxfragsperpacket
-#define V_maxnipq maxnipq
-#define V_mrt6debug mrt6debug
-#define V_nd6_allocated nd6_allocated
-#define V_nd6_debug nd6_debug
-#define V_nd6_defifindex nd6_defifindex
-#define V_nd6_defifp nd6_defifp
-#define V_nd6_delay nd6_delay
-#define V_nd6_gctimer nd6_gctimer
-#define V_nd6_inuse nd6_inuse
-#define V_nd6_maxndopt nd6_maxndopt
-#define V_nd6_maxnudhint nd6_maxnudhint
-#define V_nd6_maxqueuelen nd6_maxqueuelen
-#define V_nd6_mmaxtries nd6_mmaxtries
-#define V_nd6_prune nd6_prune
-#define V_nd6_recalc_reachtm_interval nd6_recalc_reachtm_interval
-#define V_nd6_slowtimo_ch nd6_slowtimo_ch
-#define V_nd6_timer_ch nd6_timer_ch
-#define V_nd6_umaxtries nd6_umaxtries
-#define V_nd6_useloopback nd6_useloopback
-#define V_nd_defrouter nd_defrouter
-#define V_nd_prefix nd_prefix
-#define V_nextID nextID
-#define V_ng_ID_hash ng_ID_hash
-#define V_ng_eiface_unit ng_eiface_unit
-#define V_ng_iface_unit ng_iface_unit
-#define V_ng_name_hash ng_name_hash
-#define V_nipq nipq
-#define V_nolocaltimewait nolocaltimewait
-#define V_norule_counter norule_counter
-#define V_parallel_tunnels parallel_tunnels
-#define V_path_mtu_discovery path_mtu_discovery
-#define V_pfkeystat pfkeystat
-#define V_pim6 pim6
-#define V_pmtu_expire pmtu_expire
-#define V_pmtu_probe pmtu_probe
-#define V_policy_id policy_id
-#define V_rawcb_list rawcb_list
-#define V_regtree regtree
-#define V_rip6_recvspace rip6_recvspace
-#define V_rip6_sendspace rip6_sendspace
-#define V_rip6stat rip6stat
-#define V_ripcb ripcb
-#define V_ripcbinfo ripcbinfo
-#define V_router_info_head router_info_head
-#define V_rsvp_on rsvp_on
-#define V_rt_tables rt_tables
-#define V_rtq_minreallyold rtq_minreallyold
-#define V_rtq_minreallyold6 rtq_minreallyold6
-#define V_rtq_mtutimer rtq_mtutimer
-#define V_rtq_reallyold rtq_reallyold
-#define V_rtq_reallyold6 rtq_reallyold6
-#define V_rtq_timeout rtq_timeout
-#define V_rtq_timeout6 rtq_timeout6
-#define V_rtq_timer rtq_timer
-#define V_rtq_timer6 rtq_timer6
-#define V_rtq_toomany rtq_toomany
-#define V_rtq_toomany6 rtq_toomany6
-#define V_rtstat rtstat
-#define V_rttrash rttrash
-#define V_sahtree sahtree
-#define V_sameprefixcarponly sameprefixcarponly
-#define V_saorder_state_alive saorder_state_alive
-#define V_saorder_state_any saorder_state_any
-#define V_set_disable set_disable
-#define V_sid_default sid_default
-#define V_spacqtree spacqtree
-#define V_sptree sptree
-#define V_ss_fltsz ss_fltsz
-#define V_ss_fltsz_local ss_fltsz_local
-#define V_static_len static_len
-#define V_static_count static_count
-#define V_subnetsarelocal subnetsarelocal
-#define V_tcb tcb
-#define V_tcbinfo tcbinfo
-#define V_tcp_autorcvbuf_inc tcp_autorcvbuf_inc
-#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
-#define V_tcp_autosndbuf_inc tcp_autosndbuf_inc
-#define V_tcp_autosndbuf_max tcp_autosndbuf_max
-#define V_tcp_delack_enabled tcp_delack_enabled
-#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
-#define V_tcp_do_autosndbuf tcp_do_autosndbuf
-#define V_tcp_do_ecn tcp_do_ecn
-#define V_tcp_do_newreno tcp_do_newreno
-#define V_tcp_do_rfc1323 tcp_do_rfc1323
-#define V_tcp_do_rfc3042 tcp_do_rfc3042
-#define V_tcp_do_rfc3390 tcp_do_rfc3390
-#define V_tcp_do_sack tcp_do_sack
-#define V_tcp_do_tso tcp_do_tso
-#define V_tcp_hc_callout tcp_hc_callout
-#define V_tcp_ecn_maxretries tcp_ecn_maxretries
-#define V_tcp_hostcache tcp_hostcache
-#define V_tcp_inflight_enable tcp_inflight_enable
-#define V_tcp_inflight_max tcp_inflight_max
-#define V_tcp_inflight_min tcp_inflight_min
-#define V_tcp_inflight_rttthresh tcp_inflight_rttthresh
-#define V_tcp_inflight_stab tcp_inflight_stab
-#define V_tcp_insecure_rst tcp_insecure_rst
-#define V_tcp_isn_reseed_interval tcp_isn_reseed_interval
-#define V_tcp_minmss tcp_minmss
-#define V_tcp_mssdflt tcp_mssdflt
-#define V_tcp_reass_maxqlen tcp_reass_maxqlen
-#define V_tcp_reass_maxseg tcp_reass_maxseg
-#define V_tcp_reass_overflows tcp_reass_overflows
-#define V_tcp_reass_qsize tcp_reass_qsize
-#define V_tcp_sack_globalholes tcp_sack_globalholes
-#define V_tcp_sack_globalmaxholes tcp_sack_globalmaxholes
-#define V_tcp_sack_maxholes tcp_sack_maxholes
-#define V_tcp_sc_rst_sock_fail tcp_sc_rst_sock_fail
-#define V_tcp_syncache tcp_syncache
-#define V_tcp_v6mssdflt tcp_v6mssdflt
-#define V_tcpstat tcpstat
-#define V_twq_2msl twq_2msl
-#define V_udb udb
-#define V_udbinfo udbinfo
-#define V_udp_blackhole udp_blackhole
-#define V_udp6_recvspace udp6_recvspace
-#define V_udp6_sendspace udp6_sendspace
-#define V_udpstat udpstat
-#define V_useloopback useloopback
-#define V_verbose_limit verbose_limit
+/* Non-VIMAGE null-macros */
+#define CURVNET_SET(arg)
+#define CURVNET_SET_QUIET(arg)
+#define CURVNET_RESTORE()
+#define VNET_ASSERT(condition)
+#define VSYM(base, sym) (sym)
+#define INIT_FROM_VNET(vnet, modindex, modtype, sym)
+#define VNET_ITERATOR_DECL(arg)
+#define VNET_FOREACH(arg)
+#define VNET_LIST_RLOCK()
+#define VNET_LIST_RUNLOCK()
+#define INIT_VPROCG(arg)
+#define INIT_VCPU(arg)
+#define TD_TO_VIMAGE(td)
+#define TD_TO_VNET(td)
+#define TD_TO_VPROCG(td)
+#define TD_TO_VCPU(td)
+#define P_TO_VIMAGE(p)
+#define P_TO_VNET(p)
+#define P_TO_VPROCG(p)
+#define P_TO_VCPU(p)
+
+/* XXX these defines below should probably go into vprocg.h and vcpu.h */
+#define VPROCG(sym) VSYM(vprocg, sym)
+#define VCPU(sym) VSYM(vcpu, sym)
+
+#define V_hostname VPROCG(hostname)
+#define G_hostname VSYM(basevprocg, hostname) /* global hostname */
+#define V_domainname VPROCG(domainname)
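/*
 * Sketch (illustrative): consumers use the V_ accessors regardless of
 * whether VIMAGE is compiled in; in this non-VIMAGE case V_hostname
 * collapses, via VPROCG() and VSYM(), to the plain global hostname.
 *
 *	char buf[MAXHOSTNAMELEN];
 *
 *	strlcpy(buf, V_hostname, sizeof(buf));
 */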
#endif /* !_SYS_VIMAGE_H_ */
